Re: [PATCH v5 3/3] drivers/vfio: Support EEH error injection

2015-03-31 Thread Alex Williamson
On Thu, 2015-03-26 at 16:42 +1100, Gavin Shan wrote:
> The patch adds one more EEH sub-command (VFIO_EEH_PE_INJECT_ERR)
> to inject the specified EEH error, which is represented by
> (struct vfio_eeh_pe_err), to the indicated PE for testing purpose.
> 
> Signed-off-by: Gavin Shan 
> Reviewed-by: David Gibson 
> ---
>  Documentation/vfio.txt| 12 
>  drivers/vfio/vfio_spapr_eeh.c | 10 ++
>  include/uapi/linux/vfio.h | 14 +-
>  3 files changed, 35 insertions(+), 1 deletion(-)
> 
> diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
> index 96978ec..4c746a7 100644
> --- a/Documentation/vfio.txt
> +++ b/Documentation/vfio.txt
> @@ -385,6 +385,18 @@ The code flow from the example above should be slightly 
> changed:
>  
>   
>  
> + /* Inject EEH error, which is expected to be caused by 32-bits
> +  * config load.
> +  */
> + pe_op.op = VFIO_EEH_PE_INJECT_ERR;
> + pe_op.err.type = EEH_ERR_TYPE_32;
> + pe_op.err.func = EEH_ERR_FUNC_LD_CFG_ADDR;
> + pe_op.err.addr = 0ul;
> + pe_op.err.mask = 0ul;
> + ioctl(container, VFIO_EEH_PE_OP, &pe_op);
> +
> + 
> +
>   /* When 0xFF's returned from reading PCI config space or IO BARs
>* of the PCI device. Check the PE's state to see if that has been
>* frozen.
> diff --git a/drivers/vfio/vfio_spapr_eeh.c b/drivers/vfio/vfio_spapr_eeh.c
> index 5fa42db..38edeb4 100644
> --- a/drivers/vfio/vfio_spapr_eeh.c
> +++ b/drivers/vfio/vfio_spapr_eeh.c
> @@ -85,6 +85,16 @@ long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group,
>   case VFIO_EEH_PE_CONFIGURE:
>   ret = eeh_pe_configure(pe);
>   break;
> + case VFIO_EEH_PE_INJECT_ERR:
> + minsz = offsetofend(struct vfio_eeh_pe_op, err.mask);
> + if (op.argsz < minsz)
> + return -EINVAL;
> + if (copy_from_user(&op, (void __user *)arg, minsz))
> + return -EFAULT;
> +
> + ret = eeh_pe_inject_err(pe, op.err.type, op.err.func,
> + op.err.addr, op.err.mask);
> + break;
>   default:
>   ret = -EINVAL;
>   }
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index 82889c3..d81c17f 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -468,12 +468,23 @@ struct vfio_iommu_spapr_tce_info {
>   * - unfreeze IO/DMA for frozen PE;
>   * - read PE state;
>   * - reset PE;
> - * - configure PE.
> + * - configure PE;
> + * - inject EEH error.
>   */
> +struct vfio_eeh_pe_err {
> + __u32 type;
> + __u32 func;
> + __u64 addr;
> + __u64 mask;
> +};
> +
>  struct vfio_eeh_pe_op {
>   __u32 argsz;
>   __u32 flags;
>   __u32 op;
> + union {
> + struct vfio_eeh_pe_err err;
> + };
>  };
>  
>  #define VFIO_EEH_PE_DISABLE  0   /* Disable EEH functionality */
> @@ -490,6 +501,7 @@ struct vfio_eeh_pe_op {
>  #define VFIO_EEH_PE_RESET_HOT6   /* Assert hot reset 
>  */
>  #define VFIO_EEH_PE_RESET_FUNDAMENTAL7   /* Assert fundamental 
> reset  */
>  #define VFIO_EEH_PE_CONFIGURE    8   /* PE configuration 
>  */
> +#define VFIO_EEH_PE_INJECT_ERR   9   /* Inject EEH error 
>  */
>  
>  #define VFIO_EEH_PE_OP   _IO(VFIO_TYPE, VFIO_BASE + 21)
>  

I assume you want this to go in through the PPC tree, so

Acked-by: Alex Williamson 

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH kernel v7 12/31] powerpc/spapr: vfio: Switch from iommu_table to new iommu_table_group

2015-03-31 Thread Alex Williamson
On Sat, 2015-03-28 at 01:54 +1100, Alexey Kardashevskiy wrote:
> Modern IBM POWERPC systems support multiple (currently two) TCE tables
> per IOMMU group (a.k.a. PE). This adds an iommu_table_group container
> for TCE tables. Right now just one table is supported.
> 
> Signed-off-by: Alexey Kardashevskiy 
> ---
>  Documentation/vfio.txt  |  23 ++
>  arch/powerpc/include/asm/iommu.h|  18 +++--
>  arch/powerpc/kernel/iommu.c |  34 
>  arch/powerpc/platforms/powernv/pci-ioda.c   |  38 +
>  arch/powerpc/platforms/powernv/pci-p5ioc2.c |  17 ++--
>  arch/powerpc/platforms/powernv/pci.c|   2 +-
>  arch/powerpc/platforms/powernv/pci.h|   4 +-
>  arch/powerpc/platforms/pseries/iommu.c  |   9 ++-
>  drivers/vfio/vfio_iommu_spapr_tce.c | 120 
> 
>  9 files changed, 183 insertions(+), 82 deletions(-)
> 
> diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
> index 96978ec..94328c8 100644
> --- a/Documentation/vfio.txt
> +++ b/Documentation/vfio.txt
> @@ -427,6 +427,29 @@ The code flow from the example above should be slightly 
> changed:
>  
>   
>  
> +5) There is v2 of SPAPR TCE IOMMU. It deprecates VFIO_IOMMU_ENABLE/
> +VFIO_IOMMU_DISABLE and implements 2 new ioctls:
> +VFIO_IOMMU_SPAPR_REGISTER_MEMORY and VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY
> +(which are unsupported in v1 IOMMU).
> +
> +PPC64 paravirtualized guests generate a lot of map/unmap requests,
> +and the handling of those includes pinning/unpinning pages and updating
> +mm::locked_vm counter to make sure we do not exceed the rlimit.
> +The v2 IOMMU splits accounting and pinning into separate operations:
> +
> +- VFIO_IOMMU_SPAPR_REGISTER_MEMORY/VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY ioctls
> +receive a user space address and size of the block to be pinned.
> +Bisecting is not supported and VFIO_IOMMU_UNREGISTER_MEMORY is expected to
> +be called with the exact address and size used for registering
> +the memory block. The userspace is not expected to call these often.
> +The ranges are stored in a linked list in a VFIO container.
> +
> +- VFIO_IOMMU_MAP_DMA/VFIO_IOMMU_UNMAP_DMA ioctls only update the actual
> +IOMMU table and do not do pinning; instead these check that the userspace
> +address is from pre-registered range.
> +
> +This separation helps in optimizing DMA for guests.
> +
>  
> ---
>  
>  [1] VFIO was originally an acronym for "Virtual Function I/O" in its


How is the above docs change remotely associated with anything else in
this patch?


> diff --git a/arch/powerpc/include/asm/iommu.h 
> b/arch/powerpc/include/asm/iommu.h
> index eb75726..667aa1a 100644
> --- a/arch/powerpc/include/asm/iommu.h
> +++ b/arch/powerpc/include/asm/iommu.h
> @@ -90,9 +90,7 @@ struct iommu_table {
>   struct iommu_pool pools[IOMMU_NR_POOLS];
>   unsigned long *it_map;   /* A simple allocation bitmap for now */
>   unsigned long  it_page_shift;/* table iommu page size */
> -#ifdef CONFIG_IOMMU_API
> - struct iommu_group *it_group;
> -#endif
> + struct iommu_table_group *it_group;
>   struct iommu_table_ops *it_ops;
>   void (*set_bypass)(struct iommu_table *tbl, bool enable);
>  };
> @@ -126,14 +124,24 @@ extern void iommu_free_table(struct iommu_table *tbl, 
> const char *node_name);
>   */
>  extern struct iommu_table *iommu_init_table(struct iommu_table * tbl,
>   int nid);
> +
> +#define IOMMU_TABLE_GROUP_MAX_TABLES 1
> +
> +struct iommu_table_group {
>  #ifdef CONFIG_IOMMU_API
> -extern void iommu_register_group(struct iommu_table *tbl,
> + struct iommu_group *group;
> +#endif
> + struct iommu_table tables[IOMMU_TABLE_GROUP_MAX_TABLES];
> +};
> +
> +#ifdef CONFIG_IOMMU_API
> +extern void iommu_register_group(struct iommu_table_group *table_group,
>int pci_domain_number, unsigned long pe_num);
>  extern int iommu_add_device(struct device *dev);
>  extern void iommu_del_device(struct device *dev);
>  extern int __init tce_iommu_bus_notifier_init(void);
>  #else
> -static inline void iommu_register_group(struct iommu_table *tbl,
> +static inline void iommu_register_group(struct iommu_table_group 
> *table_group,
>   int pci_domain_number,
>   unsigned long pe_num)
>  {
> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> index b39d00a..fd49c8e 100644
> --- a/arch/powerpc/kernel/iommu.c
> +++ b/arch/powerpc/kernel/iommu.c
> @@ -712,17 +712,20 @@ struct iommu_table *iommu_init_table(struct iommu_table 
> *tbl, int nid)
>  
>  struct iommu_table *iommu_table_alloc(int node)
>  {
> - struct iommu_table *tbl;
> + struct iommu_table_group *table_group;
>  
> - tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, node);
> + table_group =

Re: [PATCH kernel v7 28/31] powerpc/mmu: Add userspace-to-physical addresses translation cache

2015-04-01 Thread Alex Williamson
On Sat, 2015-03-28 at 01:55 +1100, Alexey Kardashevskiy wrote:
> We are adding support for DMA memory pre-registration to be used in
> conjunction with VFIO. The idea is that the userspace which is going to
> run a guest may want to pre-register a user space memory region so
> it all gets pinned once and never goes away. Having this done,
> a hypervisor will not have to pin/unpin pages on every DMA map/unmap
> request. This is going to help with multiple pinning of the same memory
> and in-kernel acceleration of DMA requests.
> 
> This adds a list of memory regions to mm_context_t. Each region consists
> of a header and a list of physical addresses. This adds API to:
> 1. register/unregister memory regions;
> 2. do final cleanup (which puts all pre-registered pages);
> 3. do userspace to physical address translation;
> 4. manage a mapped pages counter; when it is zero, it is safe to
> unregister the region.
> 
> Multiple registration of the same region is allowed, kref is used to
> track the number of registrations.
> 
> Signed-off-by: Alexey Kardashevskiy 
> ---
>  arch/powerpc/include/asm/mmu-hash64.h  |   3 +
>  arch/powerpc/include/asm/mmu_context.h |  16 +++
>  arch/powerpc/mm/Makefile   |   1 +
>  arch/powerpc/mm/mmu_context_hash64.c   |   6 +
>  arch/powerpc/mm/mmu_context_hash64_iommu.c | 215 
> +
>  5 files changed, 241 insertions(+)
>  create mode 100644 arch/powerpc/mm/mmu_context_hash64_iommu.c
> 
> diff --git a/arch/powerpc/include/asm/mmu-hash64.h 
> b/arch/powerpc/include/asm/mmu-hash64.h
> index 4f13c3e..83214c4 100644
> --- a/arch/powerpc/include/asm/mmu-hash64.h
> +++ b/arch/powerpc/include/asm/mmu-hash64.h
> @@ -535,6 +535,9 @@ typedef struct {
>   /* for 4K PTE fragment support */
>   void *pte_frag;
>  #endif
> +#ifdef CONFIG_SPAPR_TCE_IOMMU
> + struct list_head iommu_group_mem_list;
> +#endif
>  } mm_context_t;
>  
> 
> diff --git a/arch/powerpc/include/asm/mmu_context.h 
> b/arch/powerpc/include/asm/mmu_context.h
> index 73382eb..3461c91 100644
> --- a/arch/powerpc/include/asm/mmu_context.h
> +++ b/arch/powerpc/include/asm/mmu_context.h
> @@ -16,6 +16,22 @@
>   */
>  extern int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
>  extern void destroy_context(struct mm_struct *mm);
> +#ifdef CONFIG_SPAPR_TCE_IOMMU
> +typedef struct mm_iommu_table_group_mem_t mm_iommu_table_group_mem_t;
> +
> +extern bool mm_iommu_preregistered(void);
> +extern long mm_iommu_alloc(unsigned long ua, unsigned long entries,
> + mm_iommu_table_group_mem_t **pmem);
> +extern mm_iommu_table_group_mem_t *mm_iommu_get(unsigned long ua,
> + unsigned long entries);
> +extern long mm_iommu_put(mm_iommu_table_group_mem_t *mem);
> +extern void mm_iommu_cleanup(mm_context_t *ctx);
> +extern mm_iommu_table_group_mem_t *mm_iommu_lookup(unsigned long ua,
> + unsigned long size);
> +extern long mm_iommu_ua_to_hpa(mm_iommu_table_group_mem_t *mem,
> + unsigned long ua, unsigned long *hpa);
> +extern long mm_iommu_mapped_update(mm_iommu_table_group_mem_t *mem, bool 
> inc);
> +#endif
>  
>  extern void switch_mmu_context(struct mm_struct *prev, struct mm_struct 
> *next);
>  extern void switch_slb(struct task_struct *tsk, struct mm_struct *mm);
> diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
> index 438dcd3..49fbfc7 100644
> --- a/arch/powerpc/mm/Makefile
> +++ b/arch/powerpc/mm/Makefile
> @@ -35,3 +35,4 @@ obj-$(CONFIG_PPC_SUBPAGE_PROT)  += subpage-prot.o
>  obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o
>  obj-$(CONFIG_HIGHMEM)+= highmem.o
>  obj-$(CONFIG_PPC_COPRO_BASE) += copro_fault.o
> +obj-$(CONFIG_SPAPR_TCE_IOMMU)+= mmu_context_hash64_iommu.o
> diff --git a/arch/powerpc/mm/mmu_context_hash64.c 
> b/arch/powerpc/mm/mmu_context_hash64.c
> index 178876ae..eb3080c 100644
> --- a/arch/powerpc/mm/mmu_context_hash64.c
> +++ b/arch/powerpc/mm/mmu_context_hash64.c
> @@ -89,6 +89,9 @@ int init_new_context(struct task_struct *tsk, struct 
> mm_struct *mm)
>  #ifdef CONFIG_PPC_64K_PAGES
>   mm->context.pte_frag = NULL;
>  #endif
> +#ifdef CONFIG_SPAPR_TCE_IOMMU
> + INIT_LIST_HEAD_RCU(&mm->context.iommu_group_mem_list);
> +#endif
>   return 0;
>  }
>  
> @@ -132,6 +135,9 @@ static inline void destroy_pagetable_page(struct 
> mm_struct *mm)
>  
>  void destroy_context(struct mm_struct *mm)
>  {
> +#ifdef CONFIG_SPAPR_TCE_IOMMU
> + mm_iommu_cleanup(&mm->context);
> +#endif
>  
>  #ifdef CONFIG_PPC_ICSWX
>   drop_cop(mm->context.acop, mm);
> diff --git a/arch/powerpc/mm/mmu_context_hash64_iommu.c 
> b/arch/powerpc/mm/mmu_context_hash64_iommu.c
> new file mode 100644
> index 000..c268c4d
> --- /dev/null
> +++ b/arch/powerpc/mm/mmu_context_hash64_iommu.c
> @@ -0,0 +1,215 @@
> +/*
> + *  IOMMU helpers in MMU context.
> + *
> + *  Copyright (C) 2015 IBM Corp. 
> + *
> + *  This program is free software; you can redistribute it and

Re: [PATCH kernel v7 04/31] vfio: powerpc/spapr: Use it_page_size

2015-04-01 Thread Alex Williamson
On Sat, 2015-03-28 at 01:54 +1100, Alexey Kardashevskiy wrote:
> This makes use of the it_page_size from the iommu_table struct
> as page size can differ.
> 
> This replaces missing IOMMU_PAGE_SHIFT macro in commented debug code
> as recently introduced IOMMU_PAGE_XXX macros do not include
> IOMMU_PAGE_SHIFT.
> 
> Signed-off-by: Alexey Kardashevskiy 
> Reviewed-by: David Gibson 
> ---
>  drivers/vfio/vfio_iommu_spapr_tce.c | 26 +-
>  1 file changed, 13 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
> b/drivers/vfio/vfio_iommu_spapr_tce.c
> index f835e63..8bbee22 100644
> --- a/drivers/vfio/vfio_iommu_spapr_tce.c
> +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
> @@ -91,7 +91,7 @@ static int tce_iommu_enable(struct tce_container *container)
>* enforcing the limit based on the max that the guest can map.
>*/
>   down_write(&current->mm->mmap_sem);
> - npages = (tbl->it_size << IOMMU_PAGE_SHIFT_4K) >> PAGE_SHIFT;
> + npages = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT;
>   locked = current->mm->locked_vm + npages;
>   lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
>   if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
> @@ -120,7 +120,7 @@ static void tce_iommu_disable(struct tce_container 
> *container)
>  
>   down_write(&current->mm->mmap_sem);
>   current->mm->locked_vm -= (container->tbl->it_size <<
> - IOMMU_PAGE_SHIFT_4K) >> PAGE_SHIFT;
> + container->tbl->it_page_shift) >> PAGE_SHIFT;
>   up_write(&current->mm->mmap_sem);
>  }
>  
> @@ -222,7 +222,7 @@ static long tce_iommu_build(struct tce_container 
> *container,
>   tce, ret);
>   break;
>   }
> - tce += IOMMU_PAGE_SIZE_4K;
> + tce += IOMMU_PAGE_SIZE(tbl);


Is PAGE_SIZE ever smaller than IOMMU_PAGE_SIZE(tbl)?  IOW, can the page
we got from get_user_pages_fast() ever not completely fill the tce
entry?

(Have I asked this before?  Sorry if so)

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH kernel v7 12/31] powerpc/spapr: vfio: Switch from iommu_table to new iommu_table_group

2015-04-01 Thread Alex Williamson
On Sat, 2015-03-28 at 01:54 +1100, Alexey Kardashevskiy wrote:
> Modern IBM POWERPC systems support multiple (currently two) TCE tables
> per IOMMU group (a.k.a. PE). This adds an iommu_table_group container
> for TCE tables. Right now just one table is supported.
> 
> Signed-off-by: Alexey Kardashevskiy 
> ---
>  Documentation/vfio.txt  |  23 ++
>  arch/powerpc/include/asm/iommu.h|  18 +++--
>  arch/powerpc/kernel/iommu.c |  34 
>  arch/powerpc/platforms/powernv/pci-ioda.c   |  38 +
>  arch/powerpc/platforms/powernv/pci-p5ioc2.c |  17 ++--
>  arch/powerpc/platforms/powernv/pci.c|   2 +-
>  arch/powerpc/platforms/powernv/pci.h|   4 +-
>  arch/powerpc/platforms/pseries/iommu.c  |   9 ++-
>  drivers/vfio/vfio_iommu_spapr_tce.c | 120 
> 
>  9 files changed, 183 insertions(+), 82 deletions(-)
> 
> diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
> index 96978ec..94328c8 100644
> --- a/Documentation/vfio.txt
> +++ b/Documentation/vfio.txt
> @@ -427,6 +427,29 @@ The code flow from the example above should be slightly 
> changed:
>  
>   
>  
> +5) There is v2 of SPAPR TCE IOMMU. It deprecates VFIO_IOMMU_ENABLE/
> +VFIO_IOMMU_DISABLE and implements 2 new ioctls:
> +VFIO_IOMMU_SPAPR_REGISTER_MEMORY and VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY
> +(which are unsupported in v1 IOMMU).
> +
> +PPC64 paravirtualized guests generate a lot of map/unmap requests,
> +and the handling of those includes pinning/unpinning pages and updating
> +mm::locked_vm counter to make sure we do not exceed the rlimit.
> +The v2 IOMMU splits accounting and pinning into separate operations:
> +
> +- VFIO_IOMMU_SPAPR_REGISTER_MEMORY/VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY ioctls
> +receive a user space address and size of the block to be pinned.
> +Bisecting is not supported and VFIO_IOMMU_UNREGISTER_MEMORY is expected to
> +be called with the exact address and size used for registering
> +the memory block. The userspace is not expected to call these often.
> +The ranges are stored in a linked list in a VFIO container.
> +
> +- VFIO_IOMMU_MAP_DMA/VFIO_IOMMU_UNMAP_DMA ioctls only update the actual
> +IOMMU table and do not do pinning; instead these check that the userspace
> +address is from pre-registered range.
> +
> +This separation helps in optimizing DMA for guests.
> +
>  
> ---
>  
>  [1] VFIO was originally an acronym for "Virtual Function I/O" in its
> diff --git a/arch/powerpc/include/asm/iommu.h 
> b/arch/powerpc/include/asm/iommu.h
> index eb75726..667aa1a 100644
> --- a/arch/powerpc/include/asm/iommu.h
> +++ b/arch/powerpc/include/asm/iommu.h
> @@ -90,9 +90,7 @@ struct iommu_table {
>   struct iommu_pool pools[IOMMU_NR_POOLS];
>   unsigned long *it_map;   /* A simple allocation bitmap for now */
>   unsigned long  it_page_shift;/* table iommu page size */
> -#ifdef CONFIG_IOMMU_API
> - struct iommu_group *it_group;
> -#endif
> + struct iommu_table_group *it_group;
>   struct iommu_table_ops *it_ops;
>   void (*set_bypass)(struct iommu_table *tbl, bool enable);
>  };
> @@ -126,14 +124,24 @@ extern void iommu_free_table(struct iommu_table *tbl, 
> const char *node_name);
>   */
>  extern struct iommu_table *iommu_init_table(struct iommu_table * tbl,
>   int nid);
> +
> +#define IOMMU_TABLE_GROUP_MAX_TABLES 1
> +
> +struct iommu_table_group {
>  #ifdef CONFIG_IOMMU_API
> -extern void iommu_register_group(struct iommu_table *tbl,
> + struct iommu_group *group;
> +#endif
> + struct iommu_table tables[IOMMU_TABLE_GROUP_MAX_TABLES];
> +};
> +
> +#ifdef CONFIG_IOMMU_API
> +extern void iommu_register_group(struct iommu_table_group *table_group,
>int pci_domain_number, unsigned long pe_num);
>  extern int iommu_add_device(struct device *dev);
>  extern void iommu_del_device(struct device *dev);
>  extern int __init tce_iommu_bus_notifier_init(void);
>  #else
> -static inline void iommu_register_group(struct iommu_table *tbl,
> +static inline void iommu_register_group(struct iommu_table_group 
> *table_group,
>   int pci_domain_number,
>   unsigned long pe_num)


Not a new problem, but there's some awfully liberal use of the namespace
with function names here.  IOMMU API uses iommu_foo() functions.  IOMMU
group related interfaces within the IOMMU API include "group" somewhere
in that name.  powerpc specific functions should include a tag to avoid
causing conflicts there.

(sorry for commenting twice on the same patch)

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH kernel v7 04/31] vfio: powerpc/spapr: Use it_page_size

2015-04-01 Thread Alex Williamson
On Thu, 2015-04-02 at 13:30 +1100, Alexey Kardashevskiy wrote:
> On 04/02/2015 08:48 AM, Alex Williamson wrote:
> > On Sat, 2015-03-28 at 01:54 +1100, Alexey Kardashevskiy wrote:
> >> This makes use of the it_page_size from the iommu_table struct
> >> as page size can differ.
> >>
> >> This replaces missing IOMMU_PAGE_SHIFT macro in commented debug code
> >> as recently introduced IOMMU_PAGE_XXX macros do not include
> >> IOMMU_PAGE_SHIFT.
> >>
> >> Signed-off-by: Alexey Kardashevskiy 
> >> Reviewed-by: David Gibson 
> >> ---
> >>   drivers/vfio/vfio_iommu_spapr_tce.c | 26 +-
> >>   1 file changed, 13 insertions(+), 13 deletions(-)
> >>
> >> diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
> >> b/drivers/vfio/vfio_iommu_spapr_tce.c
> >> index f835e63..8bbee22 100644
> >> --- a/drivers/vfio/vfio_iommu_spapr_tce.c
> >> +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
> >> @@ -91,7 +91,7 @@ static int tce_iommu_enable(struct tce_container 
> >> *container)
> >> * enforcing the limit based on the max that the guest can map.
> >> */
> >>down_write(&current->mm->mmap_sem);
> >> -  npages = (tbl->it_size << IOMMU_PAGE_SHIFT_4K) >> PAGE_SHIFT;
> >> +  npages = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT;
> >>locked = current->mm->locked_vm + npages;
> >>lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> >>if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
> >> @@ -120,7 +120,7 @@ static void tce_iommu_disable(struct tce_container 
> >> *container)
> >>
> >>down_write(&current->mm->mmap_sem);
> >>current->mm->locked_vm -= (container->tbl->it_size <<
> >> -  IOMMU_PAGE_SHIFT_4K) >> PAGE_SHIFT;
> >> +  container->tbl->it_page_shift) >> PAGE_SHIFT;
> >>up_write(&current->mm->mmap_sem);
> >>   }
> >>
> >> @@ -222,7 +222,7 @@ static long tce_iommu_build(struct tce_container 
> >> *container,
> >>tce, ret);
> >>break;
> >>}
> >> -  tce += IOMMU_PAGE_SIZE_4K;
> >> +  tce += IOMMU_PAGE_SIZE(tbl);
> >
> >
> > Is PAGE_SIZE ever smaller than IOMMU_PAGE_SIZE(tbl)?  IOW, can the page
> > we got from get_user_pages_fast() ever not completely fill the tce
> > entry?
> 
> 
> Yes. IOMMU_PAGE_SIZE is 4K/64K/16M (16M is with huge pages enabled in QEMU 
> with -mempath), PAGE_SIZE is 4K/64K (normally 64K).

Isn't that a problem then that you're filling the tce with processor
page sizes via get_user_pages_fast(), but incrementing the tce by the
IOMMU page size?  For example, if PAGE_SIZE = 4K and IOMMU_PAGE_SIZE !=
4K have we really pinned all of the memory backed by the tce?  Where do
you make sure the 4K page is really contiguous for the IOMMU page?

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH kernel v7 12/31] powerpc/spapr: vfio: Switch from iommu_table to new iommu_table_group

2015-04-01 Thread Alex Williamson
On Thu, 2015-04-02 at 13:33 +1100, Alexey Kardashevskiy wrote:
> On 04/02/2015 08:48 AM, Alex Williamson wrote:
> > On Sat, 2015-03-28 at 01:54 +1100, Alexey Kardashevskiy wrote:
> >> Modern IBM POWERPC systems support multiple (currently two) TCE tables
> >> per IOMMU group (a.k.a. PE). This adds an iommu_table_group container
> >> for TCE tables. Right now just one table is supported.
> >>
> >> Signed-off-by: Alexey Kardashevskiy 
> >> ---
> >>   Documentation/vfio.txt  |  23 ++
> >>   arch/powerpc/include/asm/iommu.h|  18 +++--
> >>   arch/powerpc/kernel/iommu.c |  34 
> >>   arch/powerpc/platforms/powernv/pci-ioda.c   |  38 +
> >>   arch/powerpc/platforms/powernv/pci-p5ioc2.c |  17 ++--
> >>   arch/powerpc/platforms/powernv/pci.c|   2 +-
> >>   arch/powerpc/platforms/powernv/pci.h|   4 +-
> >>   arch/powerpc/platforms/pseries/iommu.c  |   9 ++-
> >>   drivers/vfio/vfio_iommu_spapr_tce.c | 120 
> >> 
> >>   9 files changed, 183 insertions(+), 82 deletions(-)
> >>
> >> diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
> >> index 96978ec..94328c8 100644
> >> --- a/Documentation/vfio.txt
> >> +++ b/Documentation/vfio.txt
> >> @@ -427,6 +427,29 @@ The code flow from the example above should be 
> >> slightly changed:
> >>
> >>
> >>
> >> +5) There is v2 of SPAPR TCE IOMMU. It deprecates VFIO_IOMMU_ENABLE/
> >> +VFIO_IOMMU_DISABLE and implements 2 new ioctls:
> >> +VFIO_IOMMU_SPAPR_REGISTER_MEMORY and VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY
> >> +(which are unsupported in v1 IOMMU).
> >> +
> >> +PPC64 paravirtualized guests generate a lot of map/unmap requests,
> >> +and the handling of those includes pinning/unpinning pages and updating
> >> +mm::locked_vm counter to make sure we do not exceed the rlimit.
> >> +The v2 IOMMU splits accounting and pinning into separate operations:
> >> +
> >> +- VFIO_IOMMU_SPAPR_REGISTER_MEMORY/VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY 
> >> ioctls
> >> +receive a user space address and size of the block to be pinned.
> >> +Bisecting is not supported and VFIO_IOMMU_UNREGISTER_MEMORY is expected to
> >> +be called with the exact address and size used for registering
> >> +the memory block. The userspace is not expected to call these often.
> >> +The ranges are stored in a linked list in a VFIO container.
> >> +
> >> +- VFIO_IOMMU_MAP_DMA/VFIO_IOMMU_UNMAP_DMA ioctls only update the actual
> >> +IOMMU table and do not do pinning; instead these check that the userspace
> >> +address is from pre-registered range.
> >> +
> >> +This separation helps in optimizing DMA for guests.
> >> +
> >>   
> >> ---
> >>
> >>   [1] VFIO was originally an acronym for "Virtual Function I/O" in its
> >> diff --git a/arch/powerpc/include/asm/iommu.h 
> >> b/arch/powerpc/include/asm/iommu.h
> >> index eb75726..667aa1a 100644
> >> --- a/arch/powerpc/include/asm/iommu.h
> >> +++ b/arch/powerpc/include/asm/iommu.h
> >> @@ -90,9 +90,7 @@ struct iommu_table {
> >>struct iommu_pool pools[IOMMU_NR_POOLS];
> >>unsigned long *it_map;   /* A simple allocation bitmap for now */
> >>unsigned long  it_page_shift;/* table iommu page size */
> >> -#ifdef CONFIG_IOMMU_API
> >> -  struct iommu_group *it_group;
> >> -#endif
> >> +  struct iommu_table_group *it_group;
> >>struct iommu_table_ops *it_ops;
> >>void (*set_bypass)(struct iommu_table *tbl, bool enable);
> >>   };
> >> @@ -126,14 +124,24 @@ extern void iommu_free_table(struct iommu_table 
> >> *tbl, const char *node_name);
> >>*/
> >>   extern struct iommu_table *iommu_init_table(struct iommu_table * tbl,
> >>int nid);
> >> +
> >> +#define IOMMU_TABLE_GROUP_MAX_TABLES  1
> >> +
> >> +struct iommu_table_group {
> >>   #ifdef CONFIG_IOMMU_API
> >> -extern void iommu_register_group(struct iommu_table *tbl,
> >> +  struct iommu_group *group;
> >> +#endif
> >> +  struct iommu_table tables[IOMMU_TABLE_GROUP_MAX_TABLES];
> >> +};
> >> +
> >> +#ifdef CONFIG_IOMMU_API
> >> +extern void io

Re: [PATCH kernel v7 04/31] vfio: powerpc/spapr: Use it_page_size

2015-04-01 Thread Alex Williamson
On Thu, 2015-04-02 at 14:40 +1100, Alexey Kardashevskiy wrote:
> On 04/02/2015 01:50 PM, Alex Williamson wrote:
> > On Thu, 2015-04-02 at 13:30 +1100, Alexey Kardashevskiy wrote:
> >> On 04/02/2015 08:48 AM, Alex Williamson wrote:
> >>> On Sat, 2015-03-28 at 01:54 +1100, Alexey Kardashevskiy wrote:
> >>>> This makes use of the it_page_size from the iommu_table struct
> >>>> as page size can differ.
> >>>>
> >>>> This replaces missing IOMMU_PAGE_SHIFT macro in commented debug code
> >>>> as recently introduced IOMMU_PAGE_XXX macros do not include
> >>>> IOMMU_PAGE_SHIFT.
> >>>>
> >>>> Signed-off-by: Alexey Kardashevskiy 
> >>>> Reviewed-by: David Gibson 
> >>>> ---
> >>>>drivers/vfio/vfio_iommu_spapr_tce.c | 26 +-
> >>>>1 file changed, 13 insertions(+), 13 deletions(-)
> >>>>
> >>>> diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
> >>>> b/drivers/vfio/vfio_iommu_spapr_tce.c
> >>>> index f835e63..8bbee22 100644
> >>>> --- a/drivers/vfio/vfio_iommu_spapr_tce.c
> >>>> +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
> >>>> @@ -91,7 +91,7 @@ static int tce_iommu_enable(struct tce_container 
> >>>> *container)
> >>>>   * enforcing the limit based on the max that the guest can map.
> >>>>   */
> >>>>  down_write(&current->mm->mmap_sem);
> >>>> -npages = (tbl->it_size << IOMMU_PAGE_SHIFT_4K) >> PAGE_SHIFT;
> >>>> +npages = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT;
> >>>>  locked = current->mm->locked_vm + npages;
> >>>>  lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> >>>>  if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
> >>>> @@ -120,7 +120,7 @@ static void tce_iommu_disable(struct tce_container 
> >>>> *container)
> >>>>
> >>>>  down_write(&current->mm->mmap_sem);
> >>>>  current->mm->locked_vm -= (container->tbl->it_size <<
> >>>> -IOMMU_PAGE_SHIFT_4K) >> PAGE_SHIFT;
> >>>> +container->tbl->it_page_shift) >> PAGE_SHIFT;
> >>>>  up_write(&current->mm->mmap_sem);
> >>>>}
> >>>>
> >>>> @@ -222,7 +222,7 @@ static long tce_iommu_build(struct tce_container 
> >>>> *container,
> >>>>  tce, ret);
> >>>>  break;
> >>>>  }
> >>>> -tce += IOMMU_PAGE_SIZE_4K;
> >>>> +tce += IOMMU_PAGE_SIZE(tbl);
> >>>
> >>>
> >>> Is PAGE_SIZE ever smaller than IOMMU_PAGE_SIZE(tbl)?  IOW, can the page
> >>> we got from get_user_pages_fast() ever not completely fill the tce
> >>> entry?
> >>
> >>
> >> Yes. IOMMU_PAGE_SIZE is 4K/64K/16M (16M is with huge pages enabled in QEMU
> >> with -mempath), PAGE_SIZE is 4K/64K (normally 64K).
> >
> > Isn't that a problem then that you're filling the tce with processor
> > page sizes via get_user_pages_fast(), but incrementing the tce by the
> > IOMMU page size?  For example, if PAGE_SIZE = 4K and IOMMU_PAGE_SIZE !=
> > 4K have we really pinned all of the memory backed by the tce?  Where do
> > you make sure the 4K page is really contiguous for the IOMMU page?
> 
> 
> Aaaah. This is just not supported. Instead, after the previous patch 
> ("vfio: powerpc/spapr: Check that TCE page size is equal to it_page_size", 
> which needs its subject fixed), tce_page_is_contained(page4K, 64K) will return 
> false and the caller - tce_iommu_build() - will return -EPERM.

Ok, makes sense.  Thanks

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH kernel v7 26/31] powerpc/iommu: Add userspace view of TCE table

2015-04-02 Thread Alex Williamson

Should have sent this with the other comments, but found it hiding on my
desktop...

On Sat, 2015-03-28 at 01:55 +1100, Alexey Kardashevskiy wrote:
> In order to support memory pre-registration, we need a way to track
> the use of every registered memory region and only allow unregistration
> if a region is not in use anymore. So we need a way to tell from what
> region the just cleared TCE was from.
> 
> This adds a userspace view of the TCE table into iommu_table struct.
> It contains userspace address, one per TCE entry. The table is only
> allocated when the ownership over an IOMMU group is taken which means
> it is only used from outside of the powernv code (such as VFIO).
> 
> Signed-off-by: Alexey Kardashevskiy 
> ---
>  arch/powerpc/include/asm/iommu.h  |  6 ++
>  arch/powerpc/kernel/iommu.c   |  7 +++
>  arch/powerpc/platforms/powernv/pci-ioda.c | 23 ++-
>  3 files changed, 35 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/include/asm/iommu.h 
> b/arch/powerpc/include/asm/iommu.h
> index 2c08c91..a768a4d 100644
> --- a/arch/powerpc/include/asm/iommu.h
> +++ b/arch/powerpc/include/asm/iommu.h
> @@ -106,9 +106,15 @@ struct iommu_table {
>   unsigned long *it_map;   /* A simple allocation bitmap for now */
>   unsigned long  it_page_shift;/* table iommu page size */
>   struct iommu_table_group *it_group;
> + unsigned long *it_userspace; /* userspace view of the table */
>   struct iommu_table_ops *it_ops;
>  };
>  
> +#define IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry) \
> + ((tbl)->it_userspace ? \
> + &((tbl)->it_userspace[(entry) - (tbl)->it_offset]) : \
> + NULL)
> +
>  /* Pure 2^n version of get_order */
>  static inline __attribute_const__
>  int get_iommu_order(unsigned long size, struct iommu_table *tbl)
> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> index 0bcd988..82102d1 100644
> --- a/arch/powerpc/kernel/iommu.c
> +++ b/arch/powerpc/kernel/iommu.c
> @@ -38,6 +38,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -1069,6 +1070,9 @@ static int iommu_table_take_ownership(struct 
> iommu_table *tbl)
>   spin_unlock(&tbl->pools[i].lock);
>   spin_unlock_irqrestore(&tbl->large_pool.lock, flags);
>  
> + BUG_ON(tbl->it_userspace);
> + tbl->it_userspace = vzalloc(sizeof(*tbl->it_userspace) * tbl->it_size);
> +

-ENOMEM?

>   return 0;
>  }
>  
> @@ -1102,6 +1106,9 @@ static void iommu_table_release_ownership(struct 
> iommu_table *tbl)
>  {
>   unsigned long flags, i, sz = (tbl->it_size + 7) >> 3;
>  
> + vfree(tbl->it_userspace);
> + tbl->it_userspace = NULL;
> +
>   spin_lock_irqsave(&tbl->large_pool.lock, flags);
>   for (i = 0; i < tbl->nr_pools; i++)
>   spin_lock(&tbl->pools[i].lock);
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
> b/arch/powerpc/platforms/powernv/pci-ioda.c
> index bc36cf1..036f3c1 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> @@ -26,6 +26,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  #include 
>  #include 
> @@ -1469,6 +1470,9 @@ static void pnv_pci_free_table(struct iommu_table *tbl)
>   if (!tbl->it_size)
>   return;
>  
> + if (tbl->it_userspace)

Not necessary

> + vfree(tbl->it_userspace);
> +

Why no NULL setting this time?

>   pnv_free_tce_table(tbl->it_base, size, tbl->it_indirect_levels);
>   iommu_reset_table(tbl, "ioda2");
>  }
> @@ -1656,9 +1660,26 @@ static void pnv_ioda2_set_ownership(struct 
> iommu_table_group *table_group,
>   pnv_pci_ioda2_set_bypass(pe, !enable);
>  }
>  
> +static long pnv_pci_ioda2_create_table_with_uas(
> + struct iommu_table_group *table_group,
> + int num, __u32 page_shift, __u64 window_size, __u32 levels,
> + struct iommu_table *tbl)
> +{
> + long ret = pnv_pci_ioda2_create_table(table_group, num,
> + page_shift, window_size, levels, tbl);
> +
> + if (ret)
> + return ret;
> +
> + BUG_ON(tbl->it_userspace);
> + tbl->it_userspace = vzalloc(sizeof(*tbl->it_userspace) * tbl->it_size);

-ENOMEM

> +
> + return 0;
> +}
> +
>  static struct iommu_table_group_ops pnv_pci_ioda2_ops = {
>   .set_ownership = pnv_ioda2_set_ownership,
> - .create_table = pnv_pci_ioda2_create_table,
> + .create_table = pnv_pci_ioda2_create_table_with_uas,
>   .set_window = pnv_pci_ioda2_set_window,
>   .unset_window = pnv_pci_ioda2_unset_window,
>  };



___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH kernel v7 26/31] powerpc/iommu: Add userspace view of TCE table

2015-04-08 Thread Alex Williamson
On Wed, 2015-04-08 at 13:22 +1000, Alexey Kardashevskiy wrote:
> On 04/03/2015 07:50 AM, Alex Williamson wrote:
> >
> > Should have sent this with the other comments, but found it hiding on my
> > desktop...
> >
> > On Sat, 2015-03-28 at 01:55 +1100, Alexey Kardashevskiy wrote:
> >> In order to support memory pre-registration, we need a way to track
> >> the use of every registered memory region and only allow unregistration
> >> if a region is not in use anymore. So we need a way to tell which
> >> region the just-cleared TCE came from.
> >>
> >> This adds a userspace view of the TCE table into iommu_table struct.
> >> It contains userspace address, one per TCE entry. The table is only
> >> allocated when the ownership over an IOMMU group is taken which means
> >> it is only used from outside of the powernv code (such as VFIO).
> >>
> >> Signed-off-by: Alexey Kardashevskiy 
> >> ---
> >>   arch/powerpc/include/asm/iommu.h  |  6 ++
> >>   arch/powerpc/kernel/iommu.c   |  7 +++
> >>   arch/powerpc/platforms/powernv/pci-ioda.c | 23 ++-
> >>   3 files changed, 35 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/arch/powerpc/include/asm/iommu.h 
> >> b/arch/powerpc/include/asm/iommu.h
> >> index 2c08c91..a768a4d 100644
> >> --- a/arch/powerpc/include/asm/iommu.h
> >> +++ b/arch/powerpc/include/asm/iommu.h
> >> @@ -106,9 +106,15 @@ struct iommu_table {
> >>unsigned long *it_map;   /* A simple allocation bitmap for now */
> >>unsigned long  it_page_shift;/* table iommu page size */
> >>struct iommu_table_group *it_group;
> >> +  unsigned long *it_userspace; /* userspace view of the table */
> >>struct iommu_table_ops *it_ops;
> >>   };
> >>
> >> +#define IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry) \
> >> +  ((tbl)->it_userspace ? \
> >> +  &((tbl)->it_userspace[(entry) - (tbl)->it_offset]) : \
> >> +  NULL)
> >> +
> >>   /* Pure 2^n version of get_order */
> >>   static inline __attribute_const__
> >>   int get_iommu_order(unsigned long size, struct iommu_table *tbl)
> >> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> >> index 0bcd988..82102d1 100644
> >> --- a/arch/powerpc/kernel/iommu.c
> >> +++ b/arch/powerpc/kernel/iommu.c
> >> @@ -38,6 +38,7 @@
> >>   #include 
> >>   #include 
> >>   #include 
> >> +#include 
> >>   #include 
> >>   #include 
> >>   #include 
> >> @@ -1069,6 +1070,9 @@ static int iommu_table_take_ownership(struct 
> >> iommu_table *tbl)
> >>spin_unlock(&tbl->pools[i].lock);
> >>spin_unlock_irqrestore(&tbl->large_pool.lock, flags);
> >>
> >> +  BUG_ON(tbl->it_userspace);
> >> +  tbl->it_userspace = vzalloc(sizeof(*tbl->it_userspace) * tbl->it_size);
> >> +
> >
> > -ENOMEM?
> >
> >>return 0;
> >>   }
> >>
> >> @@ -1102,6 +1106,9 @@ static void iommu_table_release_ownership(struct 
> >> iommu_table *tbl)
> >>   {
> >>unsigned long flags, i, sz = (tbl->it_size + 7) >> 3;
> >>
> >> +  vfree(tbl->it_userspace);
> >> +  tbl->it_userspace = NULL;
> >> +
> >>spin_lock_irqsave(&tbl->large_pool.lock, flags);
> >>for (i = 0; i < tbl->nr_pools; i++)
> >>spin_lock(&tbl->pools[i].lock);
> >> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
> >> b/arch/powerpc/platforms/powernv/pci-ioda.c
> >> index bc36cf1..036f3c1 100644
> >> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> >> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> >> @@ -26,6 +26,7 @@
> >>   #include 
> >>   #include 
> >>   #include 
> >> +#include 
> >>
> >>   #include 
> >>   #include 
> >> @@ -1469,6 +1470,9 @@ static void pnv_pci_free_table(struct iommu_table 
> >> *tbl)
> >>if (!tbl->it_size)
> >>return;
> >>
> >> +  if (tbl->it_userspace)
> >
> > Not necessary
> 
> Out of curiosity - why? Is every single implementation known for
> checking the argument?

AFAIK, all flavors of free in the kernel accept NULL pointers and do the
right thing.  I verified this one does t

Re: [PATCH kernel v8 15/31] powerpc/iommu: Fix IOMMU ownership control functions

2015-04-10 Thread Alex Williamson
On Fri, 2015-04-10 at 16:30 +1000, Alexey Kardashevskiy wrote:
> This adds missing locks in iommu_take_ownership()/
> iommu_release_ownership().
> 
> This marks all pages busy in iommu_table::it_map in order to catch
> errors if there is an attempt to use this table while ownership over it
> is taken.
> 
> This only clears TCE content if there is no page marked busy in it_map.
> Clearing must be done outside of the table locks as iommu_clear_tce()
> called from iommu_clear_tces_and_put_pages() does this.
> 
> Signed-off-by: Alexey Kardashevskiy 
> ---
> Changes:
> v5:
> * do not store bit#0 value, it has to be set for zero-based table
> anyway
> * removed test_and_clear_bit
> ---
>  arch/powerpc/kernel/iommu.c | 26 ++
>  1 file changed, 22 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> index 7d6089b..068fe4ff 100644
> --- a/arch/powerpc/kernel/iommu.c
> +++ b/arch/powerpc/kernel/iommu.c
> @@ -1052,17 +1052,28 @@ EXPORT_SYMBOL_GPL(iommu_tce_build);
>  
>  static int iommu_table_take_ownership(struct iommu_table *tbl)
>  {
> - unsigned long sz = (tbl->it_size + 7) >> 3;
> + unsigned long flags, i, sz = (tbl->it_size + 7) >> 3;
> + int ret = 0;
> +
> + spin_lock_irqsave(&tbl->large_pool.lock, flags);
> + for (i = 0; i < tbl->nr_pools; i++)
> + spin_lock(&tbl->pools[i].lock);
>  
>   if (tbl->it_offset == 0)
>   clear_bit(0, tbl->it_map);
>  
>   if (!bitmap_empty(tbl->it_map, tbl->it_size)) {
>   pr_err("iommu_tce: it_map is not empty");
> - return -EBUSY;
> + ret = -EBUSY;


This error is never returned.


> + if (tbl->it_offset == 0)
> + set_bit(0, tbl->it_map);
> + } else {
> + memset(tbl->it_map, 0xff, sz);
>   }
>  
> - memset(tbl->it_map, 0xff, sz);
> + for (i = 0; i < tbl->nr_pools; i++)
> + spin_unlock(&tbl->pools[i].lock);
> + spin_unlock_irqrestore(&tbl->large_pool.lock, flags);
>  
>   return 0;
>  }
> @@ -1095,7 +1106,11 @@ EXPORT_SYMBOL_GPL(iommu_take_ownership);
>  
>  static void iommu_table_release_ownership(struct iommu_table *tbl)
>  {
> - unsigned long sz = (tbl->it_size + 7) >> 3;
> + unsigned long flags, i, sz = (tbl->it_size + 7) >> 3;
> +
> + spin_lock_irqsave(&tbl->large_pool.lock, flags);
> + for (i = 0; i < tbl->nr_pools; i++)
> + spin_lock(&tbl->pools[i].lock);
>  
>   memset(tbl->it_map, 0, sz);
>  
> @@ -1103,6 +1118,9 @@ static void iommu_table_release_ownership(struct 
> iommu_table *tbl)
>   if (tbl->it_offset == 0)
>   set_bit(0, tbl->it_map);
>  
> + for (i = 0; i < tbl->nr_pools; i++)
> + spin_unlock(&tbl->pools[i].lock);
> + spin_unlock_irqrestore(&tbl->large_pool.lock, flags);
>  }
>  
>  extern void iommu_release_ownership(struct iommu_table_group *table_group)



___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH kernel v8 26/31] powerpc/iommu: Add userspace view of TCE table

2015-04-10 Thread Alex Williamson
On Fri, 2015-04-10 at 16:31 +1000, Alexey Kardashevskiy wrote:
> In order to support memory pre-registration, we need a way to track
> the use of every registered memory region and only allow unregistration
> if a region is not in use anymore. So we need a way to tell which
> region the just-cleared TCE came from.
> 
> This adds a userspace view of the TCE table into iommu_table struct.
> It contains userspace address, one per TCE entry. The table is only
> allocated when the ownership over an IOMMU group is taken which means
> it is only used from outside of the powernv code (such as VFIO).
> 
> Signed-off-by: Alexey Kardashevskiy 
> ---
> Changes:
> v8:
> * added ENOMEM on failed vzalloc()
> ---
>  arch/powerpc/include/asm/iommu.h  |  6 ++
>  arch/powerpc/kernel/iommu.c   |  9 +
>  arch/powerpc/platforms/powernv/pci-ioda.c | 25 -
>  3 files changed, 39 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/include/asm/iommu.h 
> b/arch/powerpc/include/asm/iommu.h
> index 2c08c91..a768a4d 100644
> --- a/arch/powerpc/include/asm/iommu.h
> +++ b/arch/powerpc/include/asm/iommu.h
> @@ -106,9 +106,15 @@ struct iommu_table {
>   unsigned long *it_map;   /* A simple allocation bitmap for now */
>   unsigned long  it_page_shift;/* table iommu page size */
>   struct iommu_table_group *it_group;
> + unsigned long *it_userspace; /* userspace view of the table */
>   struct iommu_table_ops *it_ops;
>  };
>  
> +#define IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry) \
> + ((tbl)->it_userspace ? \
> + &((tbl)->it_userspace[(entry) - (tbl)->it_offset]) : \
> + NULL)
> +
>  /* Pure 2^n version of get_order */
>  static inline __attribute_const__
>  int get_iommu_order(unsigned long size, struct iommu_table *tbl)
> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> index 0bcd988..833b396 100644
> --- a/arch/powerpc/kernel/iommu.c
> +++ b/arch/powerpc/kernel/iommu.c
> @@ -38,6 +38,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -1069,6 +1070,11 @@ static int iommu_table_take_ownership(struct 
> iommu_table *tbl)
>   spin_unlock(&tbl->pools[i].lock);
>   spin_unlock_irqrestore(&tbl->large_pool.lock, flags);
>  
> + BUG_ON(tbl->it_userspace);
> + tbl->it_userspace = vzalloc(sizeof(*tbl->it_userspace) * tbl->it_size);
> + if (!tbl->it_userspace)
> + return -ENOMEM;
> +

It would really make more sense from an error path perspective in this
function if the vzalloc were done first.  Doing it at the end, you need
to consider whether anything previous needs to be un-done.  Also note
that this -ENOMEM return clobbers the -EBUSY if you fix 15/31 to return
"ret".

>   return 0;
>  }
>  
> @@ -1102,6 +1108,9 @@ static void iommu_table_release_ownership(struct 
> iommu_table *tbl)
>  {
>   unsigned long flags, i, sz = (tbl->it_size + 7) >> 3;
>  
> + vfree(tbl->it_userspace);
> + tbl->it_userspace = NULL;
> +
>   spin_lock_irqsave(&tbl->large_pool.lock, flags);
>   for (i = 0; i < tbl->nr_pools; i++)
>   spin_lock(&tbl->pools[i].lock);
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
> b/arch/powerpc/platforms/powernv/pci-ioda.c
> index 751aeab..3ac523d 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> @@ -26,6 +26,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  #include 
>  #include 
> @@ -1469,6 +1470,9 @@ static void pnv_pci_free_table(struct iommu_table *tbl)
>   if (!tbl->it_size)
>   return;
>  
> + vfree(tbl->it_userspace);
> + tbl->it_userspace = NULL;
> +
>   pnv_free_tce_table(tbl->it_base, size, tbl->it_indirect_levels);
>   iommu_reset_table(tbl, "ioda2");
>  }
> @@ -1656,9 +1660,28 @@ static void pnv_ioda2_set_ownership(struct 
> iommu_table_group *table_group,
>   pnv_pci_ioda2_set_bypass(pe, !enable);
>  }
>  
> +static long pnv_pci_ioda2_create_table_with_uas(
> + struct iommu_table_group *table_group,
> + int num, __u32 page_shift, __u64 window_size, __u32 levels,
> + struct iommu_table *tbl)
> +{
> + long ret = pnv_pci_ioda2_create_table(table_group, num,
> + page_shift, window_size, levels, tbl);
> +
> + if (ret)
> + return ret;
> +
> + BUG_ON(tbl->it_userspace);
> + tbl->it_userspace = vzalloc(sizeof(*tbl->it_userspace) * tbl->it_size);
> + if (!tbl->it_userspace)
> + return -ENOMEM;


So all of the work done in pnv_pci_ioda2_create_table() can just be
ignored, we undo nothing and return -ENOMEM?  Again, doing the
allocation first might make a lot more sense than slapping on an -ENOMEM
and calling the error handling "good".

> +
> + return 0;
> +}
> +
>  static struct iommu_table_group_ops pnv_pci_ioda2_ops = {
>  

Re: [PATCH kernel v8 00/31] powerpc/iommu/vfio: Enable Dynamic DMA windows

2015-04-10 Thread Alex Williamson
/iommu: Move tce_xxx callbacks from ppc_md to iommu_table
>   powerpc/iommu: Introduce iommu_table_alloc() helper
>   powerpc/spapr: vfio: Switch from iommu_table to new iommu_table_group
>   vfio: powerpc/spapr: powerpc/iommu: Rework IOMMU ownership control
>   vfio: powerpc/spapr: powerpc/powernv/ioda2: Rework IOMMU ownership
> control
>   powerpc/iommu: Fix IOMMU ownership control functions
>   powerpc/powernv/ioda/ioda2: Rework tce_build()/tce_free()
>   powerpc/iommu/powernv: Release replaced TCE
>   powerpc/powernv/ioda2: Rework iommu_table creation
>   powerpc/powernv/ioda2: Introduce
> pnv_pci_ioda2_create_table/pnv_pci_free_table
>   powerpc/powernv/ioda2: Introduce pnv_pci_ioda2_set_window
>   powerpc/iommu: Split iommu_free_table into 2 helpers
>   powerpc/powernv: Implement multilevel TCE tables
>   powerpc/powernv: Change prototypes to receive iommu
>   powerpc/powernv/ioda: Define and implement DMA table/window management
> callbacks
>   vfio: powerpc/spapr: powerpc/powernv/ioda2: Rework ownership
>   powerpc/iommu: Add userspace view of TCE table
>   powerpc/iommu/ioda2: Add get_table_size() to calculate the size of
> future table
>   powerpc/mmu: Add userspace-to-physical addresses translation cache
>   vfio: powerpc/spapr: Register memory and define IOMMU v2
>   vfio: powerpc/spapr: Support multiple groups in one container if
> possible
>   vfio: powerpc/spapr: Support Dynamic DMA windows
> 
>  Documentation/vfio.txt  |   50 +-
>  arch/powerpc/include/asm/iommu.h|  111 ++-
>  arch/powerpc/include/asm/machdep.h  |   25 -
>  arch/powerpc/include/asm/mmu-hash64.h   |3 +
>  arch/powerpc/include/asm/mmu_context.h  |   17 +
>  arch/powerpc/kernel/iommu.c |  336 +
>  arch/powerpc/kernel/vio.c   |5 +
>  arch/powerpc/mm/Makefile|1 +
>  arch/powerpc/mm/mmu_context_hash64.c|6 +
>  arch/powerpc/mm/mmu_context_hash64_iommu.c  |  215 ++
>  arch/powerpc/platforms/cell/iommu.c |8 +-
>  arch/powerpc/platforms/pasemi/iommu.c   |7 +-
>  arch/powerpc/platforms/powernv/pci-ioda.c   |  589 ---
>  arch/powerpc/platforms/powernv/pci-p5ioc2.c |   33 +-
>  arch/powerpc/platforms/powernv/pci.c|  116 ++-
>  arch/powerpc/platforms/powernv/pci.h|   12 +-
>  arch/powerpc/platforms/pseries/iommu.c  |   55 +-
>  arch/powerpc/sysdev/dart_iommu.c    |   12 +-
>  drivers/vfio/vfio_iommu_spapr_tce.c | 1021 
> ---
>  include/uapi/linux/vfio.h   |   88 ++-
>  20 files changed, 2218 insertions(+), 492 deletions(-)
>  create mode 100644 arch/powerpc/mm/mmu_context_hash64_iommu.c


There are still some issues that need to be addressed in arch code; I've
noted them in comments for patches 15 & 26.  I think I've run out of
issues for the vfio changes, so for the vfio related changes in patches
1-8,12-14,17,25,29-31:

Acked-by: Alex Williamson 

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH kernel v7 26/31] powerpc/iommu: Add userspace view of TCE table

2015-04-08 Thread Alex Williamson
On Thu, 2015-04-09 at 13:21 +1000, Alexey Kardashevskiy wrote:
> On 04/09/2015 01:43 AM, Alex Williamson wrote:
> > On Wed, 2015-04-08 at 13:22 +1000, Alexey Kardashevskiy wrote:
> >> On 04/03/2015 07:50 AM, Alex Williamson wrote:
> >>>
> >>> Should have sent this with the other comments, but found it hiding on my
> >>> desktop...
> >>>
> >>> On Sat, 2015-03-28 at 01:55 +1100, Alexey Kardashevskiy wrote:
> >>>> In order to support memory pre-registration, we need a way to track
> >>>> the use of every registered memory region and only allow unregistration
> >>>> if a region is not in use anymore. So we need a way to tell which
> >>>> region the just-cleared TCE came from.
> >>>>
> >>>> This adds a userspace view of the TCE table into iommu_table struct.
> >>>> It contains userspace address, one per TCE entry. The table is only
> >>>> allocated when the ownership over an IOMMU group is taken which means
> >>>> it is only used from outside of the powernv code (such as VFIO).
> >>>>
> >>>> Signed-off-by: Alexey Kardashevskiy 
> >>>> ---
> >>>>arch/powerpc/include/asm/iommu.h  |  6 ++
> >>>>arch/powerpc/kernel/iommu.c   |  7 +++
> >>>>arch/powerpc/platforms/powernv/pci-ioda.c | 23 ++-
> >>>>3 files changed, 35 insertions(+), 1 deletion(-)
> >>>>
> >>>> diff --git a/arch/powerpc/include/asm/iommu.h 
> >>>> b/arch/powerpc/include/asm/iommu.h
> >>>> index 2c08c91..a768a4d 100644
> >>>> --- a/arch/powerpc/include/asm/iommu.h
> >>>> +++ b/arch/powerpc/include/asm/iommu.h
> >>>> @@ -106,9 +106,15 @@ struct iommu_table {
> >>>>  unsigned long *it_map;   /* A simple allocation bitmap for 
> >>>> now */
> >>>>  unsigned long  it_page_shift;/* table iommu page size */
> >>>>  struct iommu_table_group *it_group;
> >>>> +unsigned long *it_userspace; /* userspace view of the table */
> >>>>  struct iommu_table_ops *it_ops;
> >>>>};
> >>>>
> >>>> +#define IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry) \
> >>>> +((tbl)->it_userspace ? \
> >>>> +&((tbl)->it_userspace[(entry) - 
> >>>> (tbl)->it_offset]) : \
> >>>> +NULL)
> >>>> +
> >>>>/* Pure 2^n version of get_order */
> >>>>static inline __attribute_const__
> >>>>int get_iommu_order(unsigned long size, struct iommu_table *tbl)
> >>>> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> >>>> index 0bcd988..82102d1 100644
> >>>> --- a/arch/powerpc/kernel/iommu.c
> >>>> +++ b/arch/powerpc/kernel/iommu.c
> >>>> @@ -38,6 +38,7 @@
> >>>>#include 
> >>>>#include 
> >>>>#include 
> >>>> +#include 
> >>>>#include 
> >>>>#include 
> >>>>#include 
> >>>> @@ -1069,6 +1070,9 @@ static int iommu_table_take_ownership(struct 
> >>>> iommu_table *tbl)
> >>>>  spin_unlock(&tbl->pools[i].lock);
> >>>>  spin_unlock_irqrestore(&tbl->large_pool.lock, flags);
> >>>>
> >>>> +BUG_ON(tbl->it_userspace);
> >>>> +tbl->it_userspace = vzalloc(sizeof(*tbl->it_userspace) * 
> >>>> tbl->it_size);
> >>>> +
> >>>
> >>> -ENOMEM?
> >>>
> >>>>  return 0;
> >>>>}
> >>>>
> >>>> @@ -1102,6 +1106,9 @@ static void iommu_table_release_ownership(struct 
> >>>> iommu_table *tbl)
> >>>>{
> >>>>  unsigned long flags, i, sz = (tbl->it_size + 7) >> 3;
> >>>>
> >>>> +vfree(tbl->it_userspace);
> >>>> +tbl->it_userspace = NULL;
> >>>> +
> >>>>  spin_lock_irqsave(&tbl->large_pool.lock, flags);
> >>>>  for (i = 0; i < tbl->nr_pools; i++)
> >>>>  spin_lock(&tbl->pools[i].lock);
> >>>> dif

Re: [PATCH kernel v9 02/32] Revert "powerpc/powernv: Allocate struct pnv_ioda_pe iommu_table dynamically"

2015-04-27 Thread Alex Williamson
On Sat, 2015-04-25 at 22:14 +1000, Alexey Kardashevskiy wrote:
> This reverts commit 9e8d4a19ab66ec9e132d405357b9108a4f26efd3 as
> tce32_table has exactly the same life time as the whole PE.

scripts/checkpatch.pl would like your commit reference to appear as:

commit 9e8d4a19ab66 ("powerpc/powernv: Allocate struct pnv_ioda_pe
iommu_table dynamically")

> 
> This makes use of a new iommu_reset_table() helper instead.
> 
> Signed-off-by: Alexey Kardashevskiy 
> ---
>  arch/powerpc/include/asm/iommu.h  |  3 ---
>  arch/powerpc/platforms/powernv/pci-ioda.c | 35 
> +--
>  arch/powerpc/platforms/powernv/pci.h  |  2 +-
>  3 files changed, 15 insertions(+), 25 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/iommu.h 
> b/arch/powerpc/include/asm/iommu.h
> index e2cef38..9d320e0 100644
> --- a/arch/powerpc/include/asm/iommu.h
> +++ b/arch/powerpc/include/asm/iommu.h
> @@ -79,9 +79,6 @@ struct iommu_table {
>   struct iommu_group *it_group;
>  #endif
>   void (*set_bypass)(struct iommu_table *tbl, bool enable);
> -#ifdef CONFIG_PPC_POWERNV
> - void   *data;
> -#endif
>  };
>  
>  /* Pure 2^n version of get_order */
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
> b/arch/powerpc/platforms/powernv/pci-ioda.c
> index 920c252..eff26ed 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> @@ -1086,10 +1086,6 @@ static void pnv_ioda_setup_bus_PE(struct pci_bus *bus, 
> int all)
>   return;
>   }
>  
> - pe->tce32_table = kzalloc_node(sizeof(struct iommu_table),
> - GFP_KERNEL, hose->node);
> - pe->tce32_table->data = pe;
> -
>   /* Associate it with all child devices */
>   pnv_ioda_setup_same_PE(bus, pe);
>  
> @@ -1295,7 +1291,7 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev 
> *dev, struct pnv_ioda_pe
>   bus = dev->bus;
>   hose = pci_bus_to_host(bus);
>   phb = hose->private_data;
> - tbl = pe->tce32_table;
> + tbl = &pe->tce32_table;
>   addr = tbl->it_base;
>  
>   opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
> @@ -1310,9 +1306,8 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev 
> *dev, struct pnv_ioda_pe
>   if (rc)
>   pe_warn(pe, "OPAL error %ld release DMA window\n", rc);
>  
> - iommu_free_table(tbl, of_node_full_name(dev->dev.of_node));
> + iommu_reset_table(tbl, of_node_full_name(dev->dev.of_node));
>   free_pages(addr, get_order(TCE32_TABLE_SIZE));
> - pe->tce32_table = NULL;
>  }
>  
>  static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs)
> @@ -1460,10 +1455,6 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, 
> u16 num_vfs)
>   continue;
>   }
>  
> - pe->tce32_table = kzalloc_node(sizeof(struct iommu_table),
> - GFP_KERNEL, hose->node);
> - pe->tce32_table->data = pe;
> -
>   /* Put PE to the list */
>   mutex_lock(&phb->ioda.pe_list_mutex);
>   list_add_tail(&pe->list, &phb->ioda.pe_list);
> @@ -1598,7 +1589,7 @@ static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb 
> *phb, struct pci_dev *pdev
>  
>   pe = &phb->ioda.pe_array[pdn->pe_number];
>   WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops);
> - set_iommu_table_base_and_group(&pdev->dev, pe->tce32_table);
> + set_iommu_table_base_and_group(&pdev->dev, &pe->tce32_table);
>  }
>  
>  static int pnv_pci_ioda_dma_set_mask(struct pnv_phb *phb,
> @@ -1625,7 +1616,7 @@ static int pnv_pci_ioda_dma_set_mask(struct pnv_phb 
> *phb,
>   } else {
>   dev_info(&pdev->dev, "Using 32-bit DMA via iommu\n");
>   set_dma_ops(&pdev->dev, &dma_iommu_ops);
> - set_iommu_table_base(&pdev->dev, pe->tce32_table);
> + set_iommu_table_base(&pdev->dev, &pe->tce32_table);
>   }
>   *pdev->dev.dma_mask = dma_mask;
>   return 0;
> @@ -1662,9 +1653,9 @@ static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe 
> *pe,
>   list_for_each_entry(dev, &bus->devices, bus_list) {
>   if (add_to_iommu_group)
>   set_iommu_table_base_and_group(&dev->dev,
> -pe->tce32_table);
> +&pe->tce32_table);
>   else
> - set_iommu_table_base(&dev->dev, pe->tce32_table);
> + set_iommu_table_base(&dev->dev, &pe->tce32_table);
>  
>   if (dev->subordinate)
>   pnv_ioda_setup_bus_dma(pe, dev->subordinate,
> @@ -1754,7 +1745,8 @@ static void pnv_pci_ioda2_tce_invalidate(struct 
> pnv_ioda_pe *pe,
>  void pnv_pci_ioda_tce_invalidate(struct iommu_table *tbl,
>__be64 *startp, __be64 *endp, bool rm)
>  {
> - struct pnv_ioda_pe *pe = tbl->data;
> + struct p

Re: [PATCH kernel v9 16/32] powerpc/powernv/ioda: Move TCE kill register address to PE

2015-04-27 Thread Alex Williamson
On Sat, 2015-04-25 at 22:14 +1000, Alexey Kardashevskiy wrote:
> At the moment the DMA setup code looks for the "ibm,opal-tce-kill" property
> which contains the TCE kill register address. Writes to this register
> invalidate the TCE cache on the IODA/IODA2 hub.
> 
> This moves the register address from iommu_table to pnv_ioda_pe as
> later there will be 2 tables per PE and it will be used for both tables.
> 
> This moves the property reading/remapping code to a helper to reduce
> code duplication.
> 
> This adds a new pnv_pci_ioda2_tvt_invalidate() helper which invalidates
> the entire table. It should be called after every call to
> opal_pci_map_pe_dma_window(). It was not required before because
> there is just a single TCE table and 64bit DMA is handled via bypass
> window (which has no table so no cache is used) but this is going
> to change with Dynamic DMA windows (DDW).
> 
> Signed-off-by: Alexey Kardashevskiy 
> ---
> Changes:
> v9:
> * new in the series
> ---
>  arch/powerpc/platforms/powernv/pci-ioda.c | 69 
> +++
>  arch/powerpc/platforms/powernv/pci.h  |  1 +
>  2 files changed, 44 insertions(+), 26 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
> b/arch/powerpc/platforms/powernv/pci-ioda.c
> index f070c44..b22b3ca 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> @@ -1672,7 +1672,7 @@ static void pnv_pci_ioda1_tce_invalidate(struct 
> iommu_table *tbl,
>   struct pnv_ioda_pe, table_group);
>   __be64 __iomem *invalidate = rm ?
>   (__be64 __iomem *)pe->tce_inval_reg_phys :
> - (__be64 __iomem *)tbl->it_index;
> + pe->tce_inval_reg;
>   unsigned long start, end, inc;
>   const unsigned shift = tbl->it_page_shift;
>  
> @@ -1743,6 +1743,18 @@ static struct iommu_table_ops pnv_ioda1_iommu_ops = {
>   .get = pnv_tce_get,
>  };
>  
> +static inline void pnv_pci_ioda2_tvt_invalidate(struct pnv_ioda_pe *pe)
> +{
> + /* 01xb - invalidate TCEs that match the specified PE# */
> + unsigned long addr = (0x4ull << 60) | (pe->pe_number & 0xFF);
> +
> + if (!pe->tce_inval_reg)
> + return;
> +
> +mb(); /* Ensure above stores are visible */


ERROR: code indent should use tabs where possible


> + __raw_writeq(cpu_to_be64(addr), pe->tce_inval_reg);
> +}
> +
>  static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
>   unsigned long index, unsigned long npages, bool rm)
>  {
> @@ -1751,7 +1763,7 @@ static void pnv_pci_ioda2_tce_invalidate(struct 
> iommu_table *tbl,
>   unsigned long start, end, inc;
>   __be64 __iomem *invalidate = rm ?
>   (__be64 __iomem *)pe->tce_inval_reg_phys :
> - (__be64 __iomem *)tbl->it_index;
> + pe->tce_inval_reg;
>   const unsigned shift = tbl->it_page_shift;
>  
>   /* We'll invalidate DMA address in PE scope */
> @@ -1803,13 +1815,31 @@ static struct iommu_table_ops pnv_ioda2_iommu_ops = {
>   .get = pnv_tce_get,
>  };
>  
> +static void pnv_pci_ioda_setup_opal_tce_kill(struct pnv_phb *phb,
> + struct pnv_ioda_pe *pe)
> +{
> + const __be64 *swinvp;
> +
> + /* OPAL variant of PHB3 invalidated TCEs */
> + swinvp = of_get_property(phb->hose->dn, "ibm,opal-tce-kill", NULL);
> + if (!swinvp)
> + return;
> +
> + /* We need a couple more fields -- an address and a data
> +  * to or.  Since the bus is only printed out on table free
> +  * errors, and on the first pass the data will be a relative
> +  * bus number, print that out instead.
> +  */
> + pe->tce_inval_reg_phys = be64_to_cpup(swinvp);
> + pe->tce_inval_reg = ioremap(pe->tce_inval_reg_phys, 8);
> +}
> +
>  static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
> struct pnv_ioda_pe *pe, unsigned int base,
> unsigned int segs)
>  {
>  
>   struct page *tce_mem = NULL;
> - const __be64 *swinvp;
>   struct iommu_table *tbl;
>   unsigned int i;
>   int64_t rc;
> @@ -1823,6 +1853,8 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb 
> *phb,
>   if (WARN_ON(pe->tce32_seg >= 0))
>   return;
>  
> + pnv_pci_ioda_setup_opal_tce_kill(phb, pe);
> +
>   /* Grab a 32-bit TCE table */
>   pe->tce32_seg = base;
>   pe_info(pe, " Setting up 32-bit TCE table at %08x..%08x\n",
> @@ -1865,20 +1897,11 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb 
> *phb,
> base << 28, IOMMU_PAGE_SHIFT_4K);
>  
>   /* OPAL variant of P7IOC SW invalidated TCEs */
> - swinvp = of_get_property(phb->hose->dn, "ibm,opal-tce-kill", NULL);
> - if (swinvp) {
> - /* We need a couple more fields -- an address and a data
> -  * to or.  Since the bus is only printed out on table free
> -  * errors

Re: [PATCH kernel v9 30/32] vfio: powerpc/spapr: Use 32bit DMA window properties from table_group

2015-04-27 Thread Alex Williamson
On Sat, 2015-04-25 at 22:14 +1000, Alexey Kardashevskiy wrote:
> A table group might not have a table but it always has the default 32bit
> window parameters so use these.
> 
> No change in behavior is expected.
> 
> Signed-off-by: Alexey Kardashevskiy 
> ---
> Changes:
> v9:
> * new in the series - to make the next patch simpler
> ---
>  drivers/vfio/vfio_iommu_spapr_tce.c | 19 +++
>  1 file changed, 11 insertions(+), 8 deletions(-)


Acked-by: Alex Williamson 


> diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
> b/drivers/vfio/vfio_iommu_spapr_tce.c
> index 4cfc2c1..a7d6729 100644
> --- a/drivers/vfio/vfio_iommu_spapr_tce.c
> +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
> @@ -185,7 +185,6 @@ static int tce_iommu_enable(struct tce_container 
> *container)
>  {
>   int ret = 0;
>   unsigned long locked;
> - struct iommu_table *tbl;
>   struct iommu_table_group *table_group;
>  
>   if (!container->grp)
> @@ -221,13 +220,19 @@ static int tce_iommu_enable(struct tce_container 
> *container)
>* this is that we cannot tell here the amount of RAM used by the guest
>* as this information is only available from KVM and VFIO is
>* KVM agnostic.
> +  *
> +  * So we do not allow enabling a container without a group attached
> +  * as there is no way to know how much we should increment
> +  * the locked_vm counter.
>*/
>   table_group = iommu_group_get_iommudata(container->grp);
>   if (!table_group)
>   return -ENODEV;
>  
> - tbl = &table_group->tables[0];
> - locked = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT;
> + if (!table_group->tce32_size)
> + return -EPERM;
> +
> + locked = table_group->tce32_size >> PAGE_SHIFT;
>   ret = try_increment_locked_vm(locked);
>   if (ret)
>   return ret;
> @@ -504,7 +509,6 @@ static long tce_iommu_ioctl(void *iommu_data,
>  
>   case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
>   struct vfio_iommu_spapr_tce_info info;
> - struct iommu_table *tbl;
>   struct iommu_table_group *table_group;
>  
>   if (WARN_ON(!container->grp))
> @@ -512,8 +516,7 @@ static long tce_iommu_ioctl(void *iommu_data,
>  
>   table_group = iommu_group_get_iommudata(container->grp);
>  
> - tbl = &table_group->tables[0];
> - if (WARN_ON_ONCE(!tbl))
> + if (!table_group)
>   return -ENXIO;
>  
>   minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
> @@ -525,8 +528,8 @@ static long tce_iommu_ioctl(void *iommu_data,
>   if (info.argsz < minsz)
>   return -EINVAL;
>  
> - info.dma32_window_start = tbl->it_offset << tbl->it_page_shift;
> - info.dma32_window_size = tbl->it_size << tbl->it_page_shift;
> + info.dma32_window_start = table_group->tce32_start;
> + info.dma32_window_size = table_group->tce32_size;
>   info.flags = 0;
>  
>   if (copy_to_user((void __user *)arg, &info, minsz))



___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH kernel v10 16/34] powerpc/spapr: vfio: Replace iommu_table with iommu_table_group

2015-05-13 Thread Alex Williamson
On Tue, 2015-05-12 at 01:39 +1000, Alexey Kardashevskiy wrote:
> Modern IBM POWERPC systems support multiple (currently two) TCE tables
> per IOMMU group (a.k.a. PE). This adds an iommu_table_group container
> for TCE tables. Right now just one table is supported.
> 
> This defines iommu_table_group struct which stores pointers to
> iommu_group and iommu_table(s). This replaces iommu_table with
> iommu_table_group where iommu_table was used to identify a group:
> - iommu_register_group();
> - iommudata of generic iommu_group;
> 
> This removes @data from iommu_table as it_table_group provides
> same access to pnv_ioda_pe.
> 
> For IODA, instead of embedding iommu_table, the new iommu_table_group
> keeps pointers to those. The iommu_table structs are allocated
> dynamically.
> 
> For P5IOC2, both iommu_table_group and iommu_table are embedded into
> PE struct. As there is no EEH and SRIOV support for P5IOC2,
> iommu_free_table() should not be called on iommu_table struct pointers
> so we can keep it embedded in pnv_phb::p5ioc2.
> 
> For pSeries, this replaces multiple calls of kzalloc_node() with a new
> iommu_pseries_alloc_group() helper and stores the table group struct
> pointer into the pci_dn struct. For release, a iommu_table_free_group()
> helper is added.
> 
> This moves iommu_table struct allocation from SR-IOV code to
> the generic DMA initialization code in pnv_pci_ioda_setup_dma_pe and
> pnv_pci_ioda2_setup_dma_pe as this is where DMA is actually initialized.
> This change is here because those lines had to be changed anyway.
> 
> This should cause no behavioural change.
> 
> Signed-off-by: Alexey Kardashevskiy 
> ---
> Changes:
> v10:
> * new to the series, separated from
> "powerpc/spapr: vfio: Switch from iommu_table to new iommu_table_group"
> * iommu_table is not embedded into iommu_table_group but allocated
> dynamically in most cases
> * iommu_table allocation is moved to a single place for IODA2's
> pnv_pci_ioda_setup_dma_pe where it belongs to
> * added list of groups into iommu_table; most of the code just looks at
> the first item to keep the patch simpler
> ---
>  arch/powerpc/include/asm/iommu.h|  17 +++--
>  arch/powerpc/include/asm/pci-bridge.h   |   2 +-
>  arch/powerpc/kernel/iommu.c |  17 ++---
>  arch/powerpc/platforms/powernv/pci-ioda.c   |  55 +++---
>  arch/powerpc/platforms/powernv/pci-p5ioc2.c |  18 +++--
>  arch/powerpc/platforms/powernv/pci.h|   3 +-
>  arch/powerpc/platforms/pseries/iommu.c  | 107 
> +++-
>  drivers/vfio/vfio_iommu_spapr_tce.c |  23 +++---


For vfio:

Acked-by: Alex Williamson 


>  8 files changed, 152 insertions(+), 90 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/iommu.h 
> b/arch/powerpc/include/asm/iommu.h
> index e2a45c3..61bde1a 100644
> --- a/arch/powerpc/include/asm/iommu.h
> +++ b/arch/powerpc/include/asm/iommu.h
> @@ -92,13 +92,10 @@ struct iommu_table {
>   unsigned long *it_map;   /* A simple allocation bitmap for now */
>   unsigned long  it_page_shift;/* table iommu page size */
>  #ifdef CONFIG_IOMMU_API
> - struct iommu_group *it_group;
> + struct iommu_table_group *it_table_group;
>  #endif
>   struct iommu_table_ops *it_ops;
>   void (*set_bypass)(struct iommu_table *tbl, bool enable);
> -#ifdef CONFIG_PPC_POWERNV
> - void   *data;
> -#endif
>  };
>  
>  /* Pure 2^n version of get_order */
> @@ -130,13 +127,21 @@ extern void iommu_free_table(struct iommu_table *tbl, 
> const char *node_name);
>  extern struct iommu_table *iommu_init_table(struct iommu_table * tbl,
>   int nid);
>  #ifdef CONFIG_IOMMU_API
> -extern void iommu_register_group(struct iommu_table *tbl,
> +
> +#define IOMMU_TABLE_GROUP_MAX_TABLES 1
> +
> +struct iommu_table_group {
> + struct iommu_group *group;
> + struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
> +};
> +
> +extern void iommu_register_group(struct iommu_table_group *table_group,
>int pci_domain_number, unsigned long pe_num);
>  extern int iommu_add_device(struct device *dev);
>  extern void iommu_del_device(struct device *dev);
>  extern int __init tce_iommu_bus_notifier_init(void);
>  #else
> -static inline void iommu_register_group(struct iommu_table *tbl,
> +static inline void iommu_register_group(struct iommu_table_group 
> *table_group,
>   int pci_domain_number,
>   unsigned long pe_num)
>  {
> diff --git a/arch/powerpc/include/asm/pci-bridge.h 
> b/arch/powerpc/include/asm/pci

Re: [PATCH kernel v10 28/34] vfio: powerpc/spapr: powerpc/powernv/ioda: Define and implement DMA windows API

2015-05-13 Thread Alex Williamson
On Tue, 2015-05-12 at 01:39 +1000, Alexey Kardashevskiy wrote:
> This extends iommu_table_group_ops by a set of callbacks to support
> dynamic DMA windows management.
> 
> create_table() creates a TCE table with specific parameters.
> it receives iommu_table_group to know nodeid in order to allocate
> TCE table memory closer to the PHB. The exact format of allocated
> multi-level table might be also specific to the PHB model (not
> the case now though).
> This callback calculates the DMA window offset on a PCI bus from @num
> and stores it in a just created table.
> 
> set_window() sets the window at specified TVT index + @num on PHB.
> 
> unset_window() unsets the window from specified TVT.
> 
> This adds a free() callback to iommu_table_ops to free the memory
> (potentially a tree of tables) allocated for the TCE table.
> 
> create_table() and free() are supposed to be called once per
> VFIO container and set_window()/unset_window() are supposed to be
> called for every group in a container.
> 
> This adds IOMMU capabilities to iommu_table_group such as default
> 32bit window parameters and others. This makes use of new values in
> vfio_iommu_spapr_tce. IODA1/P5IOC2 do not support DDW so they do not
> advertise pagemasks to the userspace.
> 
> Signed-off-by: Alexey Kardashevskiy 
> ---
> Changes:
> v10:
> * squashed "vfio: powerpc/spapr: Use 32bit DMA window properties from 
> table_group"
> into this
> * shortened the subject
> 
> v9:
> * new in the series - to make the next patch simpler
> ---
>  arch/powerpc/include/asm/iommu.h| 19 ++
>  arch/powerpc/platforms/powernv/pci-ioda.c   | 96 
> ++---
>  arch/powerpc/platforms/powernv/pci-p5ioc2.c |  7 ++-
>  drivers/vfio/vfio_iommu_spapr_tce.c | 19 +++---
>  4 files changed, 124 insertions(+), 17 deletions(-)

For vfio:

Acked-by: Alex Williamson 

> 
> diff --git a/arch/powerpc/include/asm/iommu.h 
> b/arch/powerpc/include/asm/iommu.h
> index a902159..2c41115 100644
> --- a/arch/powerpc/include/asm/iommu.h
> +++ b/arch/powerpc/include/asm/iommu.h
> @@ -70,6 +70,7 @@ struct iommu_table_ops {
>   /* get() returns a physical address */
>   unsigned long (*get)(struct iommu_table *tbl, long index);
>   void (*flush)(struct iommu_table *tbl);
> + void (*free)(struct iommu_table *tbl);
>  };
>  
>  /* These are used by VIO */
> @@ -150,6 +151,17 @@ extern struct iommu_table *iommu_init_table(struct 
> iommu_table * tbl,
>  struct iommu_table_group;
>  
>  struct iommu_table_group_ops {
> + long (*create_table)(struct iommu_table_group *table_group,
> + int num,
> + __u32 page_shift,
> + __u64 window_size,
> + __u32 levels,
> + struct iommu_table **ptbl);
> + long (*set_window)(struct iommu_table_group *table_group,
> + int num,
> + struct iommu_table *tblnew);
> + long (*unset_window)(struct iommu_table_group *table_group,
> + int num);
>   /* Switch ownership from platform code to external user (e.g. VFIO) */
>   void (*take_ownership)(struct iommu_table_group *table_group);
>   /* Switch ownership from external user (e.g. VFIO) back to core */
> @@ -163,6 +175,13 @@ struct iommu_table_group_link {
>  };
>  
>  struct iommu_table_group {
> + /* IOMMU properties */
> + __u32 tce32_start;
> + __u32 tce32_size;
> + __u64 pgsizes; /* Bitmap of supported page sizes */
> + __u32 max_dynamic_windows_supported;
> + __u32 max_levels;
> +
>   struct iommu_group *group;
>   struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
>   struct iommu_table_group_ops *ops;
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
> b/arch/powerpc/platforms/powernv/pci-ioda.c
> index d2a1dcd..c1d1aef 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> @@ -25,6 +25,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  #include 
>  #include 
> @@ -1867,6 +1868,12 @@ static void pnv_ioda2_tce_free(struct iommu_table 
> *tbl, long index,
>   pnv_pci_ioda2_tce_invalidate(tbl, index, npages, false);
>  }
>  
> +static void pnv_ioda2_table_free(struct iommu_table *tbl)
> +{
> + pnv_pci_ioda2_table_free_pages(tbl);
> + iommu_free_table(tbl, "pnv");
> +}
> +
>  static struct iommu_table_ops pnv_ioda2_iommu_ops = {
>   .set = pnv_ioda2_tce_build,
>  #ifdef CONFIG_IOMMU_API
> @@ -1874,6 +1881,7 @@ static struct iommu_table_ops

Re: [PATCH kernel v10 33/34] vfio: powerpc/spapr: Register memory and define IOMMU v2

2015-05-13 Thread Alex Williamson
On Tue, 2015-05-12 at 01:39 +1000, Alexey Kardashevskiy wrote:
> The existing implementation accounts the whole DMA window in
> the locked_vm counter. This is going to be worse with multiple
> containers and huge DMA windows. Also, real-time accounting would require
> additional tracking of accounted pages due to the page size difference -
> IOMMU uses 4K pages and system uses 4K or 64K pages.
> 
> Another issue is that actual pages pinning/unpinning happens on every
> DMA map/unmap request. This does not affect the performance much now as
> we spend way too much time now on switching context between
> guest/userspace/host but this will start to matter when we add in-kernel
> DMA map/unmap acceleration.
> 
> This introduces a new IOMMU type for SPAPR - VFIO_SPAPR_TCE_v2_IOMMU.
> New IOMMU deprecates VFIO_IOMMU_ENABLE/VFIO_IOMMU_DISABLE and introduces
> 2 new ioctls to register/unregister DMA memory -
> VFIO_IOMMU_SPAPR_REGISTER_MEMORY and VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY -
> which receive user space address and size of a memory region which
> needs to be pinned/unpinned and counted in locked_vm.
> New IOMMU splits physical pages pinning and TCE table update
> into 2 different operations. It requires:
> 1) guest pages to be registered first
> 2) consequent map/unmap requests to work only with pre-registered memory.
> For the default single window case this means that the entire guest
> (instead of 2GB) needs to be pinned before using VFIO.
> When a huge DMA window is added, no additional pinning will be
> required, otherwise it would be guest RAM + 2GB.
> 
> The new memory registration ioctls are not supported by
> VFIO_SPAPR_TCE_IOMMU. Dynamic DMA window and in-kernel acceleration
> will require memory to be preregistered in order to work.
> 
> The accounting is done per the user process.
> 
> This advertises v2 SPAPR TCE IOMMU and restricts what the userspace
> can do with v1 or v2 IOMMUs.
> 
> In order to support memory pre-registration, we need a way to track
> the use of every registered memory region and only allow unregistration
> if a region is not in use anymore. So we need a way to tell from what
> region the just cleared TCE was from.
> 
> This adds a userspace view of the TCE table into iommu_table struct.
> It contains userspace address, one per TCE entry. The table is only
> allocated when the ownership over an IOMMU group is taken which means
> it is only used from outside of the powernv code (such as VFIO).
> 
> Signed-off-by: Alexey Kardashevskiy 
> [aw: for the vfio related changes]
> Acked-by: Alex Williamson 
> ---
> 
> Alex, should I remove your "acked-by" in the cases like this and
> get another one?


Generally if it's more than a trivial change, you'll want fresh acks.

> ---
> Changes:
> v10:
> * moved it_userspace allocation to vfio_iommu_spapr_tce as it is a VFIO
> specific thing
> * squashed "powerpc/iommu: Add userspace view of TCE table" into this as
> it is
> a part of IOMMU v2
> * s/tce_iommu_use_page_v2/tce_iommu_prereg_ua_to_hpa/
> * fixed some function names to have "tce_iommu_" in the beginning rather
> just "tce_"
> * as mm_iommu_mapped_inc() can now fail, check for the return code
> 
> v9:
> * s/tce_get_hva_cached/tce_iommu_use_page_v2/
> 
> v7:
> * now memory is registered per mm (i.e. process)
> * moved memory registration code to powerpc/mmu
> * merged "vfio: powerpc/spapr: Define v2 IOMMU" into this
> * limited new ioctls to v2 IOMMU
> * updated doc
> * unsupported ioctls return -ENOTTY instead of -EPERM
> 
> v6:
> * tce_get_hva_cached() returns hva via a pointer
> 
> v4:
> * updated docs
> * s/kzmalloc/vzalloc/
> * in tce_pin_pages()/tce_unpin_pages() removed @vaddr, @size and
> replaced offset with index
> * renamed vfio_iommu_type_register_memory to vfio_iommu_spapr_register_memory
> and removed duplicating vfio_iommu_spapr_register_memory
> ---
>  Documentation/vfio.txt  |  31 ++-
>  arch/powerpc/include/asm/iommu.h|   6 +
>  drivers/vfio/vfio_iommu_spapr_tce.c | 516 
> ++--
>  include/uapi/linux/vfio.h   |  27 ++
>  4 files changed, 494 insertions(+), 86 deletions(-)
> 
> diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
> index 96978ec..7dcf2b5 100644
> --- a/Documentation/vfio.txt
> +++ b/Documentation/vfio.txt
> @@ -289,10 +289,12 @@ PPC64 sPAPR implementation note
>  
>  This implementation has some specifics:
>  
> -1) Only one IOMMU group per container is supported as an IOMMU group
> -represents the minimal entity which isolation can be guaranteed for and
> -groups are allocated statically, one per a Partitionable

Re: [PATCH kernel v10 05/34] powerpc/iommu: Always release iommu_table in iommu_free_table()

2015-05-13 Thread Alex Williamson
On Thu, 2015-05-14 at 12:34 +1000, Alexey Kardashevskiy wrote:
> On 05/14/2015 09:27 AM, Gavin Shan wrote:
> > On Wed, May 13, 2015 at 02:51:36PM +0200, Thomas Huth wrote:
> >> On Wed, 13 May 2015 16:30:16 +1000
> >> Alexey Kardashevskiy  wrote:
> >>
> >>> On 05/13/2015 03:33 PM, Gavin Shan wrote:
>  On Tue, May 12, 2015 at 01:38:54AM +1000, Alexey Kardashevskiy wrote:
> > At the moment iommu_free_table() only releases memory if
> > the table was initialized for the platform code use, i.e. it had
> > > it_map initialized (whose purpose is to track DMA memory space use).
> >
> > With dynamic DMA windows, we will need to be able to release
> > iommu_table even if it was used for VFIO in which case it_map is NULL
> > so does the patch.
> >
> > Signed-off-by: Alexey Kardashevskiy 
> 
>  Reviewed-by: Gavin Shan 
> 
> > ---
> > arch/powerpc/kernel/iommu.c | 3 +--
> > 1 file changed, 1 insertion(+), 2 deletions(-)
> >
> > diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> > index 3d47eb3..2c02d4c 100644
> > --- a/arch/powerpc/kernel/iommu.c
> > +++ b/arch/powerpc/kernel/iommu.c
> > @@ -714,8 +714,7 @@ void iommu_free_table(struct iommu_table *tbl, 
> > const char *node_name)
> > unsigned int order;
> >
> > if (!tbl || !tbl->it_map) {
> > -   printk(KERN_ERR "%s: expected TCE map for %s\n", 
> > __func__,
> > -   node_name);
> > +   kfree(tbl);
> 
>  I'm not sure if the "tbl" needs to be checked against NULL as kfree() 
>  already
>  has the check. But it looks a bit strange to free NULL "tbl" from the 
>  code
>  itself.
> >>>
> >>> Yeah, looks a bit weird, agree, I'll change but in general kfree/vfree/...
> >>> - they all check the passed pointer for NULL.
> >>
> >> But if tbl is NULL, the tbl->it_map check will fail, won't it? So in
> >> this case, I think you have to keep it.
> >>
> >
> > If I understood your question correctly, "tbl->it_map" won't be checked
> > when "tbl" is NULL because the connection ("||") for the two conditions.
> > The code can be changed to something like below if Alexey want:
> >
> > if (!tbl)
> > return;
> > if (!tbl->itmap)
> > kfree(tbl);
> 
> To be precise ;)
> 
> if (!tbl->itmap) {
>   kfree(tbl);
>   return;
> }

I hope that's not your solution, it clearly segfaults with a null
pointer de-ref if !tbl, which is apparently a concern down this path.


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v2 03/13] powerpc/spapr: vfio: Implement spapr_tce_iommu_ops

2014-09-23 Thread Alex Williamson
On Tue, 2014-09-23 at 13:00 +1000, Alexey Kardashevskiy wrote:
> Modern IBM POWERPC systems support multiple IOMMU tables per PE
> so we need a more reliable way (compared to container_of()) to get
> a PE pointer from the iommu_table struct pointer used in IOMMU functions.
> 
> At the moment IOMMU group data points to an iommu_table struct. This
> introduces a spapr_tce_iommu_group struct which keeps an iommu_owner
> and a spapr_tce_iommu_ops struct. For IODA, iommu_owner is a pointer to
> the pnv_ioda_pe struct, for others it is still a pointer to
> the iommu_table struct. The ops structs correspond to the type which
> iommu_owner points to.
> 
> This defines a get_table() callback which returns an iommu_table
> by its number.
> 
> As the IOMMU group data pointer points to variable type instead of
> iommu_table, VFIO SPAPR TCE driver is updated to use the new type.
> This changes the tce_container struct to store iommu_group instead of
> iommu_table.
> 
> So, it was:
> - iommu_table points to iommu_group via iommu_table::it_group;
> - iommu_group points to iommu_table via iommu_group_get_iommudata();
> 
> now it is:
> - iommu_table points to iommu_group via iommu_table::it_group;
> - iommu_group points to spapr_tce_iommu_group via
> iommu_group_get_iommudata();
> - spapr_tce_iommu_group points to either (depending on .get_table()):
>   - iommu_table;
>   - pnv_ioda_pe;
> 
> This uses pnv_ioda1_iommu_get_table for both IODA1&2 but IODA2 will
> have own pnv_ioda2_iommu_get_table soon and pnv_ioda1_iommu_get_table
> will only be used for IODA1.
> 
> Signed-off-by: Alexey Kardashevskiy 
> ---
>  arch/powerpc/include/asm/iommu.h|   6 ++
>  arch/powerpc/include/asm/tce.h  |  13 +++
>  arch/powerpc/kernel/iommu.c |  35 ++-
>  arch/powerpc/platforms/powernv/pci-ioda.c   |  31 +-
>  arch/powerpc/platforms/powernv/pci-p5ioc2.c |   1 +
>  arch/powerpc/platforms/powernv/pci.c|   2 +-
>  arch/powerpc/platforms/pseries/iommu.c  |  10 +-
>  drivers/vfio/vfio_iommu_spapr_tce.c | 148 
> ++--
>  8 files changed, 208 insertions(+), 38 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/iommu.h 
> b/arch/powerpc/include/asm/iommu.h
> index 42632c7..84ee339 100644
> --- a/arch/powerpc/include/asm/iommu.h
> +++ b/arch/powerpc/include/asm/iommu.h
> @@ -108,13 +108,19 @@ extern void iommu_free_table(struct iommu_table *tbl, 
> const char *node_name);
>   */
>  extern struct iommu_table *iommu_init_table(struct iommu_table * tbl,
>   int nid);
> +
> +struct spapr_tce_iommu_ops;
>  #ifdef CONFIG_IOMMU_API
>  extern void iommu_register_group(struct iommu_table *tbl,
> +  void *iommu_owner,
> +  struct spapr_tce_iommu_ops *ops,
>int pci_domain_number, unsigned long pe_num);
>  extern int iommu_add_device(struct device *dev);
>  extern void iommu_del_device(struct device *dev);
>  #else
>  static inline void iommu_register_group(struct iommu_table *tbl,
> + void *iommu_owner,
> + struct spapr_tce_iommu_ops *ops,
>   int pci_domain_number,
>   unsigned long pe_num)
>  {
> diff --git a/arch/powerpc/include/asm/tce.h b/arch/powerpc/include/asm/tce.h
> index 743f36b..9f159eb 100644
> --- a/arch/powerpc/include/asm/tce.h
> +++ b/arch/powerpc/include/asm/tce.h
> @@ -50,5 +50,18 @@
>  #define TCE_PCI_READ 0x1 /* read from PCI allowed */
>  #define TCE_VB_WRITE 0x1 /* write from VB allowed */
>  
> +struct spapr_tce_iommu_group;
> +
> +struct spapr_tce_iommu_ops {
> + struct iommu_table *(*get_table)(
> + struct spapr_tce_iommu_group *data,
> + int num);
> +};
> +
> +struct spapr_tce_iommu_group {
> + void *iommu_owner;
> + struct spapr_tce_iommu_ops *ops;
> +};
> +
>  #endif /* __KERNEL__ */
>  #endif /* _ASM_POWERPC_TCE_H */
> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> index b378f78..1c5dae7 100644
> --- a/arch/powerpc/kernel/iommu.c
> +++ b/arch/powerpc/kernel/iommu.c
> @@ -878,24 +878,53 @@ void iommu_free_coherent(struct iommu_table *tbl, 
> size_t size,
>   */
>  static void group_release(void *iommu_data)
>  {
> - struct iommu_table *tbl = iommu_data;
> - tbl->it_group = NULL;
> + kfree(iommu_data);
>  }
>  
> +static struct iommu_table *spapr_tce_default_get_table(
> + struct spapr_tce_iommu_group *data, int num)
> +{
> + struct iommu_table *tbl = data->iommu_owner;
> +
> + switch (num) {
> + case 0:
> + if (tbl->it_size)
> + return tbl;
> + /* fallthru */
> + default:
> + return NULL;
> + }
> +}
> +
> +static struct spapr_tce_iommu_ops spapr_tce_

Re: [PATCH v2 04/13] powerpc/powernv: Convert/move set_bypass() callback to take_ownership()

2014-09-23 Thread Alex Williamson
On Tue, 2014-09-23 at 13:00 +1000, Alexey Kardashevskiy wrote:
> At the moment the iommu_table struct has a set_bypass() which enables/
> disables DMA bypass on IODA2 PHB. This is exposed to POWERPC IOMMU code
> which calls this callback when external IOMMU users such as VFIO are
> about to get over a PHB.
> 
> Since the set_bypass() is not really an iommu_table function but PE's
> function, and we have an ops struct per IOMMU owner, let's move
> set_bypass() to the spapr_tce_iommu_ops struct.
> 
> As arch/powerpc/kernel/iommu.c is more about POWERPC IOMMU tables and
> has very little to do with PEs, this moves take_ownership() calls to
> the VFIO SPAPR TCE driver.
> 
> This renames set_bypass() to take_ownership() as it is not necessarily
> just enabling bypassing, it can be something else/more so let's give it
> a generic name. The bool parameter is inverted.
> 
> Signed-off-by: Alexey Kardashevskiy 
> Reviewed-by: Gavin Shan 
> ---
>  arch/powerpc/include/asm/iommu.h  |  1 -
>  arch/powerpc/include/asm/tce.h|  2 ++
>  arch/powerpc/kernel/iommu.c   | 12 
>  arch/powerpc/platforms/powernv/pci-ioda.c | 20 
>  drivers/vfio/vfio_iommu_spapr_tce.c   | 16 
>  5 files changed, 30 insertions(+), 21 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/iommu.h 
> b/arch/powerpc/include/asm/iommu.h
> index 84ee339..2b0b01d 100644
> --- a/arch/powerpc/include/asm/iommu.h
> +++ b/arch/powerpc/include/asm/iommu.h
> @@ -77,7 +77,6 @@ struct iommu_table {
>  #ifdef CONFIG_IOMMU_API
>   struct iommu_group *it_group;
>  #endif
> - void (*set_bypass)(struct iommu_table *tbl, bool enable);
>  };
>  
>  /* Pure 2^n version of get_order */
> diff --git a/arch/powerpc/include/asm/tce.h b/arch/powerpc/include/asm/tce.h
> index 9f159eb..e6355f9 100644
> --- a/arch/powerpc/include/asm/tce.h
> +++ b/arch/powerpc/include/asm/tce.h
> @@ -56,6 +56,8 @@ struct spapr_tce_iommu_ops {
>   struct iommu_table *(*get_table)(
>   struct spapr_tce_iommu_group *data,
>   int num);
> + void (*take_ownership)(struct spapr_tce_iommu_group *data,
> + bool enable);

"set" is a better verb when using a bool to specify direction, imho.

This is pretty confusing now that we have

iommu_take_ownership()
data->ops->take_ownership(true)

iommu_release_ownership()
data->ops->take_ownership(false)

And there are zero comments here about what take_ownership is supposed to
provide, or get_table for that matter.

>  };
>  
>  struct spapr_tce_iommu_group {
> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> index 1c5dae7..c2c8d9d 100644
> --- a/arch/powerpc/kernel/iommu.c
> +++ b/arch/powerpc/kernel/iommu.c
> @@ -1139,14 +1139,6 @@ int iommu_take_ownership(struct iommu_table *tbl)
>   memset(tbl->it_map, 0xff, sz);
>   iommu_clear_tces_and_put_pages(tbl, tbl->it_offset, tbl->it_size);
>  
> - /*
> -  * Disable iommu bypass, otherwise the user can DMA to all of
> -  * our physical memory via the bypass window instead of just
> -  * the pages that has been explicitly mapped into the iommu
> -  */
> - if (tbl->set_bypass)
> - tbl->set_bypass(tbl, false);
> -
>   return 0;
>  }
>  EXPORT_SYMBOL_GPL(iommu_take_ownership);
> @@ -1161,10 +1153,6 @@ void iommu_release_ownership(struct iommu_table *tbl)
>   /* Restore bit#0 set by iommu_init_table() */
>   if (tbl->it_offset == 0)
>   set_bit(0, tbl->it_map);
> -
> - /* The kernel owns the device now, we can restore the iommu bypass */
> - if (tbl->set_bypass)
> - tbl->set_bypass(tbl, true);
>  }
>  EXPORT_SYMBOL_GPL(iommu_release_ownership);
>  
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
> b/arch/powerpc/platforms/powernv/pci-ioda.c
> index 2d32a1c..8cb2f31 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> @@ -1105,10 +1105,8 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb 
> *phb,
>   __free_pages(tce_mem, get_order(TCE32_TABLE_SIZE * segs));
>  }
>  
> -static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable)
> +static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable)
>  {
> - struct pnv_ioda_pe *pe = container_of(tbl, struct pnv_ioda_pe,
> -   tce32.table);
>   uint16_t window_id = (pe->pe_number << 1 ) + 1;
>   int64_t rc;
>  
> @@ -1136,7 +1134,7 @@ static void pnv_pci_ioda2_set_bypass(struct iommu_table 
> *tbl, bool enable)
>* host side.
>*/
>   if (pe->pdev)
> - set_iommu_table_base(&pe->pdev->dev, tbl);
> + set_iommu_table_base(&pe->pdev->dev, &pe->tce32.table);
>   else
>   pnv_ioda_setup_bus_dma(pe, pe->pbus, false);
>   }
> @@ -1152,15 +1150,2

Re: [PATCH v2 13/13] vfio: powerpc/spapr: Enable Dynamic DMA windows

2014-09-23 Thread Alex Williamson
On Tue, 2014-09-23 at 13:01 +1000, Alexey Kardashevskiy wrote:
> This defines and implements VFIO IOMMU API which lets the userspace
> create and remove DMA windows.
> 
> This updates VFIO_IOMMU_SPAPR_TCE_GET_INFO to return the number of
> available windows and page mask.
> 
> This adds VFIO_IOMMU_SPAPR_TCE_CREATE and VFIO_IOMMU_SPAPR_TCE_REMOVE
> to allow the user space to create and remove window(s).
> 
> The VFIO IOMMU driver does basic sanity checks and calls corresponding
> SPAPR TCE functions. At the moment only IODA2 (POWER8 PCI host bridge)
> implements them.
> 
> This advertises VFIO_IOMMU_SPAPR_TCE_FLAG_DDW capability via
> VFIO_IOMMU_SPAPR_TCE_GET_INFO.
> 
> This calls platform DDW reset() callback when IOMMU is being disabled
> to reset the DMA configuration to its original state.
> 
> Signed-off-by: Alexey Kardashevskiy 
> ---
>  drivers/vfio/vfio_iommu_spapr_tce.c | 135 
> ++--
>  include/uapi/linux/vfio.h   |  25 ++-
>  2 files changed, 153 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
> b/drivers/vfio/vfio_iommu_spapr_tce.c
> index 0dccbc4..b518891 100644
> --- a/drivers/vfio/vfio_iommu_spapr_tce.c
> +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
> @@ -190,18 +190,25 @@ static void tce_iommu_disable(struct tce_container 
> *container)
>  
>   container->enabled = false;
>  
> - if (!container->grp || !current->mm)
> + if (!container->grp)
>   return;
>  
>   data = iommu_group_get_iommudata(container->grp);
>   if (!data || !data->iommu_owner || !data->ops->get_table)
>   return;
>  
> - tbl = data->ops->get_table(data, 0);
> - if (!tbl)
> - return;
> + if (current->mm) {
> + tbl = data->ops->get_table(data, 0);
> + if (tbl)
> + decrement_locked_vm(tbl);
>  
> - decrement_locked_vm(tbl);
> + tbl = data->ops->get_table(data, 1);
> + if (tbl)
> + decrement_locked_vm(tbl);
> + }
> +
> + if (data->ops->reset)
> + data->ops->reset(data);
>  }
>  
>  static void *tce_iommu_open(unsigned long arg)
> @@ -243,7 +250,7 @@ static long tce_iommu_ioctl(void *iommu_data,
>unsigned int cmd, unsigned long arg)
>  {
>   struct tce_container *container = iommu_data;
> - unsigned long minsz;
> + unsigned long minsz, ddwsz;
>   long ret;
>  
>   switch (cmd) {
> @@ -288,6 +295,28 @@ static long tce_iommu_ioctl(void *iommu_data,
>   info.dma32_window_size = tbl->it_size << tbl->it_page_shift;
>   info.flags = 0;
>  
> + ddwsz = offsetofend(struct vfio_iommu_spapr_tce_info,
> + page_size_mask);
> +
> + if (info.argsz == ddwsz) {

This comparison should be >= rather than ==.

> + if (data->ops->query && data->ops->create &&
> + data->ops->remove) {
> + info.flags |= VFIO_IOMMU_SPAPR_TCE_FLAG_DDW;

I think you want to set this flag regardless of whether the user has
provided space for it.  A valid use model is to call with the minimum
size and look at the flags to determine if it needs to be called again
with a larger size.

> +
> + ret = data->ops->query(data,
> + &info.current_windows,
> + &info.windows_available,
> + &info.page_size_mask);
> + if (ret)
> + return ret;
> + } else {
> + info.current_windows = 0;
> + info.windows_available = 0;
> + info.page_size_mask = 0;
> + }
> + minsz = ddwsz;

It's not really any longer the min size, is it?

> + }
> +
>   if (copy_to_user((void __user *)arg, &info, minsz))
>   return -EFAULT;
>  
> @@ -412,12 +441,106 @@ static long tce_iommu_ioctl(void *iommu_data,
>   tce_iommu_disable(container);
>   mutex_unlock(&container->lock);
>   return 0;
> +
>   case VFIO_EEH_PE_OP:
>   if (!container->grp)
>   return -ENODEV;
>  
>   return vfio_spapr_iommu_eeh_ioctl(container->grp,
> cmd, arg);
> +
> + case VFIO_IOMMU_SPAPR_TCE_CREATE: {
> + struct vfio_iommu_spapr_tce_create create;
> + struct spapr_tce_iommu_group *data;
> + struct iommu_table *tbl;
> +
> + if (WARN_ON(!container->grp))

redux previous comment on this warning

> + return -ENXIO;
> +
> + data = iommu_group_get_iommudata(container->grp);
> +
> + minsz = offsetofend(

Re: [PATCH 2/5] KVM: PPC: iommu: Add missing kvm_iommu_map_pages/kvm_iommu_unmap_pages

2013-05-06 Thread Alex Williamson
On Mon, 2013-05-06 at 17:21 +1000, a...@ozlabs.ru wrote:
> From: Alexey Kardashevskiy 
> 
> The IOMMU API implements groups creating/deletion, device binding
> and IOMMU map/unmap operations.
> 
> The PowerPC implementation uses most of the API except map/unmap
> operations, which are implemented on POWER using hypercalls.
> 
> However, in order to link a kernel with the CONFIG_IOMMU_API enabled,
> the empty kvm_iommu_map_pages/kvm_iommu_unmap_pages have to be
> defined, so this defines them.
> 
> Signed-off-by: Alexey Kardashevskiy 
> Cc: David Gibson 
> Signed-off-by: Paul Mackerras 
> ---
>  arch/powerpc/include/asm/kvm_host.h |   14 ++
>  1 file changed, 14 insertions(+)
> 
> diff --git a/arch/powerpc/include/asm/kvm_host.h 
> b/arch/powerpc/include/asm/kvm_host.h
> index b6a047e..c025d91 100644
> --- a/arch/powerpc/include/asm/kvm_host.h
> +++ b/arch/powerpc/include/asm/kvm_host.h
> @@ -603,4 +603,18 @@ struct kvm_vcpu_arch {
>  
>  #define __KVM_HAVE_ARCH_WQP
>  
> +#ifdef CONFIG_IOMMU_API
> +/* POWERPC does not use IOMMU API for mapping/unmapping */
> +static inline int kvm_iommu_map_pages(struct kvm *kvm,
> + struct kvm_memory_slot *slot)
> +{
> + return 0;
> +}
> +
> +static inline void kvm_iommu_unmap_pages(struct kvm *kvm,
> + struct kvm_memory_slot *slot)
> +{
> +}
> +#endif /* CONFIG_IOMMU_API */
> +
>  #endif /* __POWERPC_KVM_HOST_H__ */

This is no longer needed; Gleb applied my patch for 3.10 that makes all
of KVM device assignment dependent on a build config option and the top
level kvm_host.h now includes this when that is not set.  Thanks,

Alex

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH 2/5] KVM: PPC: iommu: Add missing kvm_iommu_map_pages/kvm_iommu_unmap_pages

2013-05-06 Thread Alex Williamson
On Tue, 2013-05-07 at 10:49 +1000, Alexey Kardashevskiy wrote:
> On 05/07/2013 07:07 AM, Alex Williamson wrote:
> > On Mon, 2013-05-06 at 17:21 +1000, a...@ozlabs.ru wrote:
> >> From: Alexey Kardashevskiy 
> >>
> >> The IOMMU API implements groups creating/deletion, device binding
> >> and IOMMU map/unmap operations.
> >>
> >> The PowerPC implementation uses most of the API except map/unmap
> >> operations, which are implemented on POWER using hypercalls.
> >>
> >> However, in order to link a kernel with the CONFIG_IOMMU_API enabled,
> >> the empty kvm_iommu_map_pages/kvm_iommu_unmap_pages have to be
> >> defined, so this defines them.
> >>
> >> Signed-off-by: Alexey Kardashevskiy 
> >> Cc: David Gibson 
> >> Signed-off-by: Paul Mackerras 
> >> ---
> >>  arch/powerpc/include/asm/kvm_host.h |   14 ++
> >>  1 file changed, 14 insertions(+)
> >>
> >> diff --git a/arch/powerpc/include/asm/kvm_host.h 
> >> b/arch/powerpc/include/asm/kvm_host.h
> >> index b6a047e..c025d91 100644
> >> --- a/arch/powerpc/include/asm/kvm_host.h
> >> +++ b/arch/powerpc/include/asm/kvm_host.h
> >> @@ -603,4 +603,18 @@ struct kvm_vcpu_arch {
> >>  
> >>  #define __KVM_HAVE_ARCH_WQP
> >>  
> >> +#ifdef CONFIG_IOMMU_API
> >> +/* POWERPC does not use IOMMU API for mapping/unmapping */
> >> +static inline int kvm_iommu_map_pages(struct kvm *kvm,
> >> +  struct kvm_memory_slot *slot)
> >> +{
> >> +  return 0;
> >> +}
> >> +
> >> +static inline void kvm_iommu_unmap_pages(struct kvm *kvm,
> >> +  struct kvm_memory_slot *slot)
> >> +{
> >> +}
> >> +#endif /* CONFIG_IOMMU_API */
> >> +
> >>  #endif /* __POWERPC_KVM_HOST_H__ */
> > 
> > This is no longer needed, Gleb applied my patch for 3.10 that make all
> > of KVM device assignment dependent on a build config option and the top
> > level kvm_host.h now includes this when that is not set.  Thanks,
> 
> Cannot find it, could you point me please where it is on github or
> git.kernel.org? Thanks.

http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=2a5bab1004729f3302c776e53ee7c895b98bb1ce

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH 0/5 v2] VFIO PPC64: add VFIO support on POWERPC64

2013-05-23 Thread Alex Williamson
On Tue, 2013-05-21 at 13:33 +1000, Alexey Kardashevskiy wrote:
> The series adds support for VFIO on POWERPC in user space (such as QEMU).
> The in-kernel real mode IOMMU support is added by another series posted
> separately.
> 
> As the first and main aim of this series is the POWERNV platform support,
> the "Enable on POWERNV platform" patch goes first and introduces an API
> to be used by the VFIO IOMMU driver. The "Enable on pSeries platform" patch
> simply registers PHBs in the IOMMU subsystem and expects the API to be 
> present,
> it enables VFIO support in fully emulated QEMU guests.
> 
> The main change is that this series was changed and tested against v3.10-rc1.
> It also contains some bugfixes which are mentioned (if any) in the patch 
> messages.
> 
> Alexey Kardashevskiy (3):
>   powerpc/vfio: Enable on POWERNV platform
>   powerpc/vfio: Implement IOMMU driver for VFIO
>   powerpc/vfio: Enable on pSeries platform
> 
>  Documentation/vfio.txt  |   63 +
>  arch/powerpc/include/asm/iommu.h|   26 ++
>  arch/powerpc/kernel/iommu.c |  323 +++
>  arch/powerpc/platforms/powernv/pci-ioda.c   |1 +
>  arch/powerpc/platforms/powernv/pci-p5ioc2.c |5 +-
>  arch/powerpc/platforms/powernv/pci.c|2 +
>  arch/powerpc/platforms/pseries/iommu.c  |4 +
>  drivers/iommu/Kconfig   |8 +
>  drivers/vfio/Kconfig|6 +
>  drivers/vfio/Makefile   |1 +
>  drivers/vfio/vfio.c |1 +
>  drivers/vfio/vfio_iommu_spapr_tce.c |  377 
> +++
>  include/uapi/linux/vfio.h   |   34 +++
>  13 files changed, 850 insertions(+), 1 deletion(-)
>  create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c
> 

These look ok to me, how do you want to integrate them?  Should I
provide Acks on patches 2 & 3 and let them get pushed through the ppc
tree or should I wait for patch 1 then push 2 & 3 through my tree?
Thanks,

Alex

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH 2/3] powerpc/vfio: Implement IOMMU driver for VFIO

2013-05-24 Thread Alex Williamson
On Tue, 2013-05-21 at 13:33 +1000, Alexey Kardashevskiy wrote:
> VFIO implements platform independent stuff such as
> a PCI driver, BAR access (via read/write on a file descriptor
> or direct mapping when possible) and IRQ signaling.
> 
> The platform dependent part includes IOMMU initialization
> and handling.  This implements an IOMMU driver for VFIO
> which does mapping/unmapping pages for the guest IO and
> provides information about DMA window (required by a POWER
> guest).
> 
> Cc: David Gibson 
> Signed-off-by: Alexey Kardashevskiy 
> Signed-off-by: Paul Mackerras 

Acked-by: Alex Williamson 

> ---
>  Documentation/vfio.txt  |   63 ++
>  drivers/vfio/Kconfig|6 +
>  drivers/vfio/Makefile   |1 +
>  drivers/vfio/vfio.c |1 +
>  drivers/vfio/vfio_iommu_spapr_tce.c |  377 
> +++
>  include/uapi/linux/vfio.h   |   34 
>  6 files changed, 482 insertions(+)
>  create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c
> 
> diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
> index 8eda363..c55533c 100644
> --- a/Documentation/vfio.txt
> +++ b/Documentation/vfio.txt
> @@ -283,6 +283,69 @@ a direct pass through for VFIO_DEVICE_* ioctls.  The 
> read/write/mmap
>  interfaces implement the device region access defined by the device's
>  own VFIO_DEVICE_GET_REGION_INFO ioctl.
>  
> +
> +PPC64 sPAPR implementation note
> +---
> +
> +This implementation has some specifics:
> +
> +1) Only one IOMMU group per container is supported as an IOMMU group
> +represents the minimal entity which isolation can be guaranteed for and
> +groups are allocated statically, one per a Partitionable Endpoint (PE)
> +(PE is often a PCI domain but not always).
> +
> +2) The hardware supports so called DMA windows - the PCI address range
> +within which DMA transfer is allowed, any attempt to access address space
> +out of the window leads to the whole PE isolation.
> +
> +3) PPC64 guests are paravirtualized but not fully emulated. There is an API
> +to map/unmap pages for DMA, and it normally maps 1..32 pages per call and
> +currently there is no way to reduce the number of calls. In order to make 
> things
> +faster, the map/unmap handling has been implemented in real mode which 
> provides
> +an excellent performance which has limitations such as inability to do
> +locked pages accounting in real time.
> +
> +So 3 additional ioctls have been added:
> +
> + VFIO_IOMMU_SPAPR_TCE_GET_INFO - returns the size and the start
> + of the DMA window on the PCI bus.
> +
> + VFIO_IOMMU_ENABLE - enables the container. The locked pages accounting
> + is done at this point. This lets user first to know what
> + the DMA window is and adjust rlimit before doing any real job.
> +
> + VFIO_IOMMU_DISABLE - disables the container.
> +
> +
> +The code flow from the example above should be slightly changed:
> +
> + .
> + /* Add the group to the container */
> + ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
> +
> + /* Enable the IOMMU model we want */
> + ioctl(container, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_IOMMU)
> +
> + /* Get addition sPAPR IOMMU info */
> + vfio_iommu_spapr_tce_info spapr_iommu_info;
> + ioctl(container, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &spapr_iommu_info);
> +
> + if (ioctl(container, VFIO_IOMMU_ENABLE))
> + /* Cannot enable container, may be low rlimit */
> +
> + /* Allocate some space and setup a DMA mapping */
> + dma_map.vaddr = mmap(0, 1024 * 1024, PROT_READ | PROT_WRITE,
> +  MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
> +
> + dma_map.size = 1024 * 1024;
> + dma_map.iova = 0; /* 1MB starting at 0x0 from device view */
> + dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
> +
> + /* Check here is .iova/.size are within DMA window from 
> spapr_iommu_info */
> +
> + ioctl(container, VFIO_IOMMU_MAP_DMA, &dma_map);
> + .
> +
>  
> ---
>  
>  [1] VFIO was originally an acronym for "Virtual Function I/O" in its
> diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
> index 7cd5dec..b464687 100644
> --- a/drivers/vfio/Kconfig
> +++ b/drivers/vfio/Kconfig
> @@ -3,10 +3,16 @@ config VFIO_IOMMU_TYPE1
>   depends on VFIO
>   default n
>  
> +config VFIO_IOMMU_SPAPR_TCE
> + tristate
> + depends on VFIO && SPAPR_TCE_IO

Re: [PATCH 3/3] powerpc/vfio: Enable on pSeries platform

2013-05-24 Thread Alex Williamson
On Tue, 2013-05-21 at 13:33 +1000, Alexey Kardashevskiy wrote:
> The enables VFIO on the pSeries platform, enabling user space
> programs to access PCI devices directly.
> 
> Signed-off-by: Alexey Kardashevskiy 
> Cc: David Gibson 
> Signed-off-by: Paul Mackerras 

Acked-by: Alex Williamson 

> ---
>  arch/powerpc/platforms/pseries/iommu.c |4 
>  drivers/iommu/Kconfig  |2 +-
>  drivers/vfio/Kconfig   |2 +-
>  3 files changed, 6 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/pseries/iommu.c 
> b/arch/powerpc/platforms/pseries/iommu.c
> index 86ae364..23fc1dc 100644
> --- a/arch/powerpc/platforms/pseries/iommu.c
> +++ b/arch/powerpc/platforms/pseries/iommu.c
> @@ -614,6 +614,7 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
>  
>   iommu_table_setparms(pci->phb, dn, tbl);
>   pci->iommu_table = iommu_init_table(tbl, pci->phb->node);
> + iommu_register_group(tbl, pci_domain_nr(bus), 0);
>  
>   /* Divide the rest (1.75GB) among the children */
>   pci->phb->dma_window_size = 0x8000ul;
> @@ -658,6 +659,7 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus 
> *bus)
>  ppci->phb->node);
>   iommu_table_setparms_lpar(ppci->phb, pdn, tbl, dma_window);
>   ppci->iommu_table = iommu_init_table(tbl, ppci->phb->node);
> + iommu_register_group(tbl, pci_domain_nr(bus), 0);
>   pr_debug("  created table: %p\n", ppci->iommu_table);
>   }
>  }
> @@ -684,6 +686,7 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)
>  phb->node);
>   iommu_table_setparms(phb, dn, tbl);
>   PCI_DN(dn)->iommu_table = iommu_init_table(tbl, phb->node);
> + iommu_register_group(tbl, pci_domain_nr(phb->bus), 0);
>   set_iommu_table_base(&dev->dev, PCI_DN(dn)->iommu_table);
>   return;
>   }
> @@ -1184,6 +1187,7 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev 
> *dev)
>  pci->phb->node);
>   iommu_table_setparms_lpar(pci->phb, pdn, tbl, dma_window);
>   pci->iommu_table = iommu_init_table(tbl, pci->phb->node);
> + iommu_register_group(tbl, pci_domain_nr(pci->phb->bus), 0);
>   pr_debug("  created table: %p\n", pci->iommu_table);
>   } else {
>   pr_debug("  found DMA window, table: %p\n", pci->iommu_table);
> diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
> index 3f3abde..01730b2 100644
> --- a/drivers/iommu/Kconfig
> +++ b/drivers/iommu/Kconfig
> @@ -263,7 +263,7 @@ config SHMOBILE_IOMMU_L1SIZE
>  
>  config SPAPR_TCE_IOMMU
>   bool "sPAPR TCE IOMMU Support"
> - depends on PPC_POWERNV
> + depends on PPC_POWERNV || PPC_PSERIES
>   select IOMMU_API
>   help
> Enables bits of IOMMU API required by VFIO. The iommu_ops
> diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
> index b464687..26b3d9d 100644
> --- a/drivers/vfio/Kconfig
> +++ b/drivers/vfio/Kconfig
> @@ -12,7 +12,7 @@ menuconfig VFIO
>   tristate "VFIO Non-Privileged userspace driver framework"
>   depends on IOMMU_API
>   select VFIO_IOMMU_TYPE1 if X86
> - select VFIO_IOMMU_SPAPR_TCE if PPC_POWERNV
> + select VFIO_IOMMU_SPAPR_TCE if (PPC_POWERNV || PPC_PSERIES)
>   help
> VFIO provides a framework for secure userspace device drivers.
> See Documentation/vfio.txt for more details.



___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH 3/4] KVM: PPC: Add support for IOMMU in-kernel handling

2013-06-16 Thread Alex Williamson
On Mon, 2013-06-17 at 08:39 +1000, Benjamin Herrenschmidt wrote:
> On Wed, 2013-06-05 at 16:11 +1000, Alexey Kardashevskiy wrote:
> > +long kvm_vm_ioctl_create_spapr_tce_iommu(struct kvm *kvm,
> > +   struct kvm_create_spapr_tce_iommu *args)
> > +{
> > +   struct kvmppc_spapr_tce_table *tt = NULL;
> > +   struct iommu_group *grp;
> > +   struct iommu_table *tbl;
> > +
> > +   /* Find an IOMMU table for the given ID */
> > +   grp = iommu_group_get_by_id(args->iommu_id);
> > +   if (!grp)
> > +   return -ENXIO;
> > +
> > +   tbl = iommu_group_get_iommudata(grp);
> > +   if (!tbl)
> > +   return -ENXIO;
> 
> So Alex Graf pointed out here, there is a security issue here, or are we
> missing something ?
> 
> What prevents a malicious program that has access to /dev/kvm from
> taking over random iommu groups (including host used ones) that way?
> 
> What is the security model of that whole iommu stuff to begin with ?

IOMMU groups themselves don't provide security, they're accessed by
interfaces like VFIO, which provide the security.  Given a brief look, I
agree, this looks like a possible backdoor.  The typical VFIO way to
handle this would be to pass a VFIO file descriptor here to prove that
the process has access to the IOMMU group.  This is how /dev/vfio/vfio
gains the ability to set up an IOMMU domain and do mappings with the
SET_CONTAINER ioctl using a group fd.  Thanks,

Alex



___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH 3/4] KVM: PPC: Add support for IOMMU in-kernel handling

2013-06-17 Thread Alex Williamson
On Mon, 2013-06-17 at 13:56 +1000, Benjamin Herrenschmidt wrote:
> On Sun, 2013-06-16 at 21:13 -0600, Alex Williamson wrote:
> 
> > IOMMU groups themselves don't provide security, they're accessed by
> > interfaces like VFIO, which provide the security.  Given a brief look, I
> > agree, this looks like a possible backdoor.  The typical VFIO way to
> > handle this would be to pass a VFIO file descriptor here to prove that
> > the process has access to the IOMMU group.  This is how /dev/vfio/vfio
> > gains the ability to setup an IOMMU domain an do mappings with the
> > SET_CONTAINER ioctl using a group fd.  Thanks,
> 
> How do you envision that in the kernel ? IE. I'm in KVM code, gets that
> vfio fd, what do I do with it ?
> 
> Basically, KVM needs to know that the user is allowed to use that iommu
> group. I don't think we want KVM however to call into VFIO directly
> right ?

Right, we don't want to create dependencies across modules.  I don't
have a vision for how this should work.  This is effectively a complete
side-band to vfio, so we're really just dealing in the iommu group
space.  Maybe there needs to be some kind of registration of ownership
for the group using some kind of token.  It would need to include some
kind of notification when that ownership ends.  That might also be a
convenient tag to toggle driver probing off for devices in the group.
Other ideas?  Thanks,

Alex

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH 3/4] KVM: PPC: Add support for IOMMU in-kernel handling

2013-06-18 Thread Alex Williamson
On Tue, 2013-06-18 at 14:38 +1000, Benjamin Herrenschmidt wrote:
> On Mon, 2013-06-17 at 20:32 -0600, Alex Williamson wrote:
> 
> > Right, we don't want to create dependencies across modules.  I don't
> > have a vision for how this should work.  This is effectively a complete
> > side-band to vfio, so we're really just dealing in the iommu group
> > space.  Maybe there needs to be some kind of registration of ownership
> > for the group using some kind of token.  It would need to include some
> > kind of notification when that ownership ends.  That might also be a
> > convenient tag to toggle driver probing off for devices in the group.
> > Other ideas?  Thanks,
> 
> All of that smells nasty like it will need a pile of bloody
> infrastructure which makes me think it's too complicated and not the
> right approach.
> 
> How does access control work today on x86/VFIO ? Can you give me a bit
> more details ? I didn't get a good grasp in your previous email

The current model is not x86 specific, but it only covers doing iommu
and device access through vfio.  The kink here is that we're trying to
do device access and setup through vfio, but iommu manipulation through
kvm.  We may want to revisit whether we can do the in-kernel iommu
manipulation through vfio rather than kvm.

For vfio in general, the group is the unit of ownership.  A user is
granted access to /dev/vfio/$GROUP through file permissions.  The user
opens the group and a container (/dev/vfio/vfio) and calls SET_CONTAINER
on the group.  If supported by the platform, multiple groups can be set
to the same container, which allows for iommu domain sharing.  Once a
group is associated with a container, an iommu backend can be
initialized for the container.  Only then can a device be accessed
through the group.

So even if we were to pass a vfio group file descriptor into kvm and it
matched as some kind of ownership token on the iommu group, it's not
clear that's sufficient to assume we can start programming the iommu.
Thanks,

Alex

> From the look of it, the VFIO file descriptor is what has the "access
> control" to the underlying iommu, is this right ? So we somewhat need to
> transfer (or copy) that ownership from the VFIO fd to the KVM VM.
> 
> I don't see a way to do that without some cross-layering here...
> 
> Rusty, are you aware of some kernel mechanism we can use for that ?
> 
> Cheers,
> Ben.
> 
> 



___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH 3/4] KVM: PPC: Add support for IOMMU in-kernel handling

2013-06-19 Thread Alex Williamson
On Thu, 2013-06-20 at 00:50 +1000, Benjamin Herrenschmidt wrote:
> On Wed, 2013-06-19 at 11:58 +0200, Alexander Graf wrote:
> 
> > > Alex, any objection ?
> > 
> > Which Alex? :)
> 
> Heh, mostly Williamson in this specific case but your input is still
> welcome :-)
> 
> > I think validate works, it keeps iteration logic out of the kernel
> > which is a good thing. There still needs to be an interface for
> > getting the iommu id in VFIO, but I suppose that one's for the other
> > Alex and Jörg to comment on.
> 
> I think getting the iommu fd is already covered by separate patches from
> Alexey.
> 
> > > 
> > > Do we need to make it a get/put interface instead ?
> > > 
> > >   vfio_validate_and_use_iommu(file, iommu_id);
> > > 
> > >   vfio_release_iommu(file, iommu_id);
> > > 
> > > To ensure that the resource remains owned by the process until KVM
> > > is closed as well ?
> > > 
> > > Or do we want to register with VFIO with a callback so that VFIO can
> > > call us if it needs us to give it up ?
> > 
> > Can't we just register a handler on the fd and get notified when it
> > closes? Can you kill VFIO access without closing the fd?
> 
> That sounds actually harder :-)
> 
> The question is basically: When we validate that relationship between a
> specific VFIO struct file with an iommu, what is the lifetime of that
> and how do we handle this lifetime properly.
> 
> There's two ways for that sort of situation: The notification model
> where we get notified when the relationship is broken, and the refcount
> model where we become a "user" and thus delay the breaking of the
> relationship until we have been disposed of as well.
> 
> In this specific case, it's hard to tell what is the right model from my
> perspective, which is why I would welcome Alex (W.) input.
> 
> In the end, the solution will end up being in the form of APIs exposed
> by VFIO for use by KVM (via that symbol lookup mechanism) so Alex (W),
> as owner of VFIO at this stage, what do you want those to look
> like ? :-)

My first thought is that we should use the same reference counting as we
have for vfio devices (group->container_users).  An interface for that
might look like:

int vfio_group_add_external_user(struct file *filep)
{
struct vfio_group *group = filep->private_data;

if (filep->f_op != &vfio_group_fops)
return -EINVAL;


if (!atomic_inc_not_zero(&group->container_users))
return -EINVAL;

return 0;
}

void vfio_group_del_external_user(struct file *filep)
{
struct vfio_group *group = filep->private_data;

BUG_ON(filep->f_op != &vfio_group_fops);

vfio_group_try_dissolve_container(group);
}

int vfio_group_iommu_id_from_file(struct file *filep)
{
struct vfio_group *group = filep->private_data;

BUG_ON(filep->f_op != &vfio_group_fops);

return iommu_group_id(group->iommu_group);
}

Would that work?  Thanks,

Alex

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH 3/4] KVM: PPC: Add support for IOMMU in-kernel handling

2013-06-20 Thread Alex Williamson
On Thu, 2013-06-20 at 18:48 +1000, Alexey Kardashevskiy wrote:
> On 06/20/2013 05:47 PM, Benjamin Herrenschmidt wrote:
> > On Thu, 2013-06-20 at 15:28 +1000, David Gibson wrote:
> >>> Just out of curiosity - would not get_file() and fput_atomic() on a
> >> group's
> >>> file* do the right job instead of vfio_group_add_external_user() and
> >>> vfio_group_del_external_user()?
> >>
> >> I was thinking that too.  Grabbing a file reference would certainly be
> >> the usual way of handling this sort of thing.
> > 
> > But that wouldn't prevent the group ownership to be returned to
> > the kernel or another user would it ?
> 
> 
> Holding the file pointer does not let the group->container_users counter go
> to zero

How so?  Holding the file pointer means the file won't go away, which
means the group release function won't be called.  That means the group
won't go away, but that doesn't mean it's attached to an IOMMU.  A user
could call UNSET_CONTAINER.

>  and this is exactly what vfio_group_add_external_user() and
> vfio_group_del_external_user() do. The difference is only in absolute value
> - 2 vs. 3.
> 
> No change in behaviour whether I use new vfio API or simply hold file* till
> KVM closes fd created when IOMMU was connected to LIOBN.

By that notion you could open(/dev/vfio/$GROUP) and you're safe, right?
But what about SET_CONTAINER & SET_IOMMU?  All that you guarantee
holding the file pointer is that the vfio_group exists.

> And while this counter is not zero, QEMU cannot take ownership over the group.
>
> I am definitely still missing the bigger picture...

The bigger picture is that the group needs to exist AND it needs to be
setup and maintained to have IOMMU protection.  Actually, my first stab
at add_external_user doesn't look sufficient, it needs to look more like
vfio_group_get_device_fd, checking group->container->iommu and
group_viable().  As written it would allow an external user after
SET_CONTAINER without SET_IOMMU.  It should also be part of the API that
the external user must hold the file reference between add_external_user
and del_external_user and do cleanup on any exit paths.  Thanks,

Alex

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH 3/4] KVM: PPC: Add support for IOMMU in-kernel handling

2013-06-22 Thread Alex Williamson
On Sat, 2013-06-22 at 22:03 +1000, David Gibson wrote:
> On Thu, Jun 20, 2013 at 08:55:13AM -0600, Alex Williamson wrote:
> > On Thu, 2013-06-20 at 18:48 +1000, Alexey Kardashevskiy wrote:
> > > On 06/20/2013 05:47 PM, Benjamin Herrenschmidt wrote:
> > > > On Thu, 2013-06-20 at 15:28 +1000, David Gibson wrote:
> > > >>> Just out of curiosity - would not get_file() and fput_atomic() on a
> > > >> group's
> > > >>> file* do the right job instead of vfio_group_add_external_user() and
> > > >>> vfio_group_del_external_user()?
> > > >>
> > > >> I was thinking that too.  Grabbing a file reference would certainly be
> > > >> the usual way of handling this sort of thing.
> > > > 
> > > > But that wouldn't prevent the group ownership to be returned to
> > > > the kernel or another user would it ?
> > > 
> > > 
> > > Holding the file pointer does not let the group->container_users counter 
> > > go
> > > to zero
> > 
> > How so?  Holding the file pointer means the file won't go away, which
> > means the group release function won't be called.  That means the group
> > won't go away, but that doesn't mean it's attached to an IOMMU.  A user
> > could call UNSET_CONTAINER.
> 
> Uhh... *thinks*.  Ah, I see.
> 
> I think the interface should not take the group fd, but the container
> fd.  Holding a reference to *that* would keep the necessary things
> around.  But more to the point, it's the right thing semantically:
> 
> The container is essentially the handle on a host iommu address space,
> and so that's what should be bound by the KVM call to a particular
> guest iommu address space.  e.g. it would make no sense to bind two
> different groups to different guest iommu address spaces, if they were
> in the same container - the guest thinks they are different spaces,
> but if they're in the same container they must be the same space.

While the container is the gateway to the iommu, what empowers the
container to maintain an iommu is the group.  What happens to a
container when all the groups are disconnected or closed?  Groups are
the unit that indicates hardware access, not containers.  Thanks,

Alex

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH 3/4] KVM: PPC: Add support for IOMMU in-kernel handling

2013-06-23 Thread Alex Williamson
On Mon, 2013-06-24 at 13:52 +1000, David Gibson wrote:
> On Sat, Jun 22, 2013 at 08:28:06AM -0600, Alex Williamson wrote:
> > On Sat, 2013-06-22 at 22:03 +1000, David Gibson wrote:
> > > On Thu, Jun 20, 2013 at 08:55:13AM -0600, Alex Williamson wrote:
> > > > On Thu, 2013-06-20 at 18:48 +1000, Alexey Kardashevskiy wrote:
> > > > > On 06/20/2013 05:47 PM, Benjamin Herrenschmidt wrote:
> > > > > > On Thu, 2013-06-20 at 15:28 +1000, David Gibson wrote:
> > > > > >>> Just out of curiosity - would not get_file() and fput_atomic() on 
> > > > > >>> a
> > > > > >> group's
> > > > > >>> file* do the right job instead of vfio_group_add_external_user() 
> > > > > >>> and
> > > > > >>> vfio_group_del_external_user()?
> > > > > >>
> > > > > >> I was thinking that too.  Grabbing a file reference would 
> > > > > >> certainly be
> > > > > >> the usual way of handling this sort of thing.
> > > > > > 
> > > > > > But that wouldn't prevent the group ownership to be returned to
> > > > > > the kernel or another user would it ?
> > > > > 
> > > > > 
> > > > > Holding the file pointer does not let the group->container_users 
> > > > > counter go
> > > > > to zero
> > > > 
> > > > How so?  Holding the file pointer means the file won't go away, which
> > > > means the group release function won't be called.  That means the group
> > > > won't go away, but that doesn't mean it's attached to an IOMMU.  A user
> > > > could call UNSET_CONTAINER.
> > > 
> > > Uhh... *thinks*.  Ah, I see.
> > > 
> > > I think the interface should not take the group fd, but the container
> > > fd.  Holding a reference to *that* would keep the necessary things
> > > around.  But more to the point, it's the right thing semantically:
> > > 
> > > The container is essentially the handle on a host iommu address space,
> > > and so that's what should be bound by the KVM call to a particular
> > > guest iommu address space.  e.g. it would make no sense to bind two
> > > different groups to different guest iommu address spaces, if they were
> > > in the same container - the guest thinks they are different spaces,
> > > but if they're in the same container they must be the same space.
> > 
> > While the container is the gateway to the iommu, what empowers the
> > container to maintain an iommu is the group.  What happens to a
> > container when all the groups are disconnected or closed?  Groups are
> > the unit that indicates hardware access, not containers.  Thanks,
> 
> Uh... huh?  I'm really not sure what you're getting at.
> 
> The operation we're doing for KVM here is binding a guest iommu
> address space to a particular host iommu address space.  Why would we
> not want to use the obvious handle on the host iommu address space,
> which is the container fd?

AIUI, the request isn't for an interface through which to do iommu
mappings.  The request is for an interface to show that the user has
sufficient privileges to do mappings.  Groups are what gives the user
that ability.  The iommu is also possibly associated with multiple iommu
groups and I believe what is being asked for here is a way to hold and
lock a single iommu group with iommu protection.

From a practical point of view, the iommu interface is de-privileged
once the groups are disconnected or closed.  Holding a reference count
on the iommu fd won't prevent that.  That means we'd have to use a
notifier to have KVM stop the side-channel iommu access.  Meanwhile
holding the file descriptor for the group and adding an interface that
bumps the use counter allows KVM to lock itself in, just as if it had a
device opened itself.  Thanks,

Alex

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH 3/3 v16] iommu/fsl: Freescale PAMU driver and iommu implementation.

2013-06-24 Thread Alex Williamson
On Thu, 2013-06-20 at 21:31 +0530, Varun Sethi wrote:

> +#define REQ_ACS_FLAGS(PCI_ACS_SV | PCI_ACS_RR | PCI_ACS_CR | 
> PCI_ACS_UF)
> +
> +static struct iommu_group *get_device_iommu_group(struct device *dev)
> +{
> + struct iommu_group *group;
> +
> + group = iommu_group_get(dev);
> + if (!group)
> + group = iommu_group_alloc();
> +
> + return group;
> +}
> +
[snip]
> +

This really gets parent or peer, right?

> +static struct iommu_group *get_peer_pci_device_group(struct pci_dev *pdev)
> +{
> + struct iommu_group *group = NULL;
> +
> + /* check if this is the first device on the bus*/
> + if (pdev->bus_list.next == pdev->bus_list.prev) {

It's a list_head, use list functions.  The list implementation should be
treated as opaque.

if (list_is_singular(&pdev->bus_list))

> + struct pci_bus *bus = pdev->bus->parent;
> + /* Traverese the parent bus list to get
> +  * pdev & dev for the sibling device.
> +  */
> + while (bus) {
> + if (!list_empty(&bus->devices)) {
> + pdev = container_of(bus->devices.next,
> + struct pci_dev, bus_list);

pdev = list_first_entry(&bus->devices, struct pci_dev, bus_list);

> + group = iommu_group_get(&pdev->dev);
> + break;
> + } else
> + bus = bus->parent;

Is this ever reached?  Don't you always have bus->self?

> + }
> + } else {
> + /*
> +  * Get the pdev & dev for the sibling device
> +  */
> + pdev = container_of(pdev->bus_list.prev,
> + struct pci_dev, bus_list);

How do you know if you're at the head or tail of the list?

struct pci_dev *tmp;
list_for_each_entry(tmp, &pdev->bus_list, bus_list) {
if (tmp == pdev)
continue;

group = iommu_group_get(&tmp->dev);
break;
}

> + group = iommu_group_get(&pdev->dev);
> + }
> +
> + return group;
> +}
> +
> +static struct iommu_group *get_pci_device_group(struct pci_dev *pdev)
> +{
> + struct iommu_group *group = NULL;
> + struct pci_dev *bridge, *dma_pdev = NULL;
> + struct pci_controller *pci_ctl;
> + bool pci_endpt_partioning;
> +
> + pci_ctl = pci_bus_to_host(pdev->bus);
> + pci_endpt_partioning = check_pci_ctl_endpt_part(pci_ctl);
> + /* We can partition PCIe devices so assign device group to the device */
> + if (pci_endpt_partioning) {
> + bridge = pci_find_upstream_pcie_bridge(pdev);
> + if (bridge) {
> + if (pci_is_pcie(bridge))
> + dma_pdev = pci_get_domain_bus_and_slot(
> + pci_domain_nr(pdev->bus),
> + bridge->subordinate->number, 0);
> + if (!dma_pdev)
> + dma_pdev = pci_dev_get(bridge);
> + } else
> + dma_pdev = pci_dev_get(pdev);
> +
> + /* Account for quirked devices */
> + swap_pci_ref(&dma_pdev, pci_get_dma_source(dma_pdev));
> +
> + /*
> +  * If it's a multifunction device that does not support our
> +  * required ACS flags, add to the same group as function 0.
> +  */

See c14d2690 in Joerg's next tree, using function 0 was a poor
assumption.

> + if (dma_pdev->multifunction &&
> + !pci_acs_enabled(dma_pdev, REQ_ACS_FLAGS))
> + swap_pci_ref(&dma_pdev,
> +  pci_get_slot(dma_pdev->bus,
> +   
> PCI_DEVFN(PCI_SLOT(dma_pdev->devfn),
> +   0)));
> +
> + group = get_device_iommu_group(&pdev->dev);
> + pci_dev_put(pdev);

What was the point of all the above if we use pdev here instead of
dma_pdev?  Wrong device and broken reference counting.  This also isn't
testing ACS all the way up to the root complex or controller.

> + /*
> +  * PCIe controller is not a paritionable entity
> +  * free the controller device iommu_group.
> +  */
> + if (pci_ctl->parent->iommu_group)
> + iommu_group_remove_device(pci_ctl->parent);
> + } else {
> + /*
> +  * All devices connected to the controller will share the
> +  * PCI controllers device group. If this is the first
> +  * device to be probed for the pci controller, copy the
> +  * device group information from the PCI controller device
> +  * node and remove the PCI controller iommu group.
> +  * For subsequent devices, the iommu g

Re: [PATCH 3/3 v16] iommu/fsl: Freescale PAMU driver and iommu implementation.

2013-06-26 Thread Alex Williamson
On Wed, 2013-06-26 at 06:24 +0000, Sethi Varun-B16395 wrote:
> 
> > -Original Message-
> > From: Alex Williamson [mailto:alex.william...@redhat.com]
> > Sent: Tuesday, June 25, 2013 10:27 AM
> > To: Sethi Varun-B16395
> > Cc: j...@8bytes.org; io...@lists.linux-foundation.org; linuxppc-
> > d...@lists.ozlabs.org; linux-ker...@vger.kernel.org;
> > b...@kernel.crashing.org; ga...@kernel.crashing.org; Yoder Stuart-B08248;
> > Wood Scott-B07421; Timur Tabi
> > Subject: Re: [PATCH 3/3 v16] iommu/fsl: Freescale PAMU driver and iommu
> > implementation.
> > 
> > On Thu, 2013-06-20 at 21:31 +0530, Varun Sethi wrote:
> > 
> > > +#define REQ_ACS_FLAGS(PCI_ACS_SV | PCI_ACS_RR | PCI_ACS_CR |
> > PCI_ACS_UF)
> > > +
> > > +static struct iommu_group *get_device_iommu_group(struct device *dev)
> > > +{
> > > + struct iommu_group *group;
> > > +
> > > + group = iommu_group_get(dev);
> > > + if (!group)
> > > + group = iommu_group_alloc();
> > > +
> > > + return group;
> > > +}
> > > +
> > [snip]
> > > +
> > 
> > This really gets parent or peer, right?
> > 
> > > +static struct iommu_group *get_peer_pci_device_group(struct pci_dev
> > > +*pdev) {
> > > + struct iommu_group *group = NULL;
> > > +
> > > + /* check if this is the first device on the bus*/
> > > + if (pdev->bus_list.next == pdev->bus_list.prev) {
> > 
> > It's a list_head, use list functions.  The list implementation should be
> > treated as opaque.
> > 
> > if (list_is_singular(&pdev->bus_list))
> > 
> > > + struct pci_bus *bus = pdev->bus->parent;
> > > + /* Traverese the parent bus list to get
> > > +  * pdev & dev for the sibling device.
> > > +  */
> > > + while (bus) {
> > > + if (!list_empty(&bus->devices)) {
> > > + pdev = container_of(bus->devices.next,
> > > + struct pci_dev, bus_list);
> > 
> > pdev = list_first_entry(&bus->devices, struct pci_dev, bus_list);
> > 
> > > + group = iommu_group_get(&pdev->dev);
> > > + break;
> > > + } else
> > > + bus = bus->parent;
> > 
> > Is this ever reached?  Don't you always have bus->self?
> > 
> [Sethi Varun-B16395] Not sure I understand. Trying to get the group
> information from the parent bus, if there are no sibling devices on
> the current bus.

I assume there's always a bridge on a bus, but maybe that bridge
(parent->self) is not in the list of parent->devices?  Is that the case?
If not, then there's always a device on the bus, the bridge that created
it.

> > > + }
> > > + } else {
> > > + /*
> > > +  * Get the pdev & dev for the sibling device
> > > +  */
> > > + pdev = container_of(pdev->bus_list.prev,
> > > + struct pci_dev, bus_list);
> > 
> > How do you know if you're at the head or tail of the list?
> > 
> > struct pci_dev *tmp;
> > list_for_each_entry(tmp, &pdev->bus_list, bus_list) {
> > if (tmp == pdev)
> > continue;
> > 
> > group = iommu_group_get(&tmp->dev);
> > break;
> > }
> > 
> > > + group = iommu_group_get(&pdev->dev);
> > > + }
> > > +
> > > + return group;
> > > +}
> > > +
> > > +static struct iommu_group *get_pci_device_group(struct pci_dev *pdev)
> > > +{
> > > + struct iommu_group *group = NULL;
> > > + struct pci_dev *bridge, *dma_pdev = NULL;
> > > + struct pci_controller *pci_ctl;
> > > + bool pci_endpt_partioning;
> > > +
> > > + pci_ctl = pci_bus_to_host(pdev->bus);
> > > + pci_endpt_partioning = check_pci_ctl_endpt_part(pci_ctl);
> > > + /* We can partition PCIe devices so assign device group to the
> > device */
> > > + if (pci_endpt_partioning) {
> > > + bridge = pci_find_upstream_pcie_bridge(pdev);
> > > + if (bridge) {
> > > + if (pci_is_pcie(bridge))
> > > + dma_pdev = pci_get_domain_bus_and_slot(
> > > + pci_domain_nr(pdev->bu

Re: [PATCH v2] vfio: add external user support

2013-06-27 Thread Alex Williamson
On Thu, 2013-06-27 at 17:14 +1000, Alexey Kardashevskiy wrote:
> VFIO is designed to be used via ioctls on file descriptors
> returned by VFIO.
> 
> However in some situations support for an external user is required.
> The first user is KVM on PPC64 (SPAPR TCE protocol) which is going to
> use the existing VFIO groups for exclusive access in real/virtual mode
> in the host kernel to avoid passing map/unmap requests to the user
> space which would make things pretty slow.
> 
> The proposed protocol includes:
> 
> 1. do normal VFIO init stuff such as opening a new container, attaching
> group(s) to it, setting an IOMMU driver for a container. When IOMMU is
> set for a container, all groups in it are considered ready to use by
> an external user.
> 
> 2. pass a fd of the group we want to accelerate to KVM. KVM calls
> vfio_group_iommu_id_from_file() to verify if the group is initialized
> and IOMMU is set for it. The current TCE IOMMU driver marks the whole
> IOMMU table as busy when IOMMU is set for a container, which prevents
> other DMA users from allocating from it so it is safe to pass the group
> to the user space.
> 
> 3. KVM increases the container users counter via
> vfio_group_add_external_user(). This prevents the VFIO group from
> being disposed prior to exiting KVM.
> 
> 4. When KVM is finished and doing cleanup, it releases the group file
> and decrements the container users counter. Everything gets released.
> 
> 5. KVM also keeps the group file as otherwise its fd might have been
> closed at the moment of KVM finish so vfio_group_del_external_user()
> call will not be possible.

This is the wrong order in my mind.  An external user has no business
checking or maintaining any state of a group until it calls
add_external_user().  Only after that call is successful can the user
assume the filep to group relationship is static and get the iommu_id.
Any use of the "external user" API should start with "add" and end with
"del".

> The "vfio: Limit group opens" patch is also required for the consistency.
> 
> Signed-off-by: Alexey Kardashevskiy 
> ---
> 
> v1->v2: added definitions to vfio.h :)
> Should not compile but compiled. Hm.
> 
> ---
>  drivers/vfio/vfio.c  |   54 
> ++
>  include/linux/vfio.h |7 +++
>  2 files changed, 61 insertions(+)
> 
> diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
> index c488da5..40875d2 100644
> --- a/drivers/vfio/vfio.c
> +++ b/drivers/vfio/vfio.c
> @@ -1370,6 +1370,60 @@ static const struct file_operations vfio_device_fops = 
> {
>  };
>  
>  /**
> + * External user API, exported by symbols to be linked dynamically.
> + */
> +
> +/* Allows an external user (for example, KVM) to lock an IOMMU group */
> +int vfio_group_add_external_user(struct file *filep)
> +{
> + struct vfio_group *group = filep->private_data;
> +
> + if (filep->f_op != &vfio_group_fops)
> + return -EINVAL;
> +
> + if (!atomic_inc_not_zero(&group->container_users))
> + return -EINVAL;

This is the place where I was suggesting we need tests to match
get_device_fd.  It's not clear what the external user is holding if the
group has no iommu or is not viable here.


if (!group->container->iommu_driver || !vfio_group_viable(group)) {
vfio_group_try_dissolve_container(group);
return -EINVAL;
}

> +
> + return 0;
> +}
> +EXPORT_SYMBOL_GPL(vfio_group_add_external_user);
> +
> +/* Allows an external user (for example, KVM) to unlock an IOMMU group */
> +void vfio_group_del_external_user(struct file *filep)
> +{
> + struct vfio_group *group = filep->private_data;
> +
> + if (WARN_ON(filep->f_op != &vfio_group_fops))
> + return;

How about we make this return int so we can return 0/-EINVAL and the
caller can decide the severity of the response?

> +
> + vfio_group_try_dissolve_container(group);
> +}
> +EXPORT_SYMBOL_GPL(vfio_group_del_external_user);
> +
> +/*
> + * Checks if a group for the specified file can be used by
> + * an external user and returns the IOMMU ID if external use is possible.
> + */
> +int vfio_group_iommu_id_from_file(struct file *filep)

Let's name this in a way that makes it clear that it's part of the
external_user API.  vfio_group_external_user_iommu_id?

> +{
> + int ret;
> + struct vfio_group *group = filep->private_data;
> +
> + if (WARN_ON(filep->f_op != &vfio_group_fops))
> + return -EINVAL;

This one probably doesn't deserve a WARN_ON either; let the caller
blow up if it wants.

> +
> + if (0 == atomic_read(&group->container_users) ||
> + !group->container->iommu_driver ||
> + !vfio_group_viable(group))
> + return -EINVAL;

The above test just becomes a weak test that the caller is correctly
using the API since we should be enforcing these tests when the external
user is added.  It doesn't hurt to leave them, but it's not very
convincing that the caller

Re: [PATCH v2] vfio: add external user support

2013-06-27 Thread Alex Williamson
On Fri, 2013-06-28 at 08:57 +1000, Alexey Kardashevskiy wrote:
> On 06/28/2013 01:44 AM, Alex Williamson wrote:
> > On Thu, 2013-06-27 at 17:14 +1000, Alexey Kardashevskiy wrote:
> >> VFIO is designed to be used via ioctls on file descriptors
> >> returned by VFIO.
> >>
> >> However in some situations support for an external user is required.
> >> The first user is KVM on PPC64 (SPAPR TCE protocol) which is going to
> >> use the existing VFIO groups for exclusive access in real/virtual mode
> >> in the host kernel to avoid passing map/unmap requests to the user
> >> space which would made things pretty slow.
> >>
> >> The proposed protocol includes:
> >>
> >> 1. do normal VFIO init stuff such as opening a new container, attaching
> >> group(s) to it, setting an IOMMU driver for a container. When IOMMU is
> >> set for a container, all groups in it are considered ready to use by
> >> an external user.
> >>
> >> 2. pass a fd of the group we want to accelerate to KVM. KVM calls
> >> vfio_group_iommu_id_from_file() to verify if the group is initialized
> >> and IOMMU is set for it. The current TCE IOMMU driver marks the whole
> >> IOMMU table as busy when IOMMU is set for a container what this prevents
> >> other DMA users from allocating from it so it is safe to pass the group
> >> to the user space.
> >>
> >> 3. KVM increases the container users counter via
> >> vfio_group_add_external_user(). This prevents the VFIO group from
> >> being disposed prior to exiting KVM.
> >>
> >> 4. When KVM is finished and doing cleanup, it releases the group file
> >> and decrements the container users counter. Everything gets released.
> >>
> >> 5. KVM also keeps the group file as otherwise its fd might have been
> >> closed at the moment of KVM finish so vfio_group_del_external_user()
> >> call will not be possible.
> > 
> > This is the wrong order in my mind.  An external user has no business
> > checking or maintaining any state of a group until it calls
> > add_external_user().  Only after that call is successful can the user
> > assume the filep to group relationship is static and get the iommu_id.
> > Any use of the "external user" API should start with "add" and end with
> > "del".
> 
> Yes, this is what I actually do, just wrong commit message, will fix.
> 
> > 
> >> The "vfio: Limit group opens" patch is also required for the consistency.
> >>
> >> Signed-off-by: Alexey Kardashevskiy 
> >> ---
> >>
> >> v1->v2: added definitions to vfio.h :)
> >> Should not compile but compiled. Hm.
> >>
> >> ---
> >>  drivers/vfio/vfio.c  |   54 
> >> ++
> >>  include/linux/vfio.h |7 +++
> >>  2 files changed, 61 insertions(+)
> >>
> >> diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
> >> index c488da5..40875d2 100644
> >> --- a/drivers/vfio/vfio.c
> >> +++ b/drivers/vfio/vfio.c
> >> @@ -1370,6 +1370,60 @@ static const struct file_operations 
> >> vfio_device_fops = {
> >>  };
> >>  
> >>  /**
> >> + * External user API, exported by symbols to be linked dynamically.
> >> + */
> >> +
> >> +/* Allows an external user (for example, KVM) to lock an IOMMU group */
> >> +int vfio_group_add_external_user(struct file *filep)
> >> +{
> >> +  struct vfio_group *group = filep->private_data;
> >> +
> >> +  if (filep->f_op != &vfio_group_fops)
> >> +  return -EINVAL;
> >> +
> >> +  if (!atomic_inc_not_zero(&group->container_users))
> >> +  return -EINVAL;
> > 
> > This is the place where I was suggesting we need tests to match
> > get_device_fd.  It's not clear what the external user is holding if the
> > group has no iommu or is not viable here.
> 
> 
> In my mind this test must include test for iommu id so I would merge it
> with vfio_group_iommu_id_from_file().

I'm not sure what that means.

> Till I check iommu id, I still cannot
> use this group so where to put check for iommu/viable does not really
> matter (for me).

The difference is that getting the group id may just be the first of
several external user API interfaces.  The idea of external user
interface is that from add->del the group is maintained in the same
state as if a device was opened.  If we disassemble t

Re: [PATCH v2] vfio: add external user support

2013-06-27 Thread Alex Williamson
On Fri, 2013-06-28 at 11:38 +1000, Alexey Kardashevskiy wrote:
> On 06/28/2013 10:41 AM, Alex Williamson wrote:
> > On Fri, 2013-06-28 at 08:57 +1000, Alexey Kardashevskiy wrote:
> >> On 06/28/2013 01:44 AM, Alex Williamson wrote:
> >>> On Thu, 2013-06-27 at 17:14 +1000, Alexey Kardashevskiy wrote:
> >>>> VFIO is designed to be used via ioctls on file descriptors
> >>>> returned by VFIO.
> >>>>
> >>>> However in some situations support for an external user is required.
> >>>> The first user is KVM on PPC64 (SPAPR TCE protocol) which is going to
> >>>> use the existing VFIO groups for exclusive access in real/virtual mode
> >>>> in the host kernel to avoid passing map/unmap requests to the user
> >>>> space which would made things pretty slow.
> >>>>
> >>>> The proposed protocol includes:
> >>>>
> >>>> 1. do normal VFIO init stuff such as opening a new container, attaching
> >>>> group(s) to it, setting an IOMMU driver for a container. When IOMMU is
> >>>> set for a container, all groups in it are considered ready to use by
> >>>> an external user.
> >>>>
> >>>> 2. pass a fd of the group we want to accelerate to KVM. KVM calls
> >>>> vfio_group_iommu_id_from_file() to verify if the group is initialized
> >>>> and IOMMU is set for it. The current TCE IOMMU driver marks the whole
> >>>> IOMMU table as busy when IOMMU is set for a container what this prevents
> >>>> other DMA users from allocating from it so it is safe to pass the group
> >>>> to the user space.
> >>>>
> >>>> 3. KVM increases the container users counter via
> >>>> vfio_group_add_external_user(). This prevents the VFIO group from
> >>>> being disposed prior to exiting KVM.
> >>>>
> >>>> 4. When KVM is finished and doing cleanup, it releases the group file
> >>>> and decrements the container users counter. Everything gets released.
> >>>>
> >>>> 5. KVM also keeps the group file as otherwise its fd might have been
> >>>> closed at the moment of KVM finish so vfio_group_del_external_user()
> >>>> call will not be possible.
> >>>
> >>> This is the wrong order in my mind.  An external user has no business
> >>> checking or maintaining any state of a group until it calls
> >>> add_external_user().  Only after that call is successful can the user
> >>> assume the filep to group relationship is static and get the iommu_id.
> >>> Any use of the "external user" API should start with "add" and end with
> >>> "del".
> >>
> >> Yes, this is what I actually do, just wrong commit message, will fix.
> >>
> >>>
> >>>> The "vfio: Limit group opens" patch is also required for the consistency.
> >>>>
> >>>> Signed-off-by: Alexey Kardashevskiy 
> >>>> ---
> >>>>
> >>>> v1->v2: added definitions to vfio.h :)
> >>>> Should not compile but compiled. Hm.
> >>>>
> >>>> ---
> >>>>  drivers/vfio/vfio.c  |   54 
> >>>> ++
> >>>>  include/linux/vfio.h |7 +++
> >>>>  2 files changed, 61 insertions(+)
> >>>>
> >>>> diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
> >>>> index c488da5..40875d2 100644
> >>>> --- a/drivers/vfio/vfio.c
> >>>> +++ b/drivers/vfio/vfio.c
> >>>> @@ -1370,6 +1370,60 @@ static const struct file_operations 
> >>>> vfio_device_fops = {
> >>>>  };
> >>>>  
> >>>>  /**
> >>>> + * External user API, exported by symbols to be linked dynamically.
> >>>> + */
> >>>> +
> >>>> +/* Allows an external user (for example, KVM) to lock an IOMMU group */
> >>>> +int vfio_group_add_external_user(struct file *filep)
> >>>> +{
> >>>> +struct vfio_group *group = filep->private_data;
> >>>> +
> >>>> +if (filep->f_op != &vfio_group_fops)
> >>>> +return -EINVAL;
> >>>> +
> >>>> +if (!atomic_inc_not_zero(&group->container_users))
> >>>> +  

Re: [PATCH 3/3 v17] iommu/fsl: Freescale PAMU driver and iommu implementation.

2013-06-28 Thread Alex Williamson
On Fri, 2013-06-28 at 13:08 +0530, Varun Sethi wrote:
> Following is a brief description of the PAMU hardware:
> PAMU determines what action to take and whether to authorize the action on
> the basis of the memory address, a Logical IO Device Number (LIODN), and
> PAACT table (logically) indexed by LIODN and address. Hardware devices which
> need to access memory must provide an LIODN in addition to the memory address.
> 
> Peripheral Access Authorization and Control Tables (PAACTs) are the primary
> data structures used by PAMU. A PAACT is a table of peripheral access
> authorization and control entries (PAACE). Each PAACE defines the range of
> I/O bus address space that is accessible by the LIOD and the associated access
> capabilities.
> 
> There are two types of PAACTs: primary PAACT (PPAACT) and secondary PAACT
> (SPAACT). A given physical I/O device may be able to act as one or more
> independent logical I/O devices (LIODs). Each such logical I/O device is
> assigned an identifier called logical I/O device number (LIODN). A LIODN is
> allocated a contiguous portion of the I/O bus address space called the DSA window
> for performing DSA operations. The DSA window may optionally be divided into
> multiple sub-windows, each of which may be used to map to a region in system
> storage space. The first sub-window is referred to as the primary sub-window
> and the remaining are called secondary sub-windows.
> 
> This patch provides the PAMU driver (fsl_pamu.c) and the corresponding IOMMU
> API implementation (fsl_pamu_domain.c). The PAMU hardware driver (fsl_pamu.c)
> has been derived from the work done by Ashish Kalra and Timur Tabi.
[snip]
> +#define REQ_ACS_FLAGS(PCI_ACS_SV | PCI_ACS_RR | PCI_ACS_CR | 
> PCI_ACS_UF)
> +
> +static struct iommu_group *get_device_iommu_group(struct device *dev)
> +{
> + struct iommu_group *group;
> +
> + group = iommu_group_get(dev);
> + if (!group)
> + group = iommu_group_alloc();
> +
> + return group;
> +}
> +
> +static  bool check_pci_ctl_endpt_part(struct pci_controller *pci_ctl)
> +{
> + u32 version;
> +
> + /* Check the PCI controller version number by readding BRR1 register */
> + version = in_be32(pci_ctl->cfg_addr + (PCI_FSL_BRR1 >> 2));
> + version &= PCI_FSL_BRR1_VER;
> + /* If PCI controller version is >= 0x204 we can partition endpoints*/
> + if (version >= 0x204)
> + return 1;
> +
> + return 0;
> +}
> +
> +/* Get iommu group information from peer devices or devices on the parent 
> bus */
> +static struct iommu_group *get_peer_pci_device_group(struct pci_dev *pdev)
> +{
> + struct pci_dev *tmp;
> + struct iommu_group *group = NULL;
> +
> + /* check if this is the first device on the bus*/
> + if (list_is_singular(&pdev->bus_list)) {
> + struct pci_bus *bus = pdev->bus->parent;
> + /* Traverese the parent bus list to get
> +  * the iommu group for devices on the
> +  * parent bus.
> +  */
> + while (bus && !group) {
> + list_for_each_entry(tmp, &bus->devices, bus_list) {
> + group = iommu_group_get(&tmp->dev);
> + if (group)
> + break;
> + }
> +
> + bus = bus->parent;
> + }
> + } else {
> + /*
> +  * Get the iommu group for the sibling device
> +  */
> + list_for_each_entry(tmp, &pdev->bus_list, bus_list) {
> + if (tmp == pdev)
> + continue;
> + group = iommu_group_get(&tmp->dev);
> + if (group)
> + break;
> + }
> + }
> +
> + return group;

Can't we handle both of these with one loop?

struct pci_bus *bus = pdev->bus;
while (bus) {
list_for_each_entry(tmp, &bus->devices, bus_list) {
if (tmp == pdev)
continue;
group = iommu_group_get(&tmp->dev);
if (group)
return group;
}
bus = bus->parent;
}
return NULL;

get_shared_pci_device_group() might be another naming option.

> +}
> +
> +static struct iommu_group *get_pci_device_group(struct pci_dev *pdev)
> +{
> + struct pci_controller *pci_ctl;
> + bool pci_endpt_partioning;
> + struct iommu_group *group = NULL;
> + struct pci_dev *bridge, *dma_pdev = NULL;
> +
> + pci_ctl = pci_bus_to_host(pdev->bus);
> + pci_endpt_partioning = check_pci_ctl_endpt_part(pci_ctl);
> + /* We can partition PCIe devices so assign device group to the device */
> + if (pci_endpt_partioning) {
> + bridge = pci_find_upstream_pcie_bridge(pdev);
> + if (bridge) {
> + if (pci_is_pcie(bridge))
> + dma_pdev = pci_get_do

Re: [PATCH 3/3 v18] iommu/fsl: Freescale PAMU driver and iommu implementation.

2013-07-01 Thread Alex Williamson
On Mon, 2013-07-01 at 15:41 +0530, Varun Sethi wrote:
> Following is a brief description of the PAMU hardware:
> PAMU determines what action to take and whether to authorize the action on
> the basis of the memory address, a Logical IO Device Number (LIODN), and
> PAACT table (logically) indexed by LIODN and address. Hardware devices which
> need to access memory must provide an LIODN in addition to the memory address.
> 
> Peripheral Access Authorization and Control Tables (PAACTs) are the primary
> data structures used by PAMU. A PAACT is a table of peripheral access
> authorization and control entries (PAACE).Each PAACE defines the range of
> I/O bus address space that is accessible by the LIOD and the associated access
> capabilities.
> 
> There are two types of PAACTs: primary PAACT (PPAACT) and secondary PAACT
> (SPAACT).A given physical I/O device may be able to act as one or more
> independent logical I/O devices (LIODs). Each such logical I/O device is
> assigned an identifier called logical I/O device number (LIODN). A LIODN is
> allocated a contiguous portion of the I/O bus address space called the DSA 
> window
> for performing DSA operations. The DSA window may optionally be divided into
> multiple sub-windows, each of which may be used to map to a region in system
> storage space. The first sub-window is referred to as the primary sub-window
> and the remaining are called secondary sub-windows.
> 
> This patch provides the PAMU driver (fsl_pamu.c) and the corresponding IOMMU
> API implementation (fsl_pamu_domain.c). The PAMU hardware driver (fsl_pamu.c)
> has been derived from the work done by Ashish Kalra and Timur Tabi.
> 
> Signed-off-by: Timur Tabi 
> Signed-off-by: Varun Sethi 
> ---

For iommu group support

Acked-by: Alex Williamson 

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH 3/8] vfio: add external user support

2013-07-08 Thread Alex Williamson
On Sun, 2013-07-07 at 01:07 +1000, Alexey Kardashevskiy wrote:
> VFIO is designed to be used via ioctls on file descriptors
> returned by VFIO.
> 
> However in some situations support for an external user is required.
> The first user is KVM on PPC64 (SPAPR TCE protocol) which is going to
> use the existing VFIO groups for exclusive access in real/virtual mode
> on a host to avoid passing map/unmap requests to the user space which
> would make things pretty slow.
> 
> The proposed protocol includes:
> 
> 1. do normal VFIO init stuff such as opening a new container, attaching
> group(s) to it, setting an IOMMU driver for a container. When IOMMU is
> set for a container, all groups in it are considered ready to use by
> an external user.
> 
> 2. pass a fd of the group we want to accelerate to KVM. KVM calls
> vfio_group_get_external_user() to verify if the group is initialized,
> IOMMU is set for it and increment the container user counter to prevent
> the VFIO group from disposal prior to KVM exit.
> The current TCE IOMMU driver marks the whole IOMMU table as busy when
> IOMMU is set for a container, which prevents other DMA users from
> allocating from it so it is safe to grant user space access to it.
> 
> 3. KVM calls vfio_external_user_iommu_id() to obtain an IOMMU ID which
> KVM uses to get an iommu_group struct for later use.
> 
> 4. When KVM is finished, it calls vfio_group_put_external_user() to
> release the VFIO group by decrementing the container user counter.
> Everything gets released.
> 
> The "vfio: Limit group opens" patch is also required for the consistency.
> 
> Signed-off-by: Alexey Kardashevskiy 
> ---
> diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
> index c488da5..57aa191 100644
> --- a/drivers/vfio/vfio.c
> +++ b/drivers/vfio/vfio.c
> @@ -1370,6 +1370,62 @@ static const struct file_operations vfio_device_fops = 
> {
>  };
>  
>  /**
> + * External user API, exported by symbols to be linked dynamically.
> + *
> + * The protocol includes:
> + *  1. do normal VFIO init operation:
> + *   - opening a new container;
> + *   - attaching group(s) to it;
> + *   - setting an IOMMU driver for a container.
> + * When IOMMU is set for a container, all groups in it are
> + * considered ready to use by an external user.
> + *
> + * 2. The user space passed a group fd which we want to accelerate in
> + * KVM. KVM uses vfio_group_get_external_user() to verify that:
> + *   - the group is initialized;
> + *   - IOMMU is set for it.
> + * Then vfio_group_get_external_user() increments the container user
> + * counter to prevent the VFIO group from disposal prior to KVM exit.
> + *
> + * 3. KVM calls vfio_external_user_iommu_id() to know an IOMMU ID which
> + * KVM uses to get an iommu_group struct for later use.
> + *
> + * 4. When KVM is finished, it calls vfio_group_put_external_user() to
> + * release the VFIO group by decrementing the container user counter.

nit, the interface is for any external user, not just kvm.

> + */
> +struct vfio_group *vfio_group_get_external_user(struct file *filep)
> +{
> + struct vfio_group *group = filep->private_data;
> +
> + if (filep->f_op != &vfio_group_fops)
> + return NULL;

ERR_PTR(-EINVAL)

There also needs to be a vfio_group_get(group) here, and a matching
vfio_group_put(group) in the error cases.

> +
> + if (!atomic_inc_not_zero(&group->container_users))
> + return NULL;

ERR_PTR(-EINVAL)

> +
> + if (!group->container->iommu_driver ||
> + !vfio_group_viable(group)) {
> + atomic_dec(&group->container_users);
> + return NULL;

ERR_PTR(-EINVAL)

> + }
> +
> + return group;
> +}
> +EXPORT_SYMBOL_GPL(vfio_group_get_external_user);
> +
> +void vfio_group_put_external_user(struct vfio_group *group)
> +{
> + vfio_group_try_dissolve_container(group);

And a vfio_group_put(group) here

> +}
> +EXPORT_SYMBOL_GPL(vfio_group_put_external_user);
> +
> +int vfio_external_user_iommu_id(struct vfio_group *group)
> +{
> + return iommu_group_id(group->iommu_group);
> +}
> +EXPORT_SYMBOL_GPL(vfio_external_user_iommu_id);
> +
> +/**
>   * Module/class support
>   */
>  static char *vfio_devnode(struct device *dev, umode_t *mode)
> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> index ac8d488..24579a0 100644
> --- a/include/linux/vfio.h
> +++ b/include/linux/vfio.h
> @@ -90,4 +90,11 @@ extern void vfio_unregister_iommu_driver(
>   TYPE tmp;   \
>   offsetof(TYPE, MEMBER) + sizeof(tmp.MEMBER); }) \
>  
> +/*
> + * External user API
> + */
> +extern struct vfio_group *vfio_group_get_external_user(struct file *filep);
> +extern void vfio_group_put_external_user(struct vfio_group *group);
> +extern int vfio_external_user_iommu_id(struct vfio_group *group);
> +
>  #endif /* VFIO_H */



___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH 3/8] vfio: add external user support

2013-07-09 Thread Alex Williamson
On Tue, 2013-07-09 at 15:40 +1000, Alexey Kardashevskiy wrote:
> On 07/09/2013 07:52 AM, Alex Williamson wrote:
> > On Sun, 2013-07-07 at 01:07 +1000, Alexey Kardashevskiy wrote:
> >> VFIO is designed to be used via ioctls on file descriptors
> >> returned by VFIO.
> >>
> >> However in some situations support for an external user is required.
> >> The first user is KVM on PPC64 (SPAPR TCE protocol) which is going to
> >> use the existing VFIO groups for exclusive access in real/virtual mode
> >> on a host to avoid passing map/unmap requests to the user space which
> >> would made things pretty slow.
> >>
> >> The proposed protocol includes:
> >>
> >> 1. do normal VFIO init stuff such as opening a new container, attaching
> >> group(s) to it, setting an IOMMU driver for a container. When IOMMU is
> >> set for a container, all groups in it are considered ready to use by
> >> an external user.
> >>
> >> 2. pass a fd of the group we want to accelerate to KVM. KVM calls
> >> vfio_group_get_external_user() to verify if the group is initialized,
> >> IOMMU is set for it and increment the container user counter to prevent
> >> the VFIO group from disposal prior to KVM exit.
> >> The current TCE IOMMU driver marks the whole IOMMU table as busy when
> >> IOMMU is set for a container what prevents other DMA users from
> >> allocating from it so it is safe to grant user space access to it.
> >>
> >> 3. KVM calls vfio_external_user_iommu_id() to obtian an IOMMU ID which
> >> KVM uses to get an iommu_group struct for later use.
> >>
> >> 4. When KVM is finished, it calls vfio_group_put_external_user() to
> >> release the VFIO group by decrementing the container user counter.
> >> Everything gets released.
> >>
> >> The "vfio: Limit group opens" patch is also required for the consistency.
> >>
> >> Signed-off-by: Alexey Kardashevskiy 
> >> ---
> >> diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
> >> index c488da5..57aa191 100644
> >> --- a/drivers/vfio/vfio.c
> >> +++ b/drivers/vfio/vfio.c
> >> @@ -1370,6 +1370,62 @@ static const struct file_operations 
> >> vfio_device_fops = {
> >>  };
> >>  
> >>  /**
> >> + * External user API, exported by symbols to be linked dynamically.
> >> + *
> >> + * The protocol includes:
> >> + *  1. do normal VFIO init operation:
> >> + *- opening a new container;
> >> + *- attaching group(s) to it;
> >> + *- setting an IOMMU driver for a container.
> >> + * When IOMMU is set for a container, all groups in it are
> >> + * considered ready to use by an external user.
> >> + *
> >> + * 2. The user space passed a group fd which we want to accelerate in
> >> + * KVM. KVM uses vfio_group_get_external_user() to verify that:
> >> + *- the group is initialized;
> >> + *- IOMMU is set for it.
> >> + * Then vfio_group_get_external_user() increments the container user
> >> + * counter to prevent the VFIO group from disposal prior to KVM exit.
> >> + *
> >> + * 3. KVM calls vfio_external_user_iommu_id() to know an IOMMU ID which
> >> + * KVM uses to get an iommu_group struct for later use.
> >> + *
> >> + * 4. When KVM is finished, it calls vfio_group_put_external_user() to
> >> + * release the VFIO group by decrementing the container user counter.
> > 
> > nit, the interface is for any external user, not just kvm.
> 
> s/KVM/An external user/ ?
> Or add "the description below uses KVM just as an example of an external 
> user"?

Give a generic API description, KVM is just an example.

> >> + */
> >> +struct vfio_group *vfio_group_get_external_user(struct file *filep)
> >> +{
> >> +  struct vfio_group *group = filep->private_data;
> >> +
> >> +  if (filep->f_op != &vfio_group_fops)
> >> +  return NULL;
> > 
> > ERR_PTR(-EINVAL)
> > 
> > There also needs to be a vfio_group_get(group) here and put in error
> > cases.
> 
> 
> Is that because I do not hold a reference to the file anymore?

We were debating whether it was needed even with the file reference
because we weren't sure that we wanted to trust the user to hold the
reference.  Since we're now passing an object, we absolutely must
increase the reference count on the object for this user.  Thanks,

Alex

> >> +
> >> +  i

Re: [PATCH 03/10] vfio: add external user support

2013-07-22 Thread Alex Williamson
On Tue, 2013-07-16 at 10:53 +1000, Alexey Kardashevskiy wrote:
> VFIO is designed to be used via ioctls on file descriptors
> returned by VFIO.
> 
> However in some situations support for an external user is required.
> The first user is KVM on PPC64 (SPAPR TCE protocol) which is going to
> use the existing VFIO groups for exclusive access in real/virtual mode
> on a host to avoid passing map/unmap requests to the user space which
> would make things pretty slow.
> 
> The protocol includes:
> 
> 1. do normal VFIO init operation:
>   - opening a new container;
>   - attaching group(s) to it;
>   - setting an IOMMU driver for a container.
> When IOMMU is set for a container, all groups in it are
> considered ready to use by an external user.
> 
> 2. User space passes a group fd to an external user.
> The external user calls vfio_group_get_external_user()
> to verify that:
>   - the group is initialized;
>   - IOMMU is set for it.
> If both checks passed, vfio_group_get_external_user()
> increments the container user counter to prevent
> the VFIO group from disposal before KVM exits.
> 
> 3. The external user calls vfio_external_user_iommu_id()
> to know an IOMMU ID. PPC64 KVM uses it to link logical bus
> number (LIOBN) with IOMMU ID.
> 
> 4. When the external KVM finishes, it calls
> vfio_group_put_external_user() to release the VFIO group.
> This call decrements the container user counter.
> Everything gets released.
> 
> The "vfio: Limit group opens" patch is also required for the consistency.
> 
> Signed-off-by: Alexey Kardashevskiy 

This looks fine to me.  Is the plan to add this through the ppc tree
again?  Thanks,

Alex

> ---
> Changes:
> 2013/07/11:
> * added vfio_group_get()/vfio_group_put()
> * protocol description changed
> 
> Signed-off-by: Alexey Kardashevskiy 
> ---
>  drivers/vfio/vfio.c  | 62 
> 
>  include/linux/vfio.h |  7 ++
>  2 files changed, 69 insertions(+)
> 
> diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
> index c488da5..58b034b 100644
> --- a/drivers/vfio/vfio.c
> +++ b/drivers/vfio/vfio.c
> @@ -1370,6 +1370,68 @@ static const struct file_operations vfio_device_fops = 
> {
>  };
>  
>  /**
> + * External user API, exported by symbols to be linked dynamically.
> + *
> + * The protocol includes:
> + *  1. do normal VFIO init operation:
> + *   - opening a new container;
> + *   - attaching group(s) to it;
> + *   - setting an IOMMU driver for a container.
> + * When IOMMU is set for a container, all groups in it are
> + * considered ready to use by an external user.
> + *
> + * 2. User space passes a group fd to an external user.
> + * The external user calls vfio_group_get_external_user()
> + * to verify that:
> + *   - the group is initialized;
> + *   - IOMMU is set for it.
> + * If both checks passed, vfio_group_get_external_user()
> + * increments the container user counter to prevent
> + * the VFIO group from disposal before KVM exits.
> + *
> + * 3. The external user calls vfio_external_user_iommu_id()
> + * to know an IOMMU ID.
> + *
> + * 4. When the external KVM finishes, it calls
> + * vfio_group_put_external_user() to release the VFIO group.
> + * This call decrements the container user counter.
> + */
> +struct vfio_group *vfio_group_get_external_user(struct file *filep)
> +{
> + struct vfio_group *group = filep->private_data;
> +
> + if (filep->f_op != &vfio_group_fops)
> + return ERR_PTR(-EINVAL);
> +
> + if (!atomic_inc_not_zero(&group->container_users))
> + return ERR_PTR(-EINVAL);
> +
> + if (!group->container->iommu_driver ||
> + !vfio_group_viable(group)) {
> + atomic_dec(&group->container_users);
> + return ERR_PTR(-EINVAL);
> + }
> +
> + vfio_group_get(group);
> +
> + return group;
> +}
> +EXPORT_SYMBOL_GPL(vfio_group_get_external_user);
> +
> +void vfio_group_put_external_user(struct vfio_group *group)
> +{
> + vfio_group_put(group);
> + vfio_group_try_dissolve_container(group);
> +}
> +EXPORT_SYMBOL_GPL(vfio_group_put_external_user);
> +
> +int vfio_external_user_iommu_id(struct vfio_group *group)
> +{
> + return iommu_group_id(group->iommu_group);
> +}
> +EXPORT_SYMBOL_GPL(vfio_external_user_iommu_id);
> +
> +/**
>   * Module/class support
>   */
>  static char *vfio_devnode(struct device *dev, umode_t *mode)
> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> index ac8d488..24579a0 100644
> --- a/include/linux/vfio.h
> +++ b/include/linux/vfio.h
> @@ -90,4 +90,11 @@ extern void vfio_unregister_iommu_driver(
>   TYPE tmp;   \
>   offsetof(TYPE, MEMBER) + sizeof(tmp.MEMBER); }) \
>  
> +/*
> + * External user API
> + */
> +extern struct vfio_group *vfio_group_get_external_user(struct file *filep);
> +extern void vfio_group_put_external_user(struct vfio_group *group);
> +extern int vfio_external_user_i

Re: [PATCH 03/10] vfio: add external user support

2013-08-05 Thread Alex Williamson
On Tue, 2013-07-23 at 19:07 +1000, Alexey Kardashevskiy wrote:
> On 07/23/2013 12:23 PM, Alex Williamson wrote:
> > On Tue, 2013-07-16 at 10:53 +1000, Alexey Kardashevskiy wrote:
> >> VFIO is designed to be used via ioctls on file descriptors
> >> returned by VFIO.
> >>
> >> However in some situations support for an external user is required.
> >> The first user is KVM on PPC64 (SPAPR TCE protocol) which is going to
> >> use the existing VFIO groups for exclusive access in real/virtual mode
> >> on a host to avoid passing map/unmap requests to the user space which
> >> would make things pretty slow.
> >>
> >> The protocol includes:
> >>
> >> 1. do normal VFIO init operation:
> >>- opening a new container;
> >>- attaching group(s) to it;
> >>- setting an IOMMU driver for a container.
> >> When IOMMU is set for a container, all groups in it are
> >> considered ready to use by an external user.
> >>
> >> 2. User space passes a group fd to an external user.
> >> The external user calls vfio_group_get_external_user()
> >> to verify that:
> >>- the group is initialized;
> >>- IOMMU is set for it.
> >> If both checks passed, vfio_group_get_external_user()
> >> increments the container user counter to prevent
> >> the VFIO group from disposal before KVM exits.
> >>
> >> 3. The external user calls vfio_external_user_iommu_id()
> >> to know an IOMMU ID. PPC64 KVM uses it to link logical bus
> >> number (LIOBN) with IOMMU ID.
> >>
> >> 4. When the external KVM finishes, it calls
> >> vfio_group_put_external_user() to release the VFIO group.
> >> This call decrements the container user counter.
> >> Everything gets released.
> >>
> >> The "vfio: Limit group opens" patch is also required for the consistency.
> >>
> >> Signed-off-by: Alexey Kardashevskiy 
> > 
> > This looks fine to me.  Is the plan to add this through the ppc tree
> > again?  Thanks,
> 
> 
> Nope, better to add this through your tree. And faster for sure :) Thanks!

Applied to my next branch for v3.12.  Thanks,

Alex


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH 2/2] powerpc/iommu: check dev->iommu_group before remove a device from iommu_group

2013-08-22 Thread Alex Williamson
On Thu, 2013-08-22 at 15:52 +0800, Wei Yang wrote:
> On Thu, Aug 22, 2013 at 05:23:34PM +1000, Alexey Kardashevskiy wrote:
> >On 08/19/2013 11:55 AM, Wei Yang wrote:
> >> On Mon, Aug 19, 2013 at 11:39:49AM +1000, Alexey Kardashevskiy wrote:
> >>> On 08/19/2013 11:29 AM, Wei Yang wrote:
>  On Fri, Aug 16, 2013 at 08:15:36PM +1000, Alexey Kardashevskiy wrote:
> > On 08/16/2013 08:08 PM, Wei Yang wrote:
> >> ---
> >>  arch/powerpc/kernel/iommu.c |3 ++-
> >>  1 files changed, 2 insertions(+), 1 deletions(-)
> >>
> >> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> >> index b20ff17..5abf7c3 100644
> >> --- a/arch/powerpc/kernel/iommu.c
> >> +++ b/arch/powerpc/kernel/iommu.c
> >> @@ -1149,7 +1149,8 @@ static int iommu_bus_notifier(struct 
> >> notifier_block *nb,
> >>case BUS_NOTIFY_ADD_DEVICE:
> >>return iommu_add_device(dev);
> >>case BUS_NOTIFY_DEL_DEVICE:
> >> -  iommu_del_device(dev);
> >> +  if (dev->iommu_group)
> >> +  iommu_del_device(dev);
> >>return 0;
> >>default:
> >>return 0;
> >>
> >
> > This one seems redundant, no?
> 
>  Sorry for the late.
> 
>  Yes, these two patches have the same purpose to guard the system, while 
>  in two
>  different places.  One is in powernv platform, the other is in the 
>  generic iommu 
>  driver.
> 
>  The one in powernv platform is used to correct the original logic.
> 
>  The one in generic iommu driver is to keep system safe in case other 
>  platform to
>  call iommu_group_remove_device() without the check.
> >>>
> >>>
> >>> But I am moving bus notifier to powernv code (posted a patch last week,
> >>> otherwise Freescale's IOMMU conflicted) so this won't be the case.
> >> 
> >> Yes, I see the patch.
> >> 
> >> This means other platforms, besides powernv, will check the 
> >> dev->iommu_group
> >> before remove the device? This would be a convention?
> >> 
> >> If this is the case, the second patch is enough. We don't need to check it 
> >> in
> >> generic iommu driver.
> >> 
> >> Since I am not very familiar with the code convention, I post these two
> >> patches together. This doesn't mean I need to push both of them. Your 
> >> comments
> >> are welcome, lets me understand which one is more suitable in this case.
> >
> >
> >Ok. So. I included the check in the bus notifier which I moved to powernv
> >platform, I guess I'll repost the series soon.
> 
> Thanks, this check will guard the powernv platform.
> 
> >
> >Good luck with pushing the fix for drivers/iommu/iommu.c :)
> >
> 
> Alex,
> 
> Sorry for not including you in the very beginning, which may spend you more
> efforts to track previous mails in this thread.
> 
> Do you think it is reasonable to check the dev->iommu_group in
> iommu_group_remove_device()? Or we can count on the bus notifier to check it?
> 
> Welcome your suggestions~

I don't really see the point of patch 1/2. iommu_group_remove_device()
is specifically to remove a device from an iommu_group, so why would you
call it on a device that's not part of an iommu_group?  If you want to
avoid testing dev->iommu_group, then implement the .remove_device
callback rather than using the notifier.  Thanks,

Alex

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH 2/2] powerpc/iommu: check dev->iommu_group before remove a device from iommu_group

2013-08-22 Thread Alex Williamson
On Thu, 2013-08-22 at 23:41 +0800, Wei Yang wrote:
> On Thu, Aug 22, 2013 at 09:28:23AM -0600, Alex Williamson wrote:
> >On Thu, 2013-08-22 at 15:52 +0800, Wei Yang wrote:
> >> On Thu, Aug 22, 2013 at 05:23:34PM +1000, Alexey Kardashevskiy wrote:
> >> >On 08/19/2013 11:55 AM, Wei Yang wrote:
> >> >> On Mon, Aug 19, 2013 at 11:39:49AM +1000, Alexey Kardashevskiy wrote:
> >> >>> On 08/19/2013 11:29 AM, Wei Yang wrote:
> >> >>>> On Fri, Aug 16, 2013 at 08:15:36PM +1000, Alexey Kardashevskiy wrote:
> >> >>>>> On 08/16/2013 08:08 PM, Wei Yang wrote:
> >> >>>>>> ---
> >> >>>>>>  arch/powerpc/kernel/iommu.c |3 ++-
> >> >>>>>>  1 files changed, 2 insertions(+), 1 deletions(-)
> >> >>>>>>
> >> >>>>>> diff --git a/arch/powerpc/kernel/iommu.c 
> >> >>>>>> b/arch/powerpc/kernel/iommu.c
> >> >>>>>> index b20ff17..5abf7c3 100644
> >> >>>>>> --- a/arch/powerpc/kernel/iommu.c
> >> >>>>>> +++ b/arch/powerpc/kernel/iommu.c
> >> >>>>>> @@ -1149,7 +1149,8 @@ static int iommu_bus_notifier(struct 
> >> >>>>>> notifier_block *nb,
> >> >>>>>> case BUS_NOTIFY_ADD_DEVICE:
> >> >>>>>> return iommu_add_device(dev);
> >> >>>>>> case BUS_NOTIFY_DEL_DEVICE:
> >> >>>>>> -   iommu_del_device(dev);
> >> >>>>>> +   if (dev->iommu_group)
> >> >>>>>> +   iommu_del_device(dev);
> >> >>>>>> return 0;
> >> >>>>>> default:
> >> >>>>>> return 0;
> >> >>>>>>
> >> >>>>>
> >> >>>>> This one seems redundant, no?
> >> >>>>
> >> >>>> Sorry for the late.
> >> >>>>
> >> >>>> Yes, these two patches have the same purpose to guard the system, 
> >> >>>> while in two
> >> >>>> different places.  One is in powernv platform, the other is in the 
> >> >>>> generic iommu 
> >> >>>> driver.
> >> >>>>
> >> >>>> The one in powernv platform is used to correct the original logic.
> >> >>>>
> >> >>>> The one in generic iommu driver is to keep system safe in case other 
> >> >>>> platform to
> >> >>>> call iommu_group_remove_device() without the check.
> >> >>>
> >> >>>
> >> >>> But I am moving bus notifier to powernv code (posted a patch last week,
> >> >>> otherwise Freescale's IOMMU conflicted) so this won't be the case.
> >> >> 
> >> >> Yes, I see the patch.
> >> >> 
> >> >> This means other platforms, besides powernv, will check the 
> >> >> dev->iommu_group
> >> >> before remove the device? This would be a convention?
> >> >> 
> >> >> If this is the case, the second patch is enough. We don't need to check 
> >> >> it in
> >> >> generic iommu driver.
> >> >> 
> >> >> Since I am not very familiar with the code convention, I post these two
> >> >> patches together. This doesn't mean I need to push both of them. Your 
> >> >> comments
> >> >> are welcome, lets me understand which one is more suitable in this case.
> >> >
> >> >
> >> >Ok. So. I included the check in the bus notifier which I moved to powernv
> >> >platform, I guess I'll repost the series soon.
> >> 
> >> Thanks, this check will guard the powernv platform.
> >> 
> >> >
> >> >Good luck with pushing the fix for drivers/iommu/iommu.c :)
> >> >
> >> 
> >> Alex,
> >> 
> >> Sorry for not including you in the very beginning, which may spend you more
> >> efforts to track previous mails in this thread.
> >> 
> >> Do you think it is reasonable to check the dev->iommu_group in
> >> iommu_group_remove_device()? Or we can count on the bus notifier to check 
> >> it?
> >> 
> >> Welcome your suggestion

Re: [PATCH] iommu: WARN_ON when removing a device with no iommu_group associated

2013-08-22 Thread Alex Williamson
[+cc iommu]

On Fri, 2013-08-23 at 09:55 +0800, Wei Yang wrote:
> When removing a device from the system, iommu_group driver will try to
> disconnect it from its group. While in some cases, one device may not be
> associated with any iommu_group, for example, when there is not enough DMA
> address space.
> 
> In the generic bus notification, it will check dev->iommu_group before calling
> iommu_group_remove_device(). While in some cases, developers may call
> iommu_group_remove_device() in a different code path and without check. For
> those devices with dev->iommu_group set to NULL, kernel will crash.
> 
> This patch gives a warning and return when trying to remove a device from an
> iommu_group with dev->iommu_group set to NULL. This helps to indicate some bad
> behavior and also guard the kernel.
> 
> Signed-off-by: Wei Yang 

Acked-by: Alex Williamson 

> ---
>  drivers/iommu/iommu.c |3 +++
>  1 files changed, 3 insertions(+), 0 deletions(-)
> 
> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
> index fbe9ca7..43396f0 100644
> --- a/drivers/iommu/iommu.c
> +++ b/drivers/iommu/iommu.c
> @@ -379,6 +379,9 @@ void iommu_group_remove_device(struct device *dev)
>   struct iommu_group *group = dev->iommu_group;
>   struct iommu_device *tmp_device, *device = NULL;
>  
> + if (WARN_ON(!group))
> + return;
> +
>   /* Pre-notify listeners that a device is being removed. */
>   blocking_notifier_call_chain(&group->notifier,
>IOMMU_GROUP_NOTIFY_DEL_DEVICE, dev);



___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH 5/7] iommu: supress loff_t compilation error on powerpc

2013-09-25 Thread Alex Williamson
On Thu, 2013-09-19 at 12:59 +0530, Bharat Bhushan wrote:
> Signed-off-by: Bharat Bhushan 
> ---
>  drivers/vfio/pci/vfio_pci_rdwr.c |3 ++-
>  1 files changed, 2 insertions(+), 1 deletions(-)
> 
> diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c 
> b/drivers/vfio/pci/vfio_pci_rdwr.c
> index 210db24..8a8156a 100644
> --- a/drivers/vfio/pci/vfio_pci_rdwr.c
> +++ b/drivers/vfio/pci/vfio_pci_rdwr.c
> @@ -181,7 +181,8 @@ ssize_t vfio_pci_vga_rw(struct vfio_pci_device *vdev, 
> char __user *buf,
>  size_t count, loff_t *ppos, bool iswrite)
>  {
>   int ret;
> - loff_t off, pos = *ppos & VFIO_PCI_OFFSET_MASK;
> + loff_t off;
> + u64 pos = (u64 )(*ppos & VFIO_PCI_OFFSET_MASK);
>   void __iomem *iomem = NULL;
>   unsigned int rsrc;
>   bool is_ioport;

What's the compile error that this fixes?

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH 2/7] iommu: add api to get iommu_domain of a device

2013-09-25 Thread Alex Williamson
On Thu, 2013-09-19 at 12:59 +0530, Bharat Bhushan wrote:
> This API returns the iommu domain to which the device is attached.
> The iommu_domain is required for making API calls related to iommu.
> Follow-up patches use this API to learn the iommu mapping.
> 
> Signed-off-by: Bharat Bhushan 
> ---
>  drivers/iommu/iommu.c |   10 ++
>  include/linux/iommu.h |7 +++
>  2 files changed, 17 insertions(+), 0 deletions(-)
> 
> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
> index fbe9ca7..6ac5f50 100644
> --- a/drivers/iommu/iommu.c
> +++ b/drivers/iommu/iommu.c
> @@ -696,6 +696,16 @@ void iommu_detach_device(struct iommu_domain *domain, 
> struct device *dev)
>  }
>  EXPORT_SYMBOL_GPL(iommu_detach_device);
>  
> +struct iommu_domain *iommu_get_dev_domain(struct device *dev)
> +{
> + struct iommu_ops *ops = dev->bus->iommu_ops;
> +
> + if (unlikely(ops == NULL || ops->get_dev_iommu_domain == NULL))
> + return NULL;
> +
> + return ops->get_dev_iommu_domain(dev);
> +}
> +EXPORT_SYMBOL_GPL(iommu_get_dev_domain);

What prevents this from racing iommu_domain_free()?  There's no
references acquired, so there's no reason for the caller to assume the
pointer is valid.

>  /*
>   * IOMMU groups are really the natrual working unit of the IOMMU, but
>   * the IOMMU API works on domains and devices.  Bridge that gap by
> diff --git a/include/linux/iommu.h b/include/linux/iommu.h
> index 7ea319e..fa046bd 100644
> --- a/include/linux/iommu.h
> +++ b/include/linux/iommu.h
> @@ -127,6 +127,7 @@ struct iommu_ops {
>   int (*domain_set_windows)(struct iommu_domain *domain, u32 w_count);
>   /* Get the numer of window per domain */
>   u32 (*domain_get_windows)(struct iommu_domain *domain);
> + struct iommu_domain *(*get_dev_iommu_domain)(struct device *dev);
>  
>   unsigned long pgsize_bitmap;
>  };
> @@ -190,6 +191,7 @@ extern int iommu_domain_window_enable(struct iommu_domain 
> *domain, u32 wnd_nr,
> phys_addr_t offset, u64 size,
> int prot);
>  extern void iommu_domain_window_disable(struct iommu_domain *domain, u32 
> wnd_nr);
> +extern struct iommu_domain *iommu_get_dev_domain(struct device *dev);
>  /**
>   * report_iommu_fault() - report about an IOMMU fault to the IOMMU framework
>   * @domain: the iommu domain where the fault has happened
> @@ -284,6 +286,11 @@ static inline void iommu_domain_window_disable(struct 
> iommu_domain *domain,
>  {
>  }
>  
> +static inline struct iommu_domain *iommu_get_dev_domain(struct device *dev)
> +{
> + return NULL;
> +}
> +
>  static inline phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, 
> dma_addr_t iova)
>  {
>   return 0;



___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH 6/7] vfio: moving some functions in common file

2013-09-25 Thread Alex Williamson
On Thu, 2013-09-19 at 12:59 +0530, Bharat Bhushan wrote:
> Some function defined in vfio_iommu_type1.c were common and
> we want to use these for FSL IOMMU (PAMU) and iommu-none driver.
> So some of them are moved to vfio_iommu_common.c
> 
> I think we can do more of that but we will take this step by step.
> 
> Signed-off-by: Bharat Bhushan 
> ---
>  drivers/vfio/Makefile|4 +-
>  drivers/vfio/vfio_iommu_common.c |  235 
> ++
>  drivers/vfio/vfio_iommu_common.h |   30 +
>  drivers/vfio/vfio_iommu_type1.c  |  206 +-
>  4 files changed, 268 insertions(+), 207 deletions(-)
>  create mode 100644 drivers/vfio/vfio_iommu_common.c
>  create mode 100644 drivers/vfio/vfio_iommu_common.h
> 
> diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
> index 72bfabc..c5792ec 100644
> --- a/drivers/vfio/Makefile
> +++ b/drivers/vfio/Makefile
> @@ -1,4 +1,4 @@
>  obj-$(CONFIG_VFIO) += vfio.o
> -obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
> -obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
> +obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_common.o vfio_iommu_type1.o
> +obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_common.o 
> vfio_iommu_spapr_tce.o
>  obj-$(CONFIG_VFIO_PCI) += pci/
> diff --git a/drivers/vfio/vfio_iommu_common.c 
> b/drivers/vfio/vfio_iommu_common.c
> new file mode 100644
> index 000..8bdc0ea
> --- /dev/null
> +++ b/drivers/vfio/vfio_iommu_common.c
> @@ -0,0 +1,235 @@
> +/*
> + * VFIO: Common code for vfio IOMMU support
> + *
> + * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
> + * Author: Alex Williamson 
> + * Author: Bharat Bhushan 
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + *
> + * Derived from original vfio:
> + * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
> + * Author: Tom Lyon, p...@cisco.com
> + */
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include/* pci_bus_type */
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 

Please cleanup includes on both the source and target files.  You
obviously don't need linux/pci.h here for one.

> +
> +static bool disable_hugepages;
> +module_param_named(disable_hugepages,
> +disable_hugepages, bool, S_IRUGO | S_IWUSR);
> +MODULE_PARM_DESC(disable_hugepages,
> +  "Disable VFIO IOMMU support for IOMMU hugepages.");
> +
> +struct vwork {
> + struct mm_struct*mm;
> + longnpage;
> + struct work_struct  work;
> +};
> +
> +/* delayed decrement/increment for locked_vm */
> +void vfio_lock_acct_bg(struct work_struct *work)
> +{
> + struct vwork *vwork = container_of(work, struct vwork, work);
> + struct mm_struct *mm;
> +
> + mm = vwork->mm;
> + down_write(&mm->mmap_sem);
> + mm->locked_vm += vwork->npage;
> + up_write(&mm->mmap_sem);
> + mmput(mm);
> + kfree(vwork);
> +}
> +
> +void vfio_lock_acct(long npage)
> +{
> + struct vwork *vwork;
> + struct mm_struct *mm;
> +
> + if (!current->mm || !npage)
> + return; /* process exited or nothing to do */
> +
> + if (down_write_trylock(&current->mm->mmap_sem)) {
> + current->mm->locked_vm += npage;
> + up_write(&current->mm->mmap_sem);
> + return;
> + }
> +
> + /*
> +  * Couldn't get mmap_sem lock, so must setup to update
> +  * mm->locked_vm later. If locked_vm were atomic, we
> +  * wouldn't need this silliness
> +  */
> + vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
> + if (!vwork)
> + return;
> + mm = get_task_mm(current);
> + if (!mm) {
> + kfree(vwork);
> + return;
> + }
> + INIT_WORK(&vwork->work, vfio_lock_acct_bg);
> + vwork->mm = mm;
> + vwork->npage = npage;
> + schedule_work(&vwork->work);
> +}
> +
> +/*
> + * Some mappings aren't backed by a struct page, for example an mmap'd
> + * MMIO range for our own or another device.  These use a different
> + * pfn conversion and shouldn't be tracked as locked pages.
> + */
> +bool is_invalid_reserved_pfn(unsigned long pfn)
> +{
> + if (pfn_valid(pfn)) {
> + bool reserved;
> + struct page *tail = pfn_

Re: [PATCH 7/7] vfio pci: Add vfio iommu implementation for FSL_PAMU

2013-09-25 Thread Alex Williamson
On Thu, 2013-09-19 at 12:59 +0530, Bharat Bhushan wrote:
> This patch adds vfio iommu support for Freescale IOMMU
> (PAMU - Peripheral Access Management Unit).
> 
> The Freescale PAMU is an aperture-based IOMMU with the following
> characteristics.  Each device has an entry in a table in memory
> describing the iova->phys mapping. The mapping has:
>   -an overall aperture that is power of 2 sized, and has a start iova that
>is naturally aligned
>   -has 1 or more windows within the aperture
>   -number of windows must be power of 2, max is 256
>   -size of each window is determined by aperture size / # of windows
>   -iova of each window is determined by aperture start iova / # of windows
>   -the mapped region in each window can be different than
>the window size...mapping must power of 2
>   -physical address of the mapping must be naturally aligned
>with the mapping size
> 
> Some of the code is derived from TYPE1 iommu (driver/vfio/vfio_iommu_type1.c).
> 
> Signed-off-by: Bharat Bhushan 
> ---
>  drivers/vfio/Kconfig   |6 +
>  drivers/vfio/Makefile  |1 +
>  drivers/vfio/vfio_iommu_fsl_pamu.c |  952 
> 
>  include/uapi/linux/vfio.h  |  100 
>  4 files changed, 1059 insertions(+), 0 deletions(-)
>  create mode 100644 drivers/vfio/vfio_iommu_fsl_pamu.c
> 
> diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
> index 26b3d9d..7d1da26 100644
> --- a/drivers/vfio/Kconfig
> +++ b/drivers/vfio/Kconfig
> @@ -8,11 +8,17 @@ config VFIO_IOMMU_SPAPR_TCE
>   depends on VFIO && SPAPR_TCE_IOMMU
>   default n
>  
> +config VFIO_IOMMU_FSL_PAMU
> + tristate
> + depends on VFIO
> + default n
> +
>  menuconfig VFIO
>   tristate "VFIO Non-Privileged userspace driver framework"
>   depends on IOMMU_API
>   select VFIO_IOMMU_TYPE1 if X86
>   select VFIO_IOMMU_SPAPR_TCE if (PPC_POWERNV || PPC_PSERIES)
> + select VFIO_IOMMU_FSL_PAMU if FSL_PAMU
>   help
> VFIO provides a framework for secure userspace device drivers.
> See Documentation/vfio.txt for more details.
> diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
> index c5792ec..7461350 100644
> --- a/drivers/vfio/Makefile
> +++ b/drivers/vfio/Makefile
> @@ -1,4 +1,5 @@
>  obj-$(CONFIG_VFIO) += vfio.o
>  obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_common.o vfio_iommu_type1.o
>  obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_common.o 
> vfio_iommu_spapr_tce.o
> +obj-$(CONFIG_VFIO_IOMMU_FSL_PAMU) += vfio_iommu_common.o 
> vfio_iommu_fsl_pamu.o
>  obj-$(CONFIG_VFIO_PCI) += pci/
> diff --git a/drivers/vfio/vfio_iommu_fsl_pamu.c 
> b/drivers/vfio/vfio_iommu_fsl_pamu.c
> new file mode 100644
> index 000..b29365f
> --- /dev/null
> +++ b/drivers/vfio/vfio_iommu_fsl_pamu.c
> @@ -0,0 +1,952 @@
> +/*
> + * VFIO: IOMMU DMA mapping support for FSL PAMU IOMMU
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License, version 2, as
> + * published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
> + *
> + * Copyright (C) 2013 Freescale Semiconductor, Inc.
> + *
> + * Author: Bharat Bhushan 
> + *
> + * This file is derived from driver/vfio/vfio_iommu_type1.c
> + *
> + * The Freescale PAMU is an aperture-based IOMMU with the following
> + * characteristics.  Each device has an entry in a table in memory
> + * describing the iova->phys mapping. The mapping has:
> + *  -an overall aperture that is power of 2 sized, and has a start iova that
> + *   is naturally aligned
> + *  -has 1 or more windows within the aperture
> + * -number of windows must be power of 2, max is 256
> + * -size of each window is determined by aperture size / # of windows
> + * -iova of each window is determined by aperture start iova / # of 
> windows
> + * -the mapped region in each window can be different than
> + *  the window size...mapping must power of 2
> + * -physical address of the mapping must be naturally aligned
> + *  with the mapping size
> + */
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include/* pci_bus_type */
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +#include "vfio_iommu_common.h"
> +
> +#define DRIVER_VERSION  "0.1"
> +#define DRIVER_AUTHOR   "Bharat Bhushan "
> +#define DRIVER_DESC "FSL PAMU IOMMU driver for VFIO"
> +
> +struc

Re: [PATCH 2/7] iommu: add api to get iommu_domain of a device

2013-10-04 Thread Alex Williamson
On Fri, 2013-10-04 at 09:54 +, Bhushan Bharat-R65777 wrote:
> 
> > -Original Message-
> > From: linux-pci-ow...@vger.kernel.org 
> > [mailto:linux-pci-ow...@vger.kernel.org]
> > On Behalf Of Alex Williamson
> > Sent: Wednesday, September 25, 2013 10:16 PM
> > To: Bhushan Bharat-R65777
> > Cc: j...@8bytes.org; b...@kernel.crashing.org; ga...@kernel.crashing.org; 
> > linux-
> > ker...@vger.kernel.org; linuxppc-dev@lists.ozlabs.org; linux-
> > p...@vger.kernel.org; ag...@suse.de; Wood Scott-B07421; iommu@lists.linux-
> > foundation.org; Bhushan Bharat-R65777
> > Subject: Re: [PATCH 2/7] iommu: add api to get iommu_domain of a device
> > 
> > On Thu, 2013-09-19 at 12:59 +0530, Bharat Bhushan wrote:
> > > This API returns the iommu domain to which the device is attached.
> > > The iommu_domain is required for making API calls related to iommu.
> > > Follow-up patches use this API to learn the iommu mapping.
> > >
> > > Signed-off-by: Bharat Bhushan 
> > > ---
> > >  drivers/iommu/iommu.c |   10 ++
> > >  include/linux/iommu.h |7 +++
> > >  2 files changed, 17 insertions(+), 0 deletions(-)
> > >
> > > diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index
> > > fbe9ca7..6ac5f50 100644
> > > --- a/drivers/iommu/iommu.c
> > > +++ b/drivers/iommu/iommu.c
> > > @@ -696,6 +696,16 @@ void iommu_detach_device(struct iommu_domain
> > > *domain, struct device *dev)  }
> > > EXPORT_SYMBOL_GPL(iommu_detach_device);
> > >
> > > +struct iommu_domain *iommu_get_dev_domain(struct device *dev) {
> > > + struct iommu_ops *ops = dev->bus->iommu_ops;
> > > +
> > > + if (unlikely(ops == NULL || ops->get_dev_iommu_domain == NULL))
> > > + return NULL;
> > > +
> > > + return ops->get_dev_iommu_domain(dev); }
> > > +EXPORT_SYMBOL_GPL(iommu_get_dev_domain);
> > 
> > What prevents this from racing iommu_domain_free()?  There's no references
> > acquired, so there's no reason for the caller to assume the pointer is 
> > valid.
> 
> Sorry for late query, somehow this email went into a folder and escaped;
> 
> Just to be sure, there is not lock at generic "struct iommu_domain", but IP 
> specific structure (link FSL domain) linked in iommu_domain->priv have a 
> lock, so we need to ensure this race in FSL iommu code (say 
> drivers/iommu/fsl_pamu_domain.c), right?

No, it's not sufficient to make sure that your use of the interface is
race free.  The interface itself needs to be designed so that it's
difficult to use incorrectly.  That's not the case here.  This is a
backdoor to get the iommu domain from the iommu driver regardless of who
is using it or how.  The iommu domain is created and managed by vfio, so
shouldn't we be looking at how to do this through vfio?  It seems like
you'd want to use your device to get a vfio group reference, from which
you could do something with the vfio external user interface and get the
iommu domain reference.  Thanks,

Alex
 
> > >  /*
> > >   * IOMMU groups are really the natrual working unit of the IOMMU, but
> > >   * the IOMMU API works on domains and devices.  Bridge that gap by
> > > diff --git a/include/linux/iommu.h b/include/linux/iommu.h index
> > > 7ea319e..fa046bd 100644
> > > --- a/include/linux/iommu.h
> > > +++ b/include/linux/iommu.h
> > > @@ -127,6 +127,7 @@ struct iommu_ops {
> > >   int (*domain_set_windows)(struct iommu_domain *domain, u32 w_count);
> > >   /* Get the numer of window per domain */
> > >   u32 (*domain_get_windows)(struct iommu_domain *domain);
> > > + struct iommu_domain *(*get_dev_iommu_domain)(struct device *dev);
> > >
> > >   unsigned long pgsize_bitmap;
> > >  };
> > > @@ -190,6 +191,7 @@ extern int iommu_domain_window_enable(struct 
> > > iommu_domain
> > *domain, u32 wnd_nr,
> > > phys_addr_t offset, u64 size,
> > > int prot);
> > >  extern void iommu_domain_window_disable(struct iommu_domain *domain,
> > > u32 wnd_nr);
> > > +extern struct iommu_domain *iommu_get_dev_domain(struct device *dev);
> > >  /**
> > >   * report_iommu_fault() - report about an IOMMU fault to the IOMMU 
> > > framework
> > >   * @domain: the iommu domain where the fault has happened @@ -284,6
> > > +286,11 @@ static inline void iommu_domain_window_disable(struct
> > > iommu_domain *domain,  {  }
> > >
> > > +static inline struct iommu_domain *iommu_get_dev_domain(struct device
> > > +*dev) {
> > > + return NULL;
> > > +}
> > > +
> > >  static inline phys_addr_t iommu_iova_to_phys(struct iommu_domain
> > > *domain, dma_addr_t iova)  {
> > >   return 0;
> > 
> > 
> > 
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-pci" in the 
> > body
> > of a message to majord...@vger.kernel.org More majordomo info at
> > http://vger.kernel.org/majordomo-info.html
> 



___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [RFC PATCH] PPC: KVM: vfio kvm device: support spapr tce

2013-10-04 Thread Alex Williamson
On Fri, 2013-10-04 at 22:24 +1000, Alexey Kardashevskiy wrote:
> This is a very rough change set required for ppc64 to use this KVM device.
> 
> vfio_rm.c is a piece of code which is going to be called from the realmode 
> (MMU off),
> and I will put everything spapr-related under #ifdef CONFIG_SPAPR_TCE_IOMMU,
> it is just friday and I have to run :)
> 
> This is an RFC but it works.
> 
> Signed-off-by: Alexey Kardashevskiy 
> ---
>  arch/powerpc/kvm/Kconfig  |  1 +
>  arch/powerpc/kvm/Makefile |  4 
>  include/linux/kvm_host.h  |  8 ---
>  include/linux/vfio.h  |  3 +++
>  include/uapi/linux/kvm.h  |  1 +
>  virt/kvm/vfio.c   | 46 
>  virt/kvm/vfio_rm.c| 54 
> +++
>  7 files changed, 114 insertions(+), 3 deletions(-)
>  create mode 100644 virt/kvm/vfio_rm.c
> 
> diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
> index 61b3535..d1b7f64 100644
> --- a/arch/powerpc/kvm/Kconfig
> +++ b/arch/powerpc/kvm/Kconfig
> @@ -60,6 +60,7 @@ config KVM_BOOK3S_64
>   select KVM_BOOK3S_64_HANDLER
>   select KVM
>   select SPAPR_TCE_IOMMU
> + select KVM_VFIO
>   ---help---
> Support running unmodified book3s_64 and book3s_32 guest kernels
> in virtual machines on book3s_64 host processors.
> diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
> index 6646c95..fc2878b 100644
> --- a/arch/powerpc/kvm/Makefile
> +++ b/arch/powerpc/kvm/Makefile
> @@ -55,6 +55,8 @@ kvm-objs-$(CONFIG_KVM_E500MC) := $(kvm-e500mc-objs)
>  
>  kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_PR) := \
>   $(KVM)/coalesced_mmio.o \
> + $(KVM)/vfio.o \
> + $(KVM)/vfio_rm.o \
>   fpu.o \
>   book3s_paired_singles.o \
>   book3s_pr.o \
> @@ -76,6 +78,7 @@ kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \
>  kvm-book3s_64-builtin-xics-objs-$(CONFIG_KVM_XICS) := \
>   book3s_hv_rm_xics.o
>  kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \
> + $(KVM)/vfio_rm.o \
>   book3s_hv_rmhandlers.o \
>   book3s_hv_rm_mmu.o \
>   book3s_64_vio_hv.o \
> @@ -89,6 +92,7 @@ kvm-book3s_64-objs-$(CONFIG_KVM_XICS) += \
>  
>  kvm-book3s_64-module-objs := \
>   $(KVM)/kvm_main.o \
> + $(KVM)/vfio.o \
>   $(KVM)/eventfd.o \
>   powerpc.o \
>   emulate.o \
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index ad2b581..43c0290 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -407,6 +407,8 @@ struct kvm {
>  #endif
>   long tlbs_dirty;
>   struct list_head devices;
> +
> + struct kvm_vfio *vfio;


Can't this be on kvm->arch?

>  };
>  
>  #define kvm_err(fmt, ...) \
> @@ -677,15 +679,15 @@ void kvm_arch_register_noncoherent_dma(struct kvm *kvm);
>  void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm);
>  bool kvm_arch_has_noncoherent_dma(struct kvm *kvm);
>  #else
> -static inline void kvm_arch_register_noncoherent_dma(void)
> +static inline void kvm_arch_register_noncoherent_dma(struct kvm *kvm)
>  {
>  }
>  
> -static inline void kvm_arch_unregister_noncoherent_dma(void)
> +static inline void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm)
>  {
>  }
>  
> -static inline bool kvm_arch_has_noncoherent_dma(void)
> +static inline bool kvm_arch_has_noncoherent_dma(struct kvm *kvm)
>  {
>   return false;
>  }

Will fix in my series.

> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> index 24579a0..681e19b 100644
> --- a/include/linux/vfio.h
> +++ b/include/linux/vfio.h
> @@ -97,4 +97,7 @@ extern struct vfio_group 
> *vfio_group_get_external_user(struct file *filep);
>  extern void vfio_group_put_external_user(struct vfio_group *group);
>  extern int vfio_external_user_iommu_id(struct vfio_group *group);
>  
> +extern struct iommu_group *vfio_find_group_by_liobn(struct kvm *kvm,
> + unsigned long liobn);
> +

Wrong header file.

>  #endif /* VFIO_H */
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 7c1a349..a74ad16 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -847,6 +847,7 @@ struct kvm_device_attr {
>  #define  KVM_DEV_VFIO_GROUP  1
>  #define   KVM_DEV_VFIO_GROUP_ADD 1
>  #define   KVM_DEV_VFIO_GROUP_DEL 2
> +#define  KVM_DEV_VFIO_SPAPR_TCE_LIOBN2
>  
>  /*
>   * ioctls for VM fds
> diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c
> index 2e336a7..39dea9f 100644
> --- a/virt/kvm/vfio.c
> +++ b/virt/kvm/vfio.c
> @@ -22,6 +22,7 @@
>  struct kvm_vfio_group {
>   struct list_head node;
>   struct vfio_group *vfio_group;
> + uint64_t liobn; /* sPAPR */

Perhaps an arch pointer or at least a union.

>  };
>  
>  struct kvm_vfio {
> @@ -188,12 +189,52 @@ static int kvm_vfio_set_group(struct kvm_device *dev, 
> long attr, u64 arg)
>   return -ENXIO;
>  }
>  
> +static int kvm_vfio_set_spapr_tce_liobn(struc

Re: [PATCH 2/7] iommu: add api to get iommu_domain of a device

2013-10-04 Thread Alex Williamson
On Fri, 2013-10-04 at 16:47 +0000, Bhushan Bharat-R65777 wrote:
> 
> > -Original Message-
> > From: Alex Williamson [mailto:alex.william...@redhat.com]
> > Sent: Friday, October 04, 2013 9:15 PM
> > To: Bhushan Bharat-R65777
> > Cc: j...@8bytes.org; b...@kernel.crashing.org; ga...@kernel.crashing.org; 
> > linux-
> > ker...@vger.kernel.org; linuxppc-dev@lists.ozlabs.org; linux-
> > p...@vger.kernel.org; ag...@suse.de; Wood Scott-B07421; iommu@lists.linux-
> > foundation.org
> > Subject: Re: [PATCH 2/7] iommu: add api to get iommu_domain of a device
> > 
> > On Fri, 2013-10-04 at 09:54 +, Bhushan Bharat-R65777 wrote:
> > >
> > > > -Original Message-
> > > > From: linux-pci-ow...@vger.kernel.org
> > > > [mailto:linux-pci-ow...@vger.kernel.org]
> > > > On Behalf Of Alex Williamson
> > > > Sent: Wednesday, September 25, 2013 10:16 PM
> > > > To: Bhushan Bharat-R65777
> > > > Cc: j...@8bytes.org; b...@kernel.crashing.org;
> > > > ga...@kernel.crashing.org; linux- ker...@vger.kernel.org;
> > > > linuxppc-dev@lists.ozlabs.org; linux- p...@vger.kernel.org;
> > > > ag...@suse.de; Wood Scott-B07421; iommu@lists.linux- foundation.org;
> > > > Bhushan Bharat-R65777
> > > > Subject: Re: [PATCH 2/7] iommu: add api to get iommu_domain of a
> > > > device
> > > >
> > > > On Thu, 2013-09-19 at 12:59 +0530, Bharat Bhushan wrote:
> > > > > This api return the iommu domain to which the device is attached.
> > > > > The iommu_domain is required for making API calls related to iommu.
> > > > > Follow up patches which use this API to know iommu maping.
> > > > >
> > > > > Signed-off-by: Bharat Bhushan 
> > > > > ---
> > > > >  drivers/iommu/iommu.c |   10 ++
> > > > >  include/linux/iommu.h |7 +++
> > > > >  2 files changed, 17 insertions(+), 0 deletions(-)
> > > > >
> > > > > diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index
> > > > > fbe9ca7..6ac5f50 100644
> > > > > --- a/drivers/iommu/iommu.c
> > > > > +++ b/drivers/iommu/iommu.c
> > > > > @@ -696,6 +696,16 @@ void iommu_detach_device(struct iommu_domain
> > > > > *domain, struct device *dev)  }
> > > > > EXPORT_SYMBOL_GPL(iommu_detach_device);
> > > > >
> > > > > +struct iommu_domain *iommu_get_dev_domain(struct device *dev) {
> > > > > + struct iommu_ops *ops = dev->bus->iommu_ops;
> > > > > +
> > > > > + if (unlikely(ops == NULL || ops->get_dev_iommu_domain == NULL))
> > > > > + return NULL;
> > > > > +
> > > > > + return ops->get_dev_iommu_domain(dev); }
> > > > > +EXPORT_SYMBOL_GPL(iommu_get_dev_domain);
> > > >
> > > > What prevents this from racing iommu_domain_free()?  There's no
> > > > references acquired, so there's no reason for the caller to assume the
> > pointer is valid.
> > >
> > > Sorry for late query, somehow this email went into a folder and
> > > escaped;
> > >
> > > Just to be sure, there is not lock at generic "struct iommu_domain", but 
> > > IP
> > specific structure (link FSL domain) linked in iommu_domain->priv have a 
> > lock,
> > so we need to ensure this race in FSL iommu code (say
> > drivers/iommu/fsl_pamu_domain.c), right?
> > 
> > No, it's not sufficient to make sure that your use of the interface is race
> > free.  The interface itself needs to be designed so that it's difficult to 
> > use
> > incorrectly.
> 
> So we can define iommu_get_dev_domain()/iommu_put_dev_domain();
> iommu_get_dev_domain() will return domain with the lock held, and
> iommu_put_dev_domain() will release the lock? And
> iommu_get_dev_domain() must always be followed by
> iommu_put_dev_domain().

What lock?  get/put are generally used for reference counting, not
locking in the kernel.

> > That's not the case here.  This is a backdoor to get the iommu
> > domain from the iommu driver regardless of who is using it or how.  The 
> > iommu
> > domain is created and managed by vfio, so shouldn't we be looking at how to 
> > do
> > this through vfio?
> 
> Let me first describe what we are doing here:
> During initialization:-
>  - vfio talks to MSI system to know the MSI-page and size

Re: [PATCH 2/7] iommu: add api to get iommu_domain of a device

2013-10-04 Thread Alex Williamson
On Fri, 2013-10-04 at 17:23 +0000, Bhushan Bharat-R65777 wrote:
> 
> > -Original Message-
> > From: Alex Williamson [mailto:alex.william...@redhat.com]
> > Sent: Friday, October 04, 2013 10:43 PM
> > To: Bhushan Bharat-R65777
> > Cc: j...@8bytes.org; b...@kernel.crashing.org; ga...@kernel.crashing.org; 
> > linux-
> > ker...@vger.kernel.org; linuxppc-dev@lists.ozlabs.org; linux-
> > p...@vger.kernel.org; ag...@suse.de; Wood Scott-B07421; iommu@lists.linux-
> > foundation.org
> > Subject: Re: [PATCH 2/7] iommu: add api to get iommu_domain of a device
> > 
> > On Fri, 2013-10-04 at 16:47 +, Bhushan Bharat-R65777 wrote:
> > >
> > > > -Original Message-
> > > > From: Alex Williamson [mailto:alex.william...@redhat.com]
> > > > Sent: Friday, October 04, 2013 9:15 PM
> > > > To: Bhushan Bharat-R65777
> > > > Cc: j...@8bytes.org; b...@kernel.crashing.org;
> > > > ga...@kernel.crashing.org; linux- ker...@vger.kernel.org;
> > > > linuxppc-dev@lists.ozlabs.org; linux- p...@vger.kernel.org;
> > > > ag...@suse.de; Wood Scott-B07421; iommu@lists.linux- foundation.org
> > > > Subject: Re: [PATCH 2/7] iommu: add api to get iommu_domain of a
> > > > device
> > > >
> > > > On Fri, 2013-10-04 at 09:54 +, Bhushan Bharat-R65777 wrote:
> > > > >
> > > > > > -Original Message-
> > > > > > From: linux-pci-ow...@vger.kernel.org
> > > > > > [mailto:linux-pci-ow...@vger.kernel.org]
> > > > > > On Behalf Of Alex Williamson
> > > > > > Sent: Wednesday, September 25, 2013 10:16 PM
> > > > > > To: Bhushan Bharat-R65777
> > > > > > Cc: j...@8bytes.org; b...@kernel.crashing.org;
> > > > > > ga...@kernel.crashing.org; linux- ker...@vger.kernel.org;
> > > > > > linuxppc-dev@lists.ozlabs.org; linux- p...@vger.kernel.org;
> > > > > > ag...@suse.de; Wood Scott-B07421; iommu@lists.linux-
> > > > > > foundation.org; Bhushan Bharat-R65777
> > > > > > Subject: Re: [PATCH 2/7] iommu: add api to get iommu_domain of a
> > > > > > device
> > > > > >
> > > > > > On Thu, 2013-09-19 at 12:59 +0530, Bharat Bhushan wrote:
> > > > > > > This api return the iommu domain to which the device is attached.
> > > > > > > The iommu_domain is required for making API calls related to 
> > > > > > > iommu.
> > > > > > > Follow up patches which use this API to know iommu maping.
> > > > > > >
> > > > > > > Signed-off-by: Bharat Bhushan 
> > > > > > > ---
> > > > > > >  drivers/iommu/iommu.c |   10 ++
> > > > > > >  include/linux/iommu.h |7 +++
> > > > > > >  2 files changed, 17 insertions(+), 0 deletions(-)
> > > > > > >
> > > > > > > diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
> > > > > > > index
> > > > > > > fbe9ca7..6ac5f50 100644
> > > > > > > --- a/drivers/iommu/iommu.c
> > > > > > > +++ b/drivers/iommu/iommu.c
> > > > > > > @@ -696,6 +696,16 @@ void iommu_detach_device(struct
> > > > > > > iommu_domain *domain, struct device *dev)  }
> > > > > > > EXPORT_SYMBOL_GPL(iommu_detach_device);
> > > > > > >
> > > > > > > +struct iommu_domain *iommu_get_dev_domain(struct device *dev) {
> > > > > > > + struct iommu_ops *ops = dev->bus->iommu_ops;
> > > > > > > +
> > > > > > > + if (unlikely(ops == NULL || ops->get_dev_iommu_domain == NULL))
> > > > > > > + return NULL;
> > > > > > > +
> > > > > > > + return ops->get_dev_iommu_domain(dev); }
> > > > > > > +EXPORT_SYMBOL_GPL(iommu_get_dev_domain);
> > > > > >
> > > > > > What prevents this from racing iommu_domain_free()?  There's no
> > > > > > references acquired, so there's no reason for the caller to
> > > > > > assume the
> > > > pointer is valid.
> > > > >
> > > > > Sorry for late query, somehow this email went into a folder and
> > > > > escaped;
> > > > >
> > > > > Just to be sure, ther

Re: [PATCH 2/7] iommu: add api to get iommu_domain of a device

2013-10-07 Thread Alex Williamson
On Mon, 2013-10-07 at 05:46 +0000, Bhushan Bharat-R65777 wrote:
> 
> > -Original Message-
> > From: Alex Williamson [mailto:alex.william...@redhat.com]
> > Sent: Friday, October 04, 2013 11:42 PM
> > To: Bhushan Bharat-R65777
> > Cc: j...@8bytes.org; b...@kernel.crashing.org; ga...@kernel.crashing.org; 
> > linux-
> > ker...@vger.kernel.org; linuxppc-dev@lists.ozlabs.org; linux-
> > p...@vger.kernel.org; ag...@suse.de; Wood Scott-B07421; iommu@lists.linux-
> > foundation.org
> > Subject: Re: [PATCH 2/7] iommu: add api to get iommu_domain of a device
> > 
> > On Fri, 2013-10-04 at 17:23 +, Bhushan Bharat-R65777 wrote:
> > >
> > > > -Original Message-
> > > > From: Alex Williamson [mailto:alex.william...@redhat.com]
> > > > Sent: Friday, October 04, 2013 10:43 PM
> > > > To: Bhushan Bharat-R65777
> > > > Cc: j...@8bytes.org; b...@kernel.crashing.org;
> > > > ga...@kernel.crashing.org; linux- ker...@vger.kernel.org;
> > > > linuxppc-dev@lists.ozlabs.org; linux- p...@vger.kernel.org;
> > > > ag...@suse.de; Wood Scott-B07421; iommu@lists.linux- foundation.org
> > > > Subject: Re: [PATCH 2/7] iommu: add api to get iommu_domain of a
> > > > device
> > > >
> > > > On Fri, 2013-10-04 at 16:47 +, Bhushan Bharat-R65777 wrote:
> > > > >
> > > > > > -Original Message-
> > > > > > From: Alex Williamson [mailto:alex.william...@redhat.com]
> > > > > > Sent: Friday, October 04, 2013 9:15 PM
> > > > > > To: Bhushan Bharat-R65777
> > > > > > Cc: j...@8bytes.org; b...@kernel.crashing.org;
> > > > > > ga...@kernel.crashing.org; linux- ker...@vger.kernel.org;
> > > > > > linuxppc-dev@lists.ozlabs.org; linux- p...@vger.kernel.org;
> > > > > > ag...@suse.de; Wood Scott-B07421; iommu@lists.linux-
> > > > > > foundation.org
> > > > > > Subject: Re: [PATCH 2/7] iommu: add api to get iommu_domain of a
> > > > > > device
> > > > > >
> > > > > > On Fri, 2013-10-04 at 09:54 +, Bhushan Bharat-R65777 wrote:
> > > > > > >
> > > > > > > > -Original Message-
> > > > > > > > From: linux-pci-ow...@vger.kernel.org
> > > > > > > > [mailto:linux-pci-ow...@vger.kernel.org]
> > > > > > > > On Behalf Of Alex Williamson
> > > > > > > > Sent: Wednesday, September 25, 2013 10:16 PM
> > > > > > > > To: Bhushan Bharat-R65777
> > > > > > > > Cc: j...@8bytes.org; b...@kernel.crashing.org;
> > > > > > > > ga...@kernel.crashing.org; linux- ker...@vger.kernel.org;
> > > > > > > > linuxppc-dev@lists.ozlabs.org; linux- p...@vger.kernel.org;
> > > > > > > > ag...@suse.de; Wood Scott-B07421; iommu@lists.linux-
> > > > > > > > foundation.org; Bhushan Bharat-R65777
> > > > > > > > Subject: Re: [PATCH 2/7] iommu: add api to get iommu_domain
> > > > > > > > of a device
> > > > > > > >
> > > > > > > > On Thu, 2013-09-19 at 12:59 +0530, Bharat Bhushan wrote:
> > > > > > > > > This api return the iommu domain to which the device is 
> > > > > > > > > attached.
> > > > > > > > > The iommu_domain is required for making API calls related to
> > iommu.
> > > > > > > > > Follow up patches which use this API to know iommu maping.
> > > > > > > > >
> > > > > > > > > Signed-off-by: Bharat Bhushan
> > > > > > > > > 
> > > > > > > > > ---
> > > > > > > > >  drivers/iommu/iommu.c |   10 ++
> > > > > > > > >  include/linux/iommu.h |7 +++
> > > > > > > > >  2 files changed, 17 insertions(+), 0 deletions(-)
> > > > > > > > >
> > > > > > > > > diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
> > > > > > > > > index
> > > > > > > > > fbe9ca7..6ac5f50 100644
> > > > > > > > > --- a/drivers/iommu/iommu.c
> > > > > > > > > +++ b/drivers/iommu/iommu.c
> > > > > > > > > @@ -696,6 +696,16 @@ vo

Re: [PATCH 2/7] iommu: add api to get iommu_domain of a device

2013-10-10 Thread Alex Williamson
On Thu, 2013-10-10 at 20:09 +0000, Sethi Varun-B16395 wrote:
> 
> > -Original Message-
> > From: iommu-boun...@lists.linux-foundation.org [mailto:iommu-
> > boun...@lists.linux-foundation.org] On Behalf Of Alex Williamson
> > Sent: Tuesday, October 08, 2013 8:43 AM
> > To: Bhushan Bharat-R65777
> > Cc: ag...@suse.de; Wood Scott-B07421; linux-...@vger.kernel.org;
> > ga...@kernel.crashing.org; linux-ker...@vger.kernel.org;
> > io...@lists.linux-foundation.org; b...@kernel.crashing.org; linuxppc-
> > d...@lists.ozlabs.org
> > Subject: Re: [PATCH 2/7] iommu: add api to get iommu_domain of a device
> > 
> > On Mon, 2013-10-07 at 05:46 +, Bhushan Bharat-R65777 wrote:
> > >
> > > > -Original Message-
> > > > From: Alex Williamson [mailto:alex.william...@redhat.com]
> > > > Sent: Friday, October 04, 2013 11:42 PM
> > > > To: Bhushan Bharat-R65777
> > > > Cc: j...@8bytes.org; b...@kernel.crashing.org;
> > > > ga...@kernel.crashing.org; linux- ker...@vger.kernel.org;
> > > > linuxppc-dev@lists.ozlabs.org; linux- p...@vger.kernel.org;
> > > > ag...@suse.de; Wood Scott-B07421; iommu@lists.linux- foundation.org
> > > > Subject: Re: [PATCH 2/7] iommu: add api to get iommu_domain of a
> > > > device
> > > >
> > > > On Fri, 2013-10-04 at 17:23 +, Bhushan Bharat-R65777 wrote:
> > > > >
> > > > > > -Original Message-
> > > > > > From: Alex Williamson [mailto:alex.william...@redhat.com]
> > > > > > Sent: Friday, October 04, 2013 10:43 PM
> > > > > > To: Bhushan Bharat-R65777
> > > > > > Cc: j...@8bytes.org; b...@kernel.crashing.org;
> > > > > > ga...@kernel.crashing.org; linux- ker...@vger.kernel.org;
> > > > > > linuxppc-dev@lists.ozlabs.org; linux- p...@vger.kernel.org;
> > > > > > ag...@suse.de; Wood Scott-B07421; iommu@lists.linux-
> > > > > > foundation.org
> > > > > > Subject: Re: [PATCH 2/7] iommu: add api to get iommu_domain of a
> > > > > > device
> > > > > >
> > > > > > On Fri, 2013-10-04 at 16:47 +, Bhushan Bharat-R65777 wrote:
> > > > > > >
> > > > > > > > -Original Message-
> > > > > > > > From: Alex Williamson [mailto:alex.william...@redhat.com]
> > > > > > > > Sent: Friday, October 04, 2013 9:15 PM
> > > > > > > > To: Bhushan Bharat-R65777
> > > > > > > > Cc: j...@8bytes.org; b...@kernel.crashing.org;
> > > > > > > > ga...@kernel.crashing.org; linux- ker...@vger.kernel.org;
> > > > > > > > linuxppc-dev@lists.ozlabs.org; linux- p...@vger.kernel.org;
> > > > > > > > ag...@suse.de; Wood Scott-B07421; iommu@lists.linux-
> > > > > > > > foundation.org
> > > > > > > > Subject: Re: [PATCH 2/7] iommu: add api to get iommu_domain
> > > > > > > > of a device
> > > > > > > >
> > > > > > > > On Fri, 2013-10-04 at 09:54 +, Bhushan Bharat-R65777
> > wrote:
> > > > > > > > >
> > > > > > > > > > -Original Message-
> > > > > > > > > > From: linux-pci-ow...@vger.kernel.org
> > > > > > > > > > [mailto:linux-pci-ow...@vger.kernel.org]
> > > > > > > > > > On Behalf Of Alex Williamson
> > > > > > > > > > Sent: Wednesday, September 25, 2013 10:16 PM
> > > > > > > > > > To: Bhushan Bharat-R65777
> > > > > > > > > > Cc: j...@8bytes.org; b...@kernel.crashing.org;
> > > > > > > > > > ga...@kernel.crashing.org; linux-
> > > > > > > > > > ker...@vger.kernel.org; linuxppc-dev@lists.ozlabs.org;
> > > > > > > > > > linux- p...@vger.kernel.org; ag...@suse.de; Wood
> > > > > > > > > > Scott-B07421; iommu@lists.linux- foundation.org; Bhushan
> > > > > > > > > > Bharat-R65777
> > > > > > > > > > Subject: Re: [PATCH 2/7] iommu: add api to get
> > > > > > > > > > iommu_domain of a device
> > > > > > > > > >
> > > > > > > > > > On Thu, 2013-09-19 at 12:59 +0530, Bharat Bhushan wrote:
> &

Re: [PATCH 2/7] iommu: add api to get iommu_domain of a device

2013-10-14 Thread Alex Williamson
On Mon, 2013-10-14 at 12:58 +0000, Sethi Varun-B16395 wrote:
> 
> > -Original Message-
> > From: Alex Williamson [mailto:alex.william...@redhat.com]
> > Sent: Friday, October 11, 2013 2:12 AM
> > To: Sethi Varun-B16395
> > Cc: Bhushan Bharat-R65777; ag...@suse.de; Wood Scott-B07421; linux-
> > p...@vger.kernel.org; ga...@kernel.crashing.org; linux-
> > ker...@vger.kernel.org; io...@lists.linux-foundation.org;
> > b...@kernel.crashing.org; linuxppc-dev@lists.ozlabs.org
> > Subject: Re: [PATCH 2/7] iommu: add api to get iommu_domain of a device
> > 
> > On Thu, 2013-10-10 at 20:09 +, Sethi Varun-B16395 wrote:
> > >
> > > > -Original Message-
> > > > From: iommu-boun...@lists.linux-foundation.org [mailto:iommu-
> > > > boun...@lists.linux-foundation.org] On Behalf Of Alex Williamson
> > > > Sent: Tuesday, October 08, 2013 8:43 AM
> > > > To: Bhushan Bharat-R65777
> > > > Cc: ag...@suse.de; Wood Scott-B07421; linux-...@vger.kernel.org;
> > > > ga...@kernel.crashing.org; linux-ker...@vger.kernel.org;
> > > > io...@lists.linux-foundation.org; b...@kernel.crashing.org;
> > > > linuxppc- d...@lists.ozlabs.org
> > > > Subject: Re: [PATCH 2/7] iommu: add api to get iommu_domain of a
> > > > device
> > > >
> > > > On Mon, 2013-10-07 at 05:46 +, Bhushan Bharat-R65777 wrote:
> > > > >
> > > > > > -Original Message-
> > > > > > From: Alex Williamson [mailto:alex.william...@redhat.com]
> > > > > > Sent: Friday, October 04, 2013 11:42 PM
> > > > > > To: Bhushan Bharat-R65777
> > > > > > Cc: j...@8bytes.org; b...@kernel.crashing.org;
> > > > > > ga...@kernel.crashing.org; linux- ker...@vger.kernel.org;
> > > > > > linuxppc-dev@lists.ozlabs.org; linux- p...@vger.kernel.org;
> > > > > > ag...@suse.de; Wood Scott-B07421; iommu@lists.linux-
> > > > > > foundation.org
> > > > > > Subject: Re: [PATCH 2/7] iommu: add api to get iommu_domain of a
> > > > > > device
> > > > > >
> > > > > > On Fri, 2013-10-04 at 17:23 +, Bhushan Bharat-R65777 wrote:
> > > > > > >
> > > > > > > > -Original Message-
> > > > > > > > From: Alex Williamson [mailto:alex.william...@redhat.com]
> > > > > > > > Sent: Friday, October 04, 2013 10:43 PM
> > > > > > > > To: Bhushan Bharat-R65777
> > > > > > > > Cc: j...@8bytes.org; b...@kernel.crashing.org;
> > > > > > > > ga...@kernel.crashing.org; linux- ker...@vger.kernel.org;
> > > > > > > > linuxppc-dev@lists.ozlabs.org; linux- p...@vger.kernel.org;
> > > > > > > > ag...@suse.de; Wood Scott-B07421; iommu@lists.linux-
> > > > > > > > foundation.org
> > > > > > > > Subject: Re: [PATCH 2/7] iommu: add api to get iommu_domain
> > > > > > > > of a device
> > > > > > > >
> > > > > > > > On Fri, 2013-10-04 at 16:47 +, Bhushan Bharat-R65777
> > wrote:
> > > > > > > > >
> > > > > > > > > > -Original Message-
> > > > > > > > > > From: Alex Williamson
> > > > > > > > > > [mailto:alex.william...@redhat.com]
> > > > > > > > > > Sent: Friday, October 04, 2013 9:15 PM
> > > > > > > > > > To: Bhushan Bharat-R65777
> > > > > > > > > > Cc: j...@8bytes.org; b...@kernel.crashing.org;
> > > > > > > > > > ga...@kernel.crashing.org; linux-
> > > > > > > > > > ker...@vger.kernel.org; linuxppc-dev@lists.ozlabs.org;
> > > > > > > > > > linux- p...@vger.kernel.org; ag...@suse.de; Wood
> > > > > > > > > > Scott-B07421; iommu@lists.linux- foundation.org
> > > > > > > > > > Subject: Re: [PATCH 2/7] iommu: add api to get
> > > > > > > > > > iommu_domain of a device
> > > > > > > > > >
> > > > > > > > > > On Fri, 2013-10-04 at 09:54 +, Bhushan Bharat-R65777
> > > > wrote:
> > > > > > > > > > >
> > > > > > > > > > > > --

Re: [RFC PATCH v2] KVM: PPC: vfio kvm device: support spapr tce

2013-11-05 Thread Alex Williamson
On Tue, 2013-11-05 at 19:05 +1100, Alexey Kardashevskiy wrote:
> Signed-off-by: Alexey Kardashevskiy 
> ---
> 
> Changes:
> v2:
> * it does not try to introduce a realmode search function.
> Instead, liobn-to-iommu-group lookup is done by VFIO KVM device
> in virtual mode and the result (iommu_group pointer) is cached
> in kvm_arch so the realmode handlers do not use VFIO KVM device for that.
> And the iommu groups get released on KVM termination.
> 
> I tried this, seems viable.
> 
> Did not I miss anything? Thanks.

A commit message ;)

> ---
>  arch/powerpc/include/asm/kvm_host.h |  3 ++
>  arch/powerpc/kvm/Kconfig|  1 +
>  arch/powerpc/kvm/Makefile   |  3 ++
>  include/linux/vfio.h|  3 ++
>  include/uapi/linux/kvm.h|  1 +
>  virt/kvm/vfio.c | 74 
> +
>  6 files changed, 85 insertions(+)
> 
> diff --git a/arch/powerpc/include/asm/kvm_host.h 
> b/arch/powerpc/include/asm/kvm_host.h
> index 48dbe8b..e1163d7 100644
> --- a/arch/powerpc/include/asm/kvm_host.h
> +++ b/arch/powerpc/include/asm/kvm_host.h
> @@ -293,6 +293,9 @@ struct kvm_arch {
>  #ifdef CONFIG_KVM_XICS
>   struct kvmppc_xics *xics;
>  #endif
> +#ifdef CONFIG_KVM_VFIO
> + struct kvm_vfio *vfio;
> +#endif
>  };
>  
>  /*
> diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
> index 61b3535..d1b7f64 100644
> --- a/arch/powerpc/kvm/Kconfig
> +++ b/arch/powerpc/kvm/Kconfig
> @@ -60,6 +60,7 @@ config KVM_BOOK3S_64
>   select KVM_BOOK3S_64_HANDLER
>   select KVM
>   select SPAPR_TCE_IOMMU
> + select KVM_VFIO
>   ---help---
> Support running unmodified book3s_64 and book3s_32 guest kernels
> in virtual machines on book3s_64 host processors.
> diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
> index 6646c95..2438d2e 100644
> --- a/arch/powerpc/kvm/Makefile
> +++ b/arch/powerpc/kvm/Makefile
> @@ -87,6 +87,9 @@ kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \
>  kvm-book3s_64-objs-$(CONFIG_KVM_XICS) += \
>   book3s_xics.o
>  
> +kvm-book3s_64-objs-$(CONFIG_KVM_VFIO) += \
> + $(KVM)/vfio.o \
> +
>  kvm-book3s_64-module-objs := \
>   $(KVM)/kvm_main.o \
>   $(KVM)/eventfd.o \
> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> index 24579a0..681e19b 100644
> --- a/include/linux/vfio.h
> +++ b/include/linux/vfio.h
> @@ -97,4 +97,7 @@ extern struct vfio_group 
> *vfio_group_get_external_user(struct file *filep);
>  extern void vfio_group_put_external_user(struct vfio_group *group);
>  extern int vfio_external_user_iommu_id(struct vfio_group *group);
>  
> +extern struct iommu_group *vfio_find_group_by_liobn(struct kvm *kvm,
> + unsigned long liobn);
> +

Nope, this doesn't go in vfio.h, it's a function provided by kvm.  It
should be named as such too, kvm_vfio_...  It also depends on both
CONFIG_KVM_VFIO and CONFIG_SPAPR_TCE_IOMMU and needs a stub version
otherwise.  Is just _liobn specific enough or does it need a spapr_tce
thrown in to avoid confusion with embedded ppc folks?

>  #endif /* VFIO_H */
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 7c1a349..a74ad16 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -847,6 +847,7 @@ struct kvm_device_attr {
>  #define  KVM_DEV_VFIO_GROUP  1
>  #define   KVM_DEV_VFIO_GROUP_ADD 1
>  #define   KVM_DEV_VFIO_GROUP_DEL 2
> +#define  KVM_DEV_VFIO_SPAPR_TCE_LIOBN2

I wonder if it would be better architecturally if this was an attribute
rather than a new group, ex:

#define   KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE_LIOBN   3

It's a mouthful, but we are setting an attribute of a VFIO group, so it
makes sense.  kvm_device_attr.addr would then need to point to a struct
containing both the fd and liobn.

Whatever we come up with needs a documentation addition in
Documentation/virtual/kvm/devices/vfio.txt.

>  
>  /*
>   * ioctls for VM fds
> diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c
> index ca4260e..f9271d5 100644
> --- a/virt/kvm/vfio.c
> +++ b/virt/kvm/vfio.c
> @@ -22,6 +22,9 @@
>  struct kvm_vfio_group {
>   struct list_head node;
>   struct vfio_group *vfio_group;
> +#ifdef CONFIG_SPAPR_TCE_IOMMU
> + uint64_t liobn;

Why is liobn an unsigned long in the exported function but a uint64_t
here?

> +#endif
>  };
>  
>  struct kvm_vfio {
> @@ -188,12 +191,76 @@ static int kvm_vfio_set_group(struct kvm_device *dev, 
> long attr, u64 arg)
>   return -ENXIO;
>  }
>  
> +#ifdef CONFIG_SPAPR_TCE_IOMMU
> +static int kvm_vfio_set_spapr_tce_liobn(struct kvm_device *dev,
> + long attr, u64 arg)
> +{
> + struct kvm_vfio *kv = dev->private;
> + struct vfio_group *vfio_group;
> + struct kvm_vfio_group *kvg;
> + void __user *argp = (void __user *)arg;
> + struct fd f;
> + int32_t fd;
> + uint64_t liobn = attr;
> +
> + if (get_user(fd,

Re: [PATCH] vfio powerpc: implemented IOMMU driver for VFIO

2012-12-07 Thread Alex Williamson
On Fri, 2012-12-07 at 18:34 +1100, Alexey Kardashevskiy wrote:
> VFIO implements platform independent stuff such as
> a PCI driver, BAR access (via read/write on a file descriptor
> or direct mapping when possible) and IRQ signaling.
> 
> The platform dependent part includes IOMMU initialization
> and handling. This patch implements an IOMMU driver for VFIO
> which does mapping/unmapping pages for the guest IO and
> provides information about DMA window (required by a POWERPC
> guest).
> 
> The counterpart in QEMU is required to support this functionality.
> 
> Cc: David Gibson 
> Signed-off-by: Alexey Kardashevskiy 
> ---
>  drivers/vfio/Kconfig|6 +
>  drivers/vfio/Makefile   |1 +
>  drivers/vfio/vfio_iommu_spapr_tce.c |  348 
> +++
>  include/linux/vfio.h|   30 +++
>  4 files changed, 385 insertions(+)
>  create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c
> 
> diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
> index 7cd5dec..b464687 100644
> --- a/drivers/vfio/Kconfig
> +++ b/drivers/vfio/Kconfig
> @@ -3,10 +3,16 @@ config VFIO_IOMMU_TYPE1
>   depends on VFIO
>   default n
>  
> +config VFIO_IOMMU_SPAPR_TCE
> + tristate
> + depends on VFIO && SPAPR_TCE_IOMMU
> + default n
> +
>  menuconfig VFIO
>   tristate "VFIO Non-Privileged userspace driver framework"
>   depends on IOMMU_API
>   select VFIO_IOMMU_TYPE1 if X86
> + select VFIO_IOMMU_SPAPR_TCE if PPC_POWERNV
>   help
> VFIO provides a framework for secure userspace device drivers.
> See Documentation/vfio.txt for more details.
> diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
> index 2398d4a..72bfabc 100644
> --- a/drivers/vfio/Makefile
> +++ b/drivers/vfio/Makefile
> @@ -1,3 +1,4 @@
>  obj-$(CONFIG_VFIO) += vfio.o
>  obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
> +obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
>  obj-$(CONFIG_VFIO_PCI) += pci/
> diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
> b/drivers/vfio/vfio_iommu_spapr_tce.c
> new file mode 100644
> index 000..b0f81fe
> --- /dev/null
> +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
> @@ -0,0 +1,348 @@
> +/*
> + * VFIO: IOMMU DMA mapping support for TCE on POWER
> + *
> + * Copyright (C) 2012 IBM Corp.  All rights reserved.
> + * Author: Alexey Kardashevskiy 
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + *
> + * Derived from original vfio_iommu_type1.c:
> + * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
> + * Author: Alex Williamson 
> + */
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +#define DRIVER_VERSION  "0.1"
> +#define DRIVER_AUTHOR   "a...@ozlabs.ru"
> +#define DRIVER_DESC "VFIO IOMMU SPAPR TCE"
> +
> +static void tce_iommu_detach_group(void *iommu_data,
> + struct iommu_group *iommu_group);
> +
> +/*
> + * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
> + */
> +
> +/*
> + * This code handles mapping and unmapping of user data buffers
> + * into DMA'ble space using the IOMMU
> + */
> +
> +#define NPAGE_TO_SIZE(npage) ((size_t)(npage) << PAGE_SHIFT)
> +
> +struct vwork {
> + struct mm_struct*mm;
> + long npage;
> + struct work_struct  work;
> +};
> +
> +/* delayed decrement/increment for locked_vm */
> +static void lock_acct_bg(struct work_struct *work)
> +{
> + struct vwork *vwork = container_of(work, struct vwork, work);
> + struct mm_struct *mm;
> +
> + mm = vwork->mm;
> + down_write(&mm->mmap_sem);
> + mm->locked_vm += vwork->npage;
> + up_write(&mm->mmap_sem);
> + mmput(mm);
> + kfree(vwork);
> +}
> +
> +static void lock_acct(long npage)
> +{
> + struct vwork *vwork;
> + struct mm_struct *mm;
> +
> + if (!current->mm)
> + return; /* process exited */
> +
> + if (down_write_trylock(&current->mm->mmap_sem)) {
> + current->mm->locked_vm += npage;
> + up_write(&current->mm->mmap_sem);
> + return;
> + }
> +
> + /*
> +  * Couldn't get mmap_sem lock, so must setup to update
> +  * mm->locked_vm later. If locked_vm were atomic, we
> +  * wouldn't need this silliness
> +  */
> + v

Re: [PATCH] vfio powerpc: enabled on powernv platform

2012-12-07 Thread Alex Williamson
On Fri, 2012-12-07 at 18:35 +1100, Alexey Kardashevskiy wrote:
> This patch initializes IOMMU groups based on the IOMMU
> configuration discovered during the PCI scan on POWERNV
> (POWER non virtualized) platform. The IOMMU groups are
> to be used later by VFIO driver (PCI pass through).
> 
> It also implements an API for mapping/unmapping pages for
> guest PCI drivers and providing DMA window properties.
> This API is going to be used later by QEMU-VFIO to handle
> h_put_tce hypercalls from the KVM guest.
> 
> Although this driver has been tested only on the POWERNV
> platform, it should work on any platform which supports
> TCE tables.
> 
> To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config
> option and configure VFIO as required.
> 
> Cc: David Gibson 
> Signed-off-by: Alexey Kardashevskiy 
> ---
>  arch/powerpc/include/asm/iommu.h |   10 ++
>  arch/powerpc/kernel/iommu.c  |  214 
> ++
>  arch/powerpc/platforms/powernv/pci.c |  134 +
>  drivers/iommu/Kconfig|8 ++
>  4 files changed, 366 insertions(+)
> 
> diff --git a/arch/powerpc/include/asm/iommu.h 
> b/arch/powerpc/include/asm/iommu.h
> index cbfe678..be3b11b 100644
> --- a/arch/powerpc/include/asm/iommu.h
> +++ b/arch/powerpc/include/asm/iommu.h
> @@ -76,6 +76,9 @@ struct iommu_table {
>   struct iommu_pool large_pool;
>   struct iommu_pool pools[IOMMU_NR_POOLS];
>   unsigned long *it_map;   /* A simple allocation bitmap for now */
> +#ifdef CONFIG_IOMMU_API
> + struct iommu_group *it_group;
> +#endif
>  };
>  
>  struct scatterlist;
> @@ -147,5 +150,12 @@ static inline void iommu_restore(void)
>  }
>  #endif
>  
> +extern void iommu_reset_table(struct iommu_table *tbl, bool release);
> +extern long iommu_clear_tces(struct iommu_table *tbl, unsigned long entry,
> + unsigned long pages);
> +extern long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
> + uint64_t tce, enum dma_data_direction direction,
> + unsigned long pages);
> +
>  #endif /* __KERNEL__ */
>  #endif /* _ASM_IOMMU_H */
> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> index ff5a6ce..123431a 100644
> --- a/arch/powerpc/kernel/iommu.c
> +++ b/arch/powerpc/kernel/iommu.c
> @@ -44,6 +44,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  #define DBG(...)
>  
> @@ -856,3 +857,216 @@ void iommu_free_coherent(struct iommu_table *tbl, 
> size_t size,
>   free_pages((unsigned long)vaddr, get_order(size));
>   }
>  }
> +
> +#ifdef CONFIG_IOMMU_API
> +/*
> + * SPAPR TCE API
> + */
> +
> +/*
> + * iommu_reset_table is called when it started/stopped being used
> + */
> +void iommu_reset_table(struct iommu_table *tbl, bool release)
> +{
> + /*
> +  * Page at 0 is marked as used in iommu_init_table,
> +  * so here we clear it when called with release=false...
> +  */
> + if (!release && (tbl->it_offset == 0))
> + clear_bit(0, tbl->it_map);

Isn't this redundant to the memset below?

> +
> + iommu_clear_tces(tbl, tbl->it_offset, tbl->it_size);
> +
> + memset(tbl->it_map, 0, (tbl->it_size + 7) >> 3);
> +
> + /*
> +  * ... or restore when release=true
> +  */
> + if (release && (tbl->it_offset == 0))
> + set_bit(0, tbl->it_map);

"release" to me implies something is freed, maybe this should just be
called "restore".

> +}
> +EXPORT_SYMBOL_GPL(iommu_reset_table);
> +
> +/*
> + * Returns the number of used IOMMU pages (4K) within
> + * the same system page (4K or 64K).
> + * bitmap_weight is not used as it does not support bigendian maps.
> + * "offset" is an IOMMU page number relative to DMA window start.
> + */
> +static int syspage_weight(unsigned long *map, unsigned long offset)
> +{
> + int ret = 0, nbits = PAGE_SIZE/IOMMU_PAGE_SIZE;
> +
> + /* Aligns TCE entry number to system page boundary */
> + offset &= PAGE_MASK >> IOMMU_PAGE_SHIFT;
> +
> + /* Count used 4K pages */
> + while (nbits) {
> + if (test_bit(offset, map))
> + ++ret;
> + --nbits;
> + ++offset;
> + }
> +
> + return ret;
> +}
> +
> +static void tce_flush(struct iommu_table *tbl)
> +{
> + /* Flush/invalidate TLB caches if necessary */
> + if (ppc_md.tce_flush)
> + ppc_md.tce_flush(tbl);
> +
> + /* Make sure updates are seen by hardware */
> + mb();
> +}
> +
> +/*
> + * iommu_clear_tces clears tces and returned the number of system pages
> + * which it called put_page() on
> + */
> +static long clear_tces_nolock(struct iommu_table *tbl, unsigned long entry,
> + unsigned long pages)
> +{
> + int i, retpages = 0, clr;
> + unsigned long oldtce, oldweight;
> + struct page *page;
> +
> + for (i = 0; i < pages; ++i) {

Any reason not to increment "entry" and avoid the 5 cases of "entry + i"
below?

> + if (!test_bit(entr

Re: [PATCH] vfio powerpc: enabled on powernv platform

2012-12-12 Thread Alex Williamson
On Wed, 2012-12-12 at 17:14 +1100, Alexey Kardashevskiy wrote:
> On 08/12/12 04:38, Alex Williamson wrote:
> >> +static int __init tce_iommu_init(void)
> >> +{
> >> +  struct pci_dev *pdev = NULL;
> >> +  struct iommu_table *tbl;
> >> +  struct iommu_group *grp;
> >> +
> >> +  /* Allocate and initialize IOMMU groups */
> >> +  for_each_pci_dev(pdev) {
> >> +  tbl = get_iommu_table_base(&pdev->dev);
> >> +  if (!tbl)
> >> +  continue;
> >> +
> >> +  /* Skip already initialized */
> >> +  if (tbl->it_group)
> >> +  continue;
> >> +
> >> +  grp = iommu_group_alloc();
> >> +  if (IS_ERR(grp)) {
> >> +  pr_info("tce_vfio: cannot create new IOMMU group, 
> >> ret=%ld\n",
> >> +  PTR_ERR(grp));
> >> +  return PTR_ERR(grp);
> >> +  }
> >> +  tbl->it_group = grp;
> >> +  iommu_group_set_iommudata(grp, tbl, group_release);
> >
> > BTW, groups have a name property that shows up in sysfs that can be set
> > with iommu_group_set_name().  IIRC, this was a feature David requested
> > for PEs.  It'd be nice if it was used for PEs...  Thanks,
> 
> 
> 
> But what would I put there?... IOMMU ID is more than enough at the moment 
> and struct iommu_table does not have anything what would have made sense to 
> show in the sysfs...

I believe David mentioned that PEs had user visible names.  Perhaps they
match an enclosure location or something.  Group numbers are rather
arbitrary and really have no guarantee of persistence.  Thanks,

Alex


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH] vfio powerpc: implemented IOMMU driver for VFIO

2012-12-12 Thread Alex Williamson
On Wed, 2012-12-12 at 17:59 +1100, Alexey Kardashevskiy wrote:
> On 08/12/12 04:01, Alex Williamson wrote:
> >> +  case VFIO_IOMMU_MAP_DMA: {
> >> +  vfio_iommu_spapr_tce_dma_map param;
> >> +  struct iommu_table *tbl = container->tbl;
> >> +  enum dma_data_direction direction;
> >> +  unsigned long locked, lock_limit;
> >> +
> >> +  if (WARN_ON(!tbl))
> >> +  return -ENXIO;
> >> +
> >> +  minsz = offsetofend(vfio_iommu_spapr_tce_dma_map, size);
> >> +
> >> +  if (copy_from_user(&param, (void __user *)arg, minsz))
> >> +  return -EFAULT;
> >> +
> >> +  if (param.argsz < minsz)
> >> +  return -EINVAL;
> >> +
> >> +  if ((param.flags & VFIO_DMA_MAP_FLAG_READ) &&
> >> +  (param.flags & VFIO_DMA_MAP_FLAG_WRITE))
> >> +  direction = DMA_BIDIRECTIONAL;
> >> +  else if (param.flags & VFIO_DMA_MAP_FLAG_READ)
> >> +  direction = DMA_TO_DEVICE;
> >> +  else if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
> >> +  direction = DMA_FROM_DEVICE;
> >> +  else
> >> +  return -EINVAL;
> >
> > flags needs to be sanitized too.  Return EINVAL if any unknown bit is
> > set or else sloppy users may make it very difficult to make use of those
> > flag bits later.
> 
> 
> It already returns -EINVAL on any bit set except READ/WRITE, no?

No.  I could pass flags ~0 through there to get a read/write mapping and
cause you problems if you later want to define another bit.  Thanks,

Alex

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH] vfio powerpc: enabled on powernv platform

2012-12-12 Thread Alex Williamson
On Wed, 2012-12-12 at 23:34 +1100, Alexey Kardashevskiy wrote:
> This patch initializes IOMMU groups based on the IOMMU
> configuration discovered during the PCI scan on POWERNV
> (POWER non virtualized) platform. The IOMMU groups are
> to be used later by VFIO driver (PCI pass through).
> 
> It also implements an API for mapping/unmapping pages for
> guest PCI drivers and providing DMA window properties.
> This API is going to be used later by QEMU-VFIO to handle
> h_put_tce hypercalls from the KVM guest.
> 
> Although this driver has been tested only on the POWERNV
> platform, it should work on any platform which supports
> TCE tables.
> 
> To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config
> option and configure VFIO as required.
> 
> Cc: David Gibson 
> Signed-off-by: Alexey Kardashevskiy 
> ---
>  arch/powerpc/include/asm/iommu.h |   10 ++
>  arch/powerpc/kernel/iommu.c  |  329 
> ++
>  arch/powerpc/platforms/powernv/pci.c |  134 ++
>  drivers/iommu/Kconfig|8 +
>  4 files changed, 481 insertions(+)
> 
> diff --git a/arch/powerpc/include/asm/iommu.h 
> b/arch/powerpc/include/asm/iommu.h
> index cbfe678..3c861ae 100644
> --- a/arch/powerpc/include/asm/iommu.h
> +++ b/arch/powerpc/include/asm/iommu.h
> @@ -76,6 +76,9 @@ struct iommu_table {
>   struct iommu_pool large_pool;
>   struct iommu_pool pools[IOMMU_NR_POOLS];
>   unsigned long *it_map;   /* A simple allocation bitmap for now */
> +#ifdef CONFIG_IOMMU_API
> + struct iommu_group *it_group;
> +#endif
>  };
>  
>  struct scatterlist;
> @@ -147,5 +150,12 @@ static inline void iommu_restore(void)
>  }
>  #endif
>  
> +extern void iommu_reset_table(struct iommu_table *tbl, bool restore);
> +extern long iommu_clear_tces(struct iommu_table *tbl, unsigned long ioba,
> + unsigned long size);
> +extern long iommu_put_tces(struct iommu_table *tbl, unsigned long ioba,
> + uint64_t tce, enum dma_data_direction direction,
> + unsigned long size);
> +
>  #endif /* __KERNEL__ */
>  #endif /* _ASM_IOMMU_H */
> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> index ff5a6ce..f3bb2e7 100644
> --- a/arch/powerpc/kernel/iommu.c
> +++ b/arch/powerpc/kernel/iommu.c
> @@ -36,6 +36,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -44,6 +45,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  #define DBG(...)
>  
> @@ -856,3 +858,330 @@ void iommu_free_coherent(struct iommu_table *tbl, 
> size_t size,
>   free_pages((unsigned long)vaddr, get_order(size));
>   }
>  }
> +
> +#ifdef CONFIG_IOMMU_API
> +/*
> + * SPAPR TCE API
> + */
> +
> +struct vwork {
> + struct mm_struct*mm;
> + long npage;
> + struct work_struct  work;
> +};
> +
> +/* delayed decrement/increment for locked_vm */
> +static void lock_acct_bg(struct work_struct *work)
> +{
> + struct vwork *vwork = container_of(work, struct vwork, work);
> + struct mm_struct *mm;
> +
> + mm = vwork->mm;
> + down_write(&mm->mmap_sem);
> + mm->locked_vm += vwork->npage;
> + up_write(&mm->mmap_sem);
> + mmput(mm);
> + kfree(vwork);
> +}
> +
> +static void lock_acct(long npage)
> +{
> + struct vwork *vwork;
> + struct mm_struct *mm;
> +
> + if (!current->mm)
> + return; /* process exited */
> +
> + if (down_write_trylock(&current->mm->mmap_sem)) {
> + current->mm->locked_vm += npage;
> + up_write(&current->mm->mmap_sem);
> + return;
> + }
> +
> + /*
> +  * Couldn't get mmap_sem lock, so must setup to update
> +  * mm->locked_vm later. If locked_vm were atomic, we
> +  * wouldn't need this silliness
> +  */
> + vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
> + if (!vwork)
> + return;
> + mm = get_task_mm(current);
> + if (!mm) {
> + kfree(vwork);
> + return;
> + }
> + INIT_WORK(&vwork->work, lock_acct_bg);
> + vwork->mm = mm;
> + vwork->npage = npage;
> + schedule_work(&vwork->work);
> +}

Locked page accounting in this version is very, very broken.  How do
powerpc folks feel about seemingly generic kernel iommu interfaces
messing with the current task mm?  Besides that, more problems below...

> +
> +/*
> + * iommu_reset_table is called when it started/stopped being used.
> + *
> + * restore==true says to bring the iommu_table into the state as it was
> + * before being used by VFIO.
> + */
> +void iommu_reset_table(struct iommu_table *tbl, bool restore)
> +{
> + /* Page#0 is marked as used in iommu_init_table, so we clear it... */
> + if (!restore && (tbl->it_offset == 0))
> + clear_bit(0, tbl->it_map);
> +
> + iommu_clear_tces(tbl, tbl->it_offset, tbl->it_size);

This does locked page accounting and unpins pages, even on startup when
the pages ar

Re: [PATCH] vfio powerpc: enabled on powernv platform

2012-12-12 Thread Alex Williamson
On Thu, 2012-12-13 at 13:57 +1100, Benjamin Herrenschmidt wrote:
> On Wed, 2012-12-12 at 16:30 -0700, Alex Williamson wrote:
> > Locked page accounting in this version is very, very broken.  How do
> > powerpc folks feel about seemingly generic kernel iommu interfaces
> > messing with the current task mm?  Besides that, more problems
> > below...
> 
> After a second look & thought...
> 
> This whole accounting business is fucked. First, we simply can't just
> randomly return errors from H_PUT_TCE because the process reached some
> rlimit. This is not a proper failure mode. That means that the guest
> will probably panic() ... possibly right in the middle of some disk
> writeback or god knows what. Not good.
> 
> Also the overhead of doing all that crap on every TCE map/unmap is
> ridiculous.
> 
> Finally, it's just not going to work for real mode which we really want,
> since we can't take the mmap-sem in real mode anyway, so unless we
> convert that counter to an atomic, we can't do it.
> 
> I'd suggest just not bothering, or if you want to bother, check once
> when creating a TCE table that the rlimit is enough to bolt as many
> pages as can be populated in that table and fail to create *that*. The
> failure mode is much better, ie, qemu failing to create a PCI bus due to
> insufficient rlimits.

I agree, we don't seem to be headed in the right direction.  x86 needs
to track rlimits or else a user can exploit the interface to pin all the
memory in the system.  On power, only the iova window can be pinned, so
it's a fixed amount.  I could see it as granting access to a group
implicitly grants access to pinning the iova window.  We can still make
it more explicit by handling the rlimit accounting upfront.  Thanks,

Alex

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH 1/2] vfio powerpc: enabled on powernv platform

2013-02-11 Thread Alex Williamson
On Mon, 2013-02-11 at 22:54 +1100, Alexey Kardashevskiy wrote:
> This patch initializes IOMMU groups based on the IOMMU
> configuration discovered during the PCI scan on POWERNV
> (POWER non virtualized) platform. The IOMMU groups are
> to be used later by VFIO driver (PCI pass through).
> 
> It also implements an API for mapping/unmapping pages for
> guest PCI drivers and providing DMA window properties.
> This API is going to be used later by QEMU-VFIO to handle
> h_put_tce hypercalls from the KVM guest.
> 
> The iommu_put_tce_user_mode() does only a single page mapping
> as an API for adding many mappings at once is going to be
> added later.
> 
> Although this driver has been tested only on the POWERNV
> platform, it should work on any platform which supports
> TCE tables. As h_put_tce hypercall is received by the host
> kernel and processed by the QEMU (what involves calling
> the host kernel again), performance is not the best -
> circa 220MB/s on 10Gb ethernet network.
> 
> To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config
> option and configure VFIO as required.
> 
> Cc: David Gibson 
> Signed-off-by: Alexey Kardashevskiy 

Yay, it's not dead! ;)

I'd love some kind of changelog to know what to look for in here,
especially given 2mo since the last version.

> ---
>  arch/powerpc/include/asm/iommu.h|   15 ++
>  arch/powerpc/kernel/iommu.c |  343 
> +++
>  arch/powerpc/platforms/powernv/pci-ioda.c   |1 +
>  arch/powerpc/platforms/powernv/pci-p5ioc2.c |5 +-
>  arch/powerpc/platforms/powernv/pci.c|3 +
>  drivers/iommu/Kconfig   |8 +
>  6 files changed, 374 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/include/asm/iommu.h 
> b/arch/powerpc/include/asm/iommu.h
> index cbfe678..900294b 100644
> --- a/arch/powerpc/include/asm/iommu.h
> +++ b/arch/powerpc/include/asm/iommu.h
> @@ -76,6 +76,9 @@ struct iommu_table {
>   struct iommu_pool large_pool;
>   struct iommu_pool pools[IOMMU_NR_POOLS];
>   unsigned long *it_map;   /* A simple allocation bitmap for now */
> +#ifdef CONFIG_IOMMU_API
> + struct iommu_group *it_group;
> +#endif
>  };
>  
>  struct scatterlist;
> @@ -98,6 +101,8 @@ extern void iommu_free_table(struct iommu_table *tbl, 
> const char *node_name);
>   */
>  extern struct iommu_table *iommu_init_table(struct iommu_table * tbl,
>   int nid);
> +extern void iommu_register_group(struct iommu_table * tbl,
> +  int domain_number, unsigned long pe_num);
>  
>  extern int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
>   struct scatterlist *sglist, int nelems,
> @@ -147,5 +152,15 @@ static inline void iommu_restore(void)
>  }
>  #endif
>  
> +/* The API to support IOMMU operations for VFIO */
> +extern long iommu_clear_tce_user_mode(struct iommu_table *tbl,
> + unsigned long ioba, unsigned long tce_value,
> + unsigned long npages);
> +extern long iommu_put_tce_user_mode(struct iommu_table *tbl,
> + unsigned long ioba, unsigned long tce);
> +
> +extern void iommu_flush_tce(struct iommu_table *tbl);
> +extern long iommu_lock_table(struct iommu_table *tbl, bool lock);
> +
>  #endif /* __KERNEL__ */
>  #endif /* _ASM_IOMMU_H */
> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> index 7c309fe..b4fdabc 100644
> --- a/arch/powerpc/kernel/iommu.c
> +++ b/arch/powerpc/kernel/iommu.c
> @@ -37,6 +37,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -45,6 +46,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  #define DBG(...)
>  
> @@ -707,11 +709,39 @@ struct iommu_table *iommu_init_table(struct iommu_table 
> *tbl, int nid)
>   return tbl;
>  }
>  
> +static void group_release(void *iommu_data)
> +{
> + struct iommu_table *tbl = iommu_data;
> + tbl->it_group = NULL;
> +}
> +
> +void iommu_register_group(struct iommu_table * tbl,
> + int domain_number, unsigned long pe_num)
> +{
> + struct iommu_group *grp;
> +
> + grp = iommu_group_alloc();
> + if (IS_ERR(grp)) {
> + pr_info("powerpc iommu api: cannot create new group, err=%ld\n",
> + PTR_ERR(grp));
> + return;
> + }
> + tbl->it_group = grp;
> + iommu_group_set_iommudata(grp, tbl, group_release);
> + iommu_group_set_name(grp, kasprintf(GFP_KERNEL, "domain%d-pe%lx",
> + domain_number, pe_num));
> +}
> +
>  void iommu_free_table(struct iommu_table *tbl, const char *node_name)
>  {
>   unsigned long bitmap_sz;
>   unsigned int order;
>  
> + if (tbl && tbl->it_group) {
> + iommu_group_put(tbl->it_group);
> + BUG_ON(tbl->it_group);
> + }
> +
>   if (!tbl || !tbl->it_map) {
>   printk(KERN_ERR "%s: expected TCE map for %s\n", __func__,

Re: [PATCH 2/2] vfio powerpc: implemented IOMMU driver for VFIO

2013-02-11 Thread Alex Williamson
On Mon, 2013-02-11 at 22:54 +1100, Alexey Kardashevskiy wrote:
> VFIO implements platform independent stuff such as
> a PCI driver, BAR access (via read/write on a file descriptor
> or direct mapping when possible) and IRQ signaling.
> 
> The platform dependent part includes IOMMU initialization
> and handling. This patch implements an IOMMU driver for VFIO
> which does mapping/unmapping pages for the guest IO and
> provides information about DMA window (required by a POWERPC
> guest).
> 
> The counterpart in QEMU is required to support this functionality.

Revision info would be great here too.

> Cc: David Gibson 
> Signed-off-by: Alexey Kardashevskiy 
> ---
>  drivers/vfio/Kconfig|6 +
>  drivers/vfio/Makefile   |1 +
>  drivers/vfio/vfio_iommu_spapr_tce.c |  269 
> +++
>  include/uapi/linux/vfio.h   |   31 
>  4 files changed, 307 insertions(+)
>  create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c
> 
> diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
> index 7cd5dec..b464687 100644
> --- a/drivers/vfio/Kconfig
> +++ b/drivers/vfio/Kconfig
> @@ -3,10 +3,16 @@ config VFIO_IOMMU_TYPE1
>   depends on VFIO
>   default n
>  
> +config VFIO_IOMMU_SPAPR_TCE
> + tristate
> + depends on VFIO && SPAPR_TCE_IOMMU
> + default n
> +
>  menuconfig VFIO
>   tristate "VFIO Non-Privileged userspace driver framework"
>   depends on IOMMU_API
>   select VFIO_IOMMU_TYPE1 if X86
> + select VFIO_IOMMU_SPAPR_TCE if PPC_POWERNV
>   help
> VFIO provides a framework for secure userspace device drivers.
> See Documentation/vfio.txt for more details.
> diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
> index 2398d4a..72bfabc 100644
> --- a/drivers/vfio/Makefile
> +++ b/drivers/vfio/Makefile
> @@ -1,3 +1,4 @@
>  obj-$(CONFIG_VFIO) += vfio.o
>  obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
> +obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
>  obj-$(CONFIG_VFIO_PCI) += pci/
> diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
> b/drivers/vfio/vfio_iommu_spapr_tce.c
> new file mode 100644
> index 000..9b3fa88
> --- /dev/null
> +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
> @@ -0,0 +1,269 @@
> +/*
> + * VFIO: IOMMU DMA mapping support for TCE on POWER
> + *
> + * Copyright (C) 2012 IBM Corp.  All rights reserved.

2013 now

> + * Author: Alexey Kardashevskiy 
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + *
> + * Derived from original vfio_iommu_type1.c:
> + * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
> + * Author: Alex Williamson 
> + */
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +#define DRIVER_VERSION  "0.1"
> +#define DRIVER_AUTHOR   "a...@ozlabs.ru"
> +#define DRIVER_DESC "VFIO IOMMU SPAPR TCE"
> +
> +static void tce_iommu_detach_group(void *iommu_data,
> + struct iommu_group *iommu_group);
> +
> +/*
> + * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
> + *
> + * This code handles mapping and unmapping of user data buffers
> + * into DMA'ble space using the IOMMU
> + */
> +
> +/*
> + * The container descriptor supports only a single group per container.
> + * Required by the API as the container is not supplied with the IOMMU group
> + * at the moment of initialization.
> + */
> +struct tce_container {
> + struct mutex lock;
> + struct iommu_table *tbl;
> +};
> +
> +static void *tce_iommu_open(unsigned long arg)
> +{
> + struct tce_container *container;
> +
> + if (arg != VFIO_SPAPR_TCE_IOMMU) {
> + pr_err("tce_vfio: Wrong IOMMU type\n");
> + return ERR_PTR(-EINVAL);
> + }
> +
> + container = kzalloc(sizeof(*container), GFP_KERNEL);
> + if (!container)
> + return ERR_PTR(-ENOMEM);
> +
> + mutex_init(&container->lock);
> +
> + return container;
> +}
> +
> +static void tce_iommu_release(void *iommu_data)
> +{
> + struct tce_container *container = iommu_data;
> +
> + WARN_ON(container->tbl && !container->tbl->it_group);
> + if (container->tbl && container->tbl->it_group)
> + tce_iommu_detach_group(iommu_data, container->tbl->it_group);
> +
> + mutex_destroy(&container->lock);
> +
> +

Re: [PATCH 1/2] vfio powerpc: enabled on powernv platform

2013-02-11 Thread Alex Williamson
On Tue, 2013-02-12 at 10:19 +1100, Alexey Kardashevskiy wrote:
> On 12/02/13 09:16, Alex Williamson wrote:
> > On Mon, 2013-02-11 at 22:54 +1100, Alexey Kardashevskiy wrote:
> >> @@ -707,11 +709,39 @@ struct iommu_table *iommu_init_table(struct 
> >> iommu_table *tbl, int nid)
> >>return tbl;
> >>   }
> >>
> >> +static void group_release(void *iommu_data)
> >> +{
> >> +  struct iommu_table *tbl = iommu_data;
> >> +  tbl->it_group = NULL;
> >> +}
> >> +
> >> +void iommu_register_group(struct iommu_table * tbl,
> >> +  int domain_number, unsigned long pe_num)
> >> +{
> >> +  struct iommu_group *grp;
> >> +
> >> +  grp = iommu_group_alloc();
> >> +  if (IS_ERR(grp)) {
> >> +  pr_info("powerpc iommu api: cannot create new group, err=%ld\n",
> >> +  PTR_ERR(grp));
> >> +  return;
> >> +  }
> >> +  tbl->it_group = grp;
> >> +  iommu_group_set_iommudata(grp, tbl, group_release);
> >> +  iommu_group_set_name(grp, kasprintf(GFP_KERNEL, "domain%d-pe%lx",
> >> +  domain_number, pe_num));
> >> +}
> >> +
> >>   void iommu_free_table(struct iommu_table *tbl, const char *node_name)
> >>   {
> >>unsigned long bitmap_sz;
> >>unsigned int order;
> >>
> >> +  if (tbl && tbl->it_group) {
> >> +  iommu_group_put(tbl->it_group);
> >> +  BUG_ON(tbl->it_group);
> >> +  }
> >> +
> >>if (!tbl || !tbl->it_map) {
> >>printk(KERN_ERR "%s: expected TCE map for %s\n", __func__,
> >>node_name);
> >> @@ -876,4 +906,317 @@ void kvm_iommu_unmap_pages(struct kvm *kvm, struct 
> >> kvm_memory_slot *slot)
> >>   {
> >>   }
> >>
> >> +static enum dma_data_direction tce_direction(unsigned long tce)
> >> +{
> >> +  if ((tce & TCE_PCI_READ) && (tce & TCE_PCI_WRITE))
> >> +  return DMA_BIDIRECTIONAL;
> >> +  else if (tce & TCE_PCI_READ)
> >> +  return DMA_TO_DEVICE;
> >> +  else if (tce & TCE_PCI_WRITE)
> >> +  return DMA_FROM_DEVICE;
> >> +  else
> >> +  return DMA_NONE;
> >> +}
> >> +
> >> +void iommu_flush_tce(struct iommu_table *tbl)
> >> +{
> >> +  /* Flush/invalidate TLB caches if necessary */
> >> +  if (ppc_md.tce_flush)
> >> +  ppc_md.tce_flush(tbl);
> >> +
> >> +  /* Make sure updates are seen by hardware */
> >> +  mb();
> >> +}
> >> +EXPORT_SYMBOL_GPL(iommu_flush_tce);
> >> +
> >> +static long tce_clear_param_check(struct iommu_table *tbl,
> >> +  unsigned long ioba, unsigned long tce_value,
> >> +  unsigned long npages)
> >> +{
> >> +  unsigned long size = npages << IOMMU_PAGE_SHIFT;
> >> +
> >> +  /* ppc_md.tce_free() does not support any value but 0 */
> >> +  if (tce_value)
> >> +  return -EINVAL;
> >> +
> >> +  if (ioba & ~IOMMU_PAGE_MASK)
> >> +  return -EINVAL;
> >> +
> >> +  if ((ioba + size) > ((tbl->it_offset + tbl->it_size)
> >> +  << IOMMU_PAGE_SHIFT))
> >> +  return -EINVAL;
> >> +
> >> +  if (ioba < (tbl->it_offset << IOMMU_PAGE_SHIFT))
> >> +  return -EINVAL;
> >> +
> >> +  return 0;
> >
> > Why do these all return long (vs int)?  Is this a POWER-ism?
> 
> No, not really but yeah, I picked it in powerpc code :) I tried to keep 
> them "long" but I noticed "int" below so what is the rule? Change all to int?

I'd say anything that's returning 0/-errno should probably be an int.

> >> +}
> >> +
> >> +static long tce_put_param_check(struct iommu_table *tbl,
> >> +  unsigned long ioba, unsigned long tce)
> >> +{
> >> +  if (!(tce & (TCE_PCI_WRITE | TCE_PCI_READ)))
> >> +  return -EINVAL;
> >> +
> >> +  if (tce & ~(IOMMU_PAGE_MASK | TCE_PCI_WRITE | TCE_PCI_READ))
> >> +  return -EINVAL;
> >> +
> >> +  if (ioba & ~IOMMU_PAGE_MASK)
> >> +  return -EINVAL;
> >> +
> >> +  if ((ioba + IOMMU_PAGE_SIZE) > ((tbl->it_offset + tbl->it_siz

Re: [PATCH 2/2] vfio powerpc: implemented IOMMU driver for VFIO

2013-02-11 Thread Alex Williamson
On Tue, 2013-02-12 at 10:45 +1100, Alexey Kardashevskiy wrote:
> On 12/02/13 09:17, Alex Williamson wrote:
> > On Mon, 2013-02-11 at 22:54 +1100, Alexey Kardashevskiy wrote:
> >> VFIO implements platform independent stuff such as
> >> a PCI driver, BAR access (via read/write on a file descriptor
> >> or direct mapping when possible) and IRQ signaling.
> >>
> >> The platform dependent part includes IOMMU initialization
> >> and handling. This patch implements an IOMMU driver for VFIO
> >> which does mapping/unmapping pages for the guest IO and
> >> provides information about DMA window (required by a POWERPC
> >> guest).
> >>
> >> The counterpart in QEMU is required to support this functionality.
> >
> > Revision info would be great here too.
>  >
> >
> >> Cc: David Gibson 
> >> Signed-off-by: Alexey Kardashevskiy 
> >> ---
> >>   drivers/vfio/Kconfig|6 +
> >>   drivers/vfio/Makefile   |1 +
> >>   drivers/vfio/vfio_iommu_spapr_tce.c |  269 
> >> +++
> >>   include/uapi/linux/vfio.h   |   31 
> >>   4 files changed, 307 insertions(+)
> >>   create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c
> >>
> >> diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
> >> index 7cd5dec..b464687 100644
> >> --- a/drivers/vfio/Kconfig
> >> +++ b/drivers/vfio/Kconfig
> >> @@ -3,10 +3,16 @@ config VFIO_IOMMU_TYPE1
> >>depends on VFIO
> >>default n
> >>
> >> +config VFIO_IOMMU_SPAPR_TCE
> >> +  tristate
> >> +  depends on VFIO && SPAPR_TCE_IOMMU
> >> +  default n
> >> +
> >>   menuconfig VFIO
> >>tristate "VFIO Non-Privileged userspace driver framework"
> >>depends on IOMMU_API
> >>select VFIO_IOMMU_TYPE1 if X86
> >> +  select VFIO_IOMMU_SPAPR_TCE if PPC_POWERNV
> >>help
> >>  VFIO provides a framework for secure userspace device drivers.
> >>  See Documentation/vfio.txt for more details.
> >> diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
> >> index 2398d4a..72bfabc 100644
> >> --- a/drivers/vfio/Makefile
> >> +++ b/drivers/vfio/Makefile
> >> @@ -1,3 +1,4 @@
> >>   obj-$(CONFIG_VFIO) += vfio.o
> >>   obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
> >> +obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
> >>   obj-$(CONFIG_VFIO_PCI) += pci/
> >> diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
> >> b/drivers/vfio/vfio_iommu_spapr_tce.c
> >> new file mode 100644
> >> index 000..9b3fa88
> >> --- /dev/null
> >> +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
> >> @@ -0,0 +1,269 @@
> >> +/*
> >> + * VFIO: IOMMU DMA mapping support for TCE on POWER
> >> + *
> >> + * Copyright (C) 2012 IBM Corp.  All rights reserved.
> >
> > 2013 now
> >
> >> + * Author: Alexey Kardashevskiy 
> >> + *
> >> + * This program is free software; you can redistribute it and/or modify
> >> + * it under the terms of the GNU General Public License version 2 as
> >> + * published by the Free Software Foundation.
> >> + *
> >> + * Derived from original vfio_iommu_type1.c:
> >> + * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
> >> + * Author: Alex Williamson 
> >> + */
> >> +
> >> +#include 
> >> +#include 
> >> +#include 
> >> +#include 
> >> +#include 
> >> +#include 
> >> +#include 
> >> +#include 
> >> +
> >> +#define DRIVER_VERSION  "0.1"
> >> +#define DRIVER_AUTHOR   "a...@ozlabs.ru"
> >> +#define DRIVER_DESC "VFIO IOMMU SPAPR TCE"
> >> +
> >> +static void tce_iommu_detach_group(void *iommu_data,
> >> +  struct iommu_group *iommu_group);
> >> +
> >> +/*
> >> + * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
> >> + *
> >> + * This code handles mapping and unmapping of user data buffers
> >> + * into DMA'ble space using the IOMMU
> >> + */
> >> +
> >> +/*
> >> + * The container descriptor supports only a single group per container.
> >> + * Required by the API as the container is not supplied with the IOMMU 
> >> group
> >> + * at the moment of initialization.
> >&

Re: [PATCH 2/3] VFIO: VFIO_DEVICE_SET_ADDR_MAPPING command

2013-03-15 Thread Alex Williamson
On Fri, 2013-03-15 at 15:26 +0800, Gavin Shan wrote:
> The address (domain/bus/slot/function) of the passed PCI device
> looks quite different from perspective of host and guest. Some
> architectures like PPC need to setup the mapping in host. The patch
> introduces additional VFIO device IOCTL command to address that.

Could you explain further how this will be used?  How the device is
exposed to a guest is entirely a userspace construct, so why does vfio
need to know or care about this?  I had assumed for AER that QEMU would
do the translation from host to guest address space.

> Signed-off-by: Gavin Shan 
> ---
>  include/uapi/linux/vfio.h |   16 
>  1 files changed, 16 insertions(+), 0 deletions(-)
> 
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index 6e58d9b..ecc4f38 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -289,6 +289,22 @@ struct vfio_irq_set {
>   */
>  #define VFIO_DEVICE_RESET _IO(VFIO_TYPE, VFIO_BASE + 11)
>  
> +/**
> + * VFIO_DEVICE_SET_ADDR_MAPPING - _IO(VFIO_TYPE, VFIO_BASE + 12)
> + *
> + * The address, which comprised of domain/bus/slot/function looks
> + * different between host and guest. We need to setup the mapping
> + * in host for some architectures like PPC so that the passed PCI
> + * devices could support RTAS smoothly.
> + */
> +struct vfio_addr_mapping {
> + __u64 buid;

What's a buid?  Thanks,

Alex

> + __u8  bus;
> + __u8  slot;
> + __u8  func;
> +};
> +#define VFIO_DEVICE_SET_ADDR_MAPPING _IO(VFIO_TYPE, VFIO_BASE + 12)
> +
>  /*
>   * The VFIO-PCI bus driver makes use of the following fixed region and
>   * IRQ index mapping.  Unimplemented regions return a size of zero.



___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH 3/3] VFIO: Direct access config reg without capability

2013-03-15 Thread Alex Williamson
On Fri, 2013-03-15 at 15:26 +0800, Gavin Shan wrote:
> The config registers in [0, 0x40] is being supported by VFIO. Apart
> from that, the other config registers should be covered by PCI or
> PCIe capability. However, there might have some PCI devices (be2net)
> who has config registers (0x7c) out of [0, 0x40], and don't have
> corresponding PCI or PCIe capability. VFIO will return 0x0 on reading
> those registers and writing is dropped. It caused the be2net driver
> fails to be loaded because 0x0 returned from its config register 0x7c.
> 
> The patch changes the behaviour so that those config registers out
> of [0, 0x40] and don't have corresponding PCI or PCIe capability
> will be accessed directly.

This basically gives userspace free access to any regions that aren't
covered by known capabilities.  We have no idea what this might expose
on some devices.  I'd like to support be2net, but what's the minimal
access that it needs?  Can we provide 2 or 4 bytes of read-only access
at offset 0x7c for just that device?  Is it always 0x7c?  Let's split
this patch from the series since it's clearly dealing with something
independent.  Thanks,

Alex

> Signed-off-by: Gavin Shan 
> ---
>  drivers/vfio/pci/vfio_pci_config.c |   31 ---
>  1 files changed, 20 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/vfio/pci/vfio_pci_config.c 
> b/drivers/vfio/pci/vfio_pci_config.c
> index 964ff22..5ea3afb 100644
> --- a/drivers/vfio/pci/vfio_pci_config.c
> +++ b/drivers/vfio/pci/vfio_pci_config.c
> @@ -1471,18 +1471,27 @@ static ssize_t vfio_config_do_rw(struct 
> vfio_pci_device *vdev, char __user *buf,
>  
>   cap_id = vdev->pci_config_map[*ppos / 4];
>  
> + /*
> +  * Some PCI device config registers might not be coverred by
> +  * capability and useful. We will enable direct access to
> +  * those registers.
> +  */
>   if (cap_id == PCI_CAP_ID_INVALID) {
> - if (iswrite)
> - return ret; /* drop */
> -
> - /*
> -  * Per PCI spec 3.0, section 6.1, reads from reserved and
> -  * unimplemented registers return 0
> -  */
> - if (copy_to_user(buf, &val, count))
> - return -EFAULT;
> -
> - return ret;
> + if (iswrite) {
> + if (copy_from_user(&val, buf, count))
> + return -EFAULT;
> + ret = vfio_user_config_write(vdev->pdev, (int)(*ppos),
> +  val, count);
> + return ret ? ret : count;
> + } else {
> + ret = vfio_user_config_read(vdev->pdev, (int)(*ppos),
> + &val, count);
> + if (ret)
> + return ret;
> + if (copy_to_user(buf, &val, count))
> + return -EFAULT;
> + return count;
> + }
>   }
>  
>   /*



___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH 2/3] VFIO: VFIO_DEVICE_SET_ADDR_MAPPING command

2013-03-18 Thread Alex Williamson
On Sat, 2013-03-16 at 06:37 +0100, Benjamin Herrenschmidt wrote:
> On Sat, 2013-03-16 at 09:34 +0800, Gavin Shan wrote:
> > >Could you explain further how this will be used?  How the device is
> > >exposed to a guest is entirely a userspace construct, so why does vfio
> > >need to know or care about this?  I had assumed for AER that QEMU would
> > >do the translation from host to guest address space.
> > >
> > 
> > The weak IOCTL function (vfio_pci_arch_ioctl) was introduced by previous
> > patch. The PowerNV platform is going to override it to figure out the
> > information for EEH core to use. On the other hand, QEMU will runs into
> > the IOCTL command while opening (creating) one VFIO device.
> > 
> > Though I'm not familiar with AER very much. AER is quite different from
> > EEH. The EEH functionality implemented in PHB instead of in PCI device
> > core. So we don't care AER stuff in EEH directly :-)
> 
> To give Alex a bit more background...
> 
> EEH is our IBM specific error handling facility which is a superset of AER.
> 
> IE. In addition to AER's error detection and logging, it adds a layer of
> error detection at the host bridge level (such as iommu violations etc...)
> and a mechanism for handling and recovering from errors. This is tied to
> our iommu domain stuff (our PE's) and our device "freezing" capability
> among others.
> 
> With VFIO + KVM, we want to implement most of the EEH support for guests in
> the host kernel. The reason is multipart and we can discuss this separately
> as some of it might well be debatable (mostly it's more convenient that way
> because we hook into the underlying HW/FW EEH which isn't directly userspace
> accessible so we don't have to add a new layer of kernel -> user API in
> addition to the VFIO stuff), but there's at least one aspect of it that drives
> this requirement more strongly which is performance:
> 
> When EEH is enabled, whenever any MMIO returns all 1's, the kernel will do
> a firmware call to query the EEH state of the device and check whether it
> has been frozen. On some devices, that can be a performance issue, and
> going all the way to qemu for that would be horribly expensive.
> 
> So we want at least a way to handle that call in the kernel and for that we
> need at least some way of mapping things there.

There's no notification mechanism when a PHB is frozen?  I suppose
notification would be asynchronous so you risk data for every read that
happens in the interim.  So the choices are a) tell the host kernel the
mapping, b) tell the guest kernel the mapping, c) identity mapping, or
d) qemu intercept?

Presumably your firmware call to query the EEH is not going through
VFIO, so is VFIO the appropriate place to setup this mapping?  As you
say, this seems like just a convenient place to put it even though it
really has nothing to do with the VFIO kernel component.  QEMU has this
information and could register it with the host kernel through other
means if available.  Maybe the mapping should be registered with KVM if
that's how the EEH data is accessed.  I'm not yet sold on why this
mapping is registered here.  Thanks,

Alex


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH 3/3] VFIO: Direct access config reg without capability

2013-03-18 Thread Alex Williamson
On Sat, 2013-03-16 at 06:30 +0100, Benjamin Herrenschmidt wrote:
> On Fri, 2013-03-15 at 13:41 -0600, Alex Williamson wrote:
> > 
> > This basically gives userspace free access to any regions that aren't
> > covered by known capabilities. 
> 
> And ?
> 
> I mean seriously :-) We already had that discussion ... trying to
> "protect" config space is just plain doomed. There is no point.
> 
> It makes sense to do things like emulate BARs etc... for things to
> function properly under some circumstances/setups where you can't just
> expose the original BAR values to the guest and have the HW take care of
> it but you *must* be prepared to deal with anything in config space
> being changed without you knowing about it.
> 
> Devices *will* have backdoors via MMIO. Period. You cannot rely on those
> not existing, whether they are documented or not.
> 
> If you can't cope with the config space accesses then you aren't
> properly isolated. It can be deemed acceptable (depends what you use
> your VMs for) but that I mean is that any config space
> filtering/emulation for the sake of "security" is ... pointless.
> 
> Doing it for functionality to work at all (ie BAR emulation) is fine,
> but that's about it. IE. As a mean of security it's pointless.
> 
> 
> >  We have no idea what this might expose on some devices.
> 
> No more than we have any idea what MMIO mapping of the device register
> space exposes :-)

Yeah, yeah.  Ok, I can't come up with a reasonable argument otherwise,
it'll give us better device support, and I believe pci-assign has always
done this.  I'll take another look at the patch.  Thanks,

Alex



___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH 2/3] VFIO: VFIO_DEVICE_SET_ADDR_MAPPING command

2013-03-18 Thread Alex Williamson
On Tue, 2013-03-19 at 11:24 +0800, Gavin Shan wrote:
> On Mon, Mar 18, 2013 at 03:01:14PM -0600, Alex Williamson wrote:
> >On Sat, 2013-03-16 at 06:37 +0100, Benjamin Herrenschmidt wrote:
> >> On Sat, 2013-03-16 at 09:34 +0800, Gavin Shan wrote:
> >> > >Could you explain further how this will be used?  How the device is
> >> > >exposed to a guest is entirely a userspace construct, so why does vfio
> >> > >need to know or care about this?  I had assumed for AER that QEMU would
> >> > >do the translation from host to guest address space.
> >> > >
> >> > 
> >> > The weak IOCTL function (vfio_pci_arch_ioctl) was introduced by previous
> >> > patch. The PowerNV platform is going to override it to figure out the
> >> > information for EEH core to use. On the other hand, QEMU will runs into
> >> > the IOCTL command while opening (creating) one VFIO device.
> >> > 
> >> > Though I'm not familiar with AER very much. AER is quite different from
> >> > EEH. The EEH functionality implemented in PHB instead of in PCI device
> >> > core. So we don't care AER stuff in EEH directly :-)
> >> 
> >> To give Alex a bit more background...
> >> 
> >> EEH is our IBM specific error handling facility which is a superset of AER.
> >> 
> >> IE. In addition to AER's error detection and logging, it adds a layer of
> >> error detection at the host bridge level (such as iommu violations etc...)
> >> and a mechanism for handling and recovering from errors. This is tied to
> >> our iommu domain stuff (our PE's) and our device "freezing" capability
> >> among others.
> >> 
> >> With VFIO + KVM, we want to implement most of the EEH support for guests in
> >> the host kernel. The reason is multipart and we can discuss this separately
> >> as some of it might well be debatable (mostly it's more convenient that way
> >> because we hook into the underlying HW/FW EEH which isn't directly 
> >> userspace
> >> accessible so we don't have to add a new layer of kernel -> user API in
> >> addition to the VFIO stuff), but there's at least one aspect of it that 
> >> drives
> >> this requirement more strongly which is performance:
> >> 
> >> When EEH is enabled, whenever any MMIO returns all 1's, the kernel will do
> >> a firmware call to query the EEH state of the device and check whether it
> >> has been frozen. On some devices, that can be a performance issue, and
> >> going all the way to qemu for that would be horribly expensive.
> >> 
> >> So we want at least a way to handle that call in the kernel and for that we
> >> need at least some way of mapping things there.
> >
> >There's no notification mechanism when a PHB is frozen?  I suppose
> >notification would be asynchronous so you risk data for every read that
> >happens in the interim.  So the choices are a) tell the host kernel the
> >mapping, b) tell the guest kernel the mapping, c) identity mapping, or
> >d) qemu intercept?
> >
> 
> We do have dedicated interrupts on detecting frozen PHB on host side.
> However, the guest has to poll/check the frozen state (frozen PE) during
> access to config or MMIO space.

Can you make use of something like this to notify the guest:

https://github.com/awilliam/linux-vfio/commit/dad9f8972e04cd081a028d8fb1249d746d97fc03

As a first step this only notifies QEMU, but the plan is to forward that
on to the guest.  If we can leverage similar interfaces between AER and
EEH, I'd obviously like to do that.

> For the recommended methods, (a) is what
> we want to do with the patchset. (b) seems infeasible since the guest
> shouldn't be aware of hypervisor (e.g. KVM or PowerVM) it's running on
> top of, it's hard to polish the guest to do it. (d) sounds applicable
> since the QEMU should know the address (BDF) of host and guest devices.
> However, we still need let the host EEH core know that which PCI device
> has been passed to guest and the best place to do that would be when opening
> the corresponding VFIO PCI device. In turn, it will still need weak function
> for ppc platform to override it. Why we not directly take (a) to finish
> everything in one VFIO IOCTL command?

Because it seems like VFIO is just being used as a relay and has no
purpose knowing this information on its own.  It's just a convenient
place to host the ioctl, but that alone is not a good enough reason to
put it there.

> Sorry, Alex. I didn't understand (c) wel

Re: [PATCH 2/3] VFIO: VFIO_DEVICE_SET_ADDR_MAPPING command

2013-03-20 Thread Alex Williamson
On Tue, 2013-03-19 at 05:45 +0100, Benjamin Herrenschmidt wrote:
> On Mon, 2013-03-18 at 22:18 -0600, Alex Williamson wrote:
> > > Yes, EEH firmware call needn't going through VFIO. However, EEH has
> > > very close relationship with PCI and so VFIO-PCI does. Eventually, EEH
> > > has close relationship with VFIO-PCI :-)
> > 
> > Is there some plan to do more with EEH through VFIO in the future or are
> > we just talking about some kind of weird associative property to sell
> > this ioctl?  Thanks,
> 
> Well, I'm not sure how 'weird' that is but it makes sense to me... VFIO
> is the mechanism that virtualizes access to a PCI device and provides
> interfaces to qemu & kvm to access it &| map it.
> 
> Or rather VFIO-PCI is.
> 
> At a basic level it provides ... the basic PCI interfaces, ie, config
> space access (with or without filtering), interrupts, etc...
> 
> In our environment, EEH is just another functionality of PCI really.
> The firmware calls used by the guest to do that fall into more or less
> the same category as the ones used for PCI config space accesses,
> manipulation of DMA windows, etc... Similar to host (though guest
> and host use a different FW interface for various reasons).
> 
> So it's very natural to "transport" these via VFIO-PCI like everything
> else, I don't see a more natural place to put the ioctl's we need for
> qemu to be able to access the EEH state, trigger EEH (un)isolation,
> resets, etc...
> 
> Fundamentally, the design should be for VFIO-PCI to provide some specific
> ioctls for EEH that userspace programs such as qemu can use, and then
> re-expose those APIs to the guest.
> 
> In addition, to do some of it in the kernel for performance reason, we
> want to establish that mapping, but I see that as a VFIO "accelerator".
> 
> IE. Whatever is going to respond to the EEH calls from the guest in-kernel
> will have to share state with the rest of the EEH stuff provided to qemu
> by vfio-pci.

Perhaps my problem is that I don't have a clear picture of where you're
going with this like I do for AER.  For AER we're starting with
notification of an error, from that we build into how to retrieve the
error information, and finally how to perform corrective action.  Each
of these will be done through vfio-pci.

Here we're starting by registering a mapping that's really only useful
to the vfio "accelerator" path, but we don't even have a hint of what
the non-accelerated path is and how vfio is involved with it.  Thanks,

Alex

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH 2/3] VFIO: VFIO_DEVICE_SET_ADDR_MAPPING command

2013-03-20 Thread Alex Williamson
On Wed, 2013-03-20 at 20:31 +0100, Benjamin Herrenschmidt wrote:
> On Wed, 2013-03-20 at 12:48 -0600, Alex Williamson wrote:
> > Perhaps my problem is that I don't have a clear picture of where
> > you're
> > going with this like I do for AER.  For AER we're starting with
> > notification of an error, from that we build into how to retrieve the
> > error information, and finally how to perform corrective action.  Each
> > of these will be done through vfio-pci.
> > 
> > Here we're starting by registering a mapping that's really only useful
> > to the vfio "accelerator" path, but we don't even have a hint of what
> > the non-accelerated path is and how vfio is involved with it.  Thanks,
> 
> I'm surprised that you are building so much policy around AER ... can't
> you just pass the raw stuff down to the guest and let the guest do its
> own corrective actions ?

How does the guest get the raw stuff?  We need to get the AER interrupt
out to the guest so it can be injected into the virtual PCIe port, then
we need to be able to retrieve the physical device log and pass it to
the qemu to mangle to match the guest topology.  We don't have existing
firmware interfaces for the guest to do that, so it's all being routed
through vfio-pci.

> As for EEH, I will let Gavin describe in more details what he is doing,
> though I wouldn't be surprised if so far he doesn't have a
> non-accelerated path :-) Which indeed makes things oddball, granted ...
> at least for now. I *think* what Gavin's doing right now is a
> pass-through to the host EEH directly in the kernel, so without a slow
> path...
> 
> Gavin, it really boils down to that. In-kernel EEH for guests is a
> KVMism that ends up not involving VFIO in any other way than
> establishing the mapping, then arguably it could be done via a VM ioctl.
> 
> If there's more going through VFIO and shared state, then it should
> probably go through VFIO-PCI.

Exactly my thinking.  Thanks,

Alex

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH 3/3] VFIO: Direct access config reg without capability

2013-03-20 Thread Alex Williamson
On Fri, 2013-03-15 at 15:26 +0800, Gavin Shan wrote:
> The config registers in [0, 0x40] is being supported by VFIO. Apart
> from that, the other config registers should be covered by PCI or
> PCIe capability. However, there might have some PCI devices (be2net)
> who has config registers (0x7c) out of [0, 0x40], and don't have
> corresponding PCI or PCIe capability. VFIO will return 0x0 on reading
> those registers and writing is dropped. It caused the be2net driver
> fails to be loaded because 0x0 returned from its config register 0x7c.
> 
> The patch changes the behaviour so that those config registers out
> of [0, 0x40] and don't have corresponding PCI or PCIe capability
> will be accessed directly.
> 
> Signed-off-by: Gavin Shan 
> ---

Hi Gavin,

I'm onboard with making this change now, but this patch isn't
sufficient.  The config space map uses a byte per dword to index the
capability since both standard and extended capabilities are dword
aligned.  We currently have a bug that this patch exposes that we round
the length down, ex. a 14 byte MSI capability becomes 12 bytes leaving
the message data now exposed and writable with this patch.  That bug can
be fixed by aligning the length so the capability fills the dword, but
notice that 0x7c on the be2net is filling one of these gaps.  So fixing
that bug attaches that gap to the previous capability instead of
allowing direct access.

So, before we can make this change we need to fix the config map to have
byte granularity.  Thanks,

Alex

>  drivers/vfio/pci/vfio_pci_config.c |   31 ---
>  1 files changed, 20 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/vfio/pci/vfio_pci_config.c 
> b/drivers/vfio/pci/vfio_pci_config.c
> index 964ff22..5ea3afb 100644
> --- a/drivers/vfio/pci/vfio_pci_config.c
> +++ b/drivers/vfio/pci/vfio_pci_config.c
> @@ -1471,18 +1471,27 @@ static ssize_t vfio_config_do_rw(struct 
> vfio_pci_device *vdev, char __user *buf,
>  
>   cap_id = vdev->pci_config_map[*ppos / 4];
>  
> + /*
> +  * Some PCI device config registers might not be coverred by
> +  * capability and useful. We will enable direct access to
> +  * those registers.
> +  */
>   if (cap_id == PCI_CAP_ID_INVALID) {
> - if (iswrite)
> - return ret; /* drop */
> -
> - /*
> -  * Per PCI spec 3.0, section 6.1, reads from reserved and
> -  * unimplemented registers return 0
> -  */
> - if (copy_to_user(buf, &val, count))
> - return -EFAULT;
> -
> - return ret;
> + if (iswrite) {
> + if (copy_from_user(&val, buf, count))
> + return -EFAULT;
> + ret = vfio_user_config_write(vdev->pdev, (int)(*ppos),
> +  val, count);
> + return ret ? ret : count;
> + } else {
> + ret = vfio_user_config_read(vdev->pdev, (int)(*ppos),
> + &val, count);
> + if (ret)
> + return ret;
> + if (copy_to_user(buf, &val, count))
> + return -EFAULT;
> + return count;
> + }
>   }
>  
>   /*



___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH 5/5 v11] iommu/fsl: Freescale PAMU driver and iommu implementation.

2013-04-03 Thread Alex Williamson
On Tue, 2013-04-02 at 18:18 +0200, Joerg Roedel wrote:
> Cc'ing Alex Williamson
> 
> Alex, can you please review the iommu-group part of this patch?

Sure, it looks pretty reasonable.  AIUI, all PCI devices are below some
kind of host bridge that is either new and supports partitioning or old
and doesn't.  I don't know if that's a visibility or isolation
requirement, perhaps PCI ACS-ish.  In the new host bridge case, each
device gets a group.  This seems not to have any quirks for
multifunction devices though.  On AMD and Intel IOMMUs we test
multifunction device ACS support to determine whether all the functions
should be in the same group.  Is there any reason to trust multifunction
devices on PAMU?

I also find it curious what happens to the iommu group of the host
bridge.  In the partitionable case the host bridge group is removed, in
the non-partitionable case the host bridge group becomes the group for
the children, removing the host bridge.  It's unique to PAMU so far that
these host bridges are even in an iommu group (x86 only adds pci
devices), but I don't see it as necessarily wrong leaving it in either
scenario.  Does it solve some problem to remove them from the groups?
Thanks,

Alex

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH 5/5 v11] iommu/fsl: Freescale PAMU driver and iommu implementation.

2013-04-04 Thread Alex Williamson
On Thu, 2013-04-04 at 13:00 +0000, Sethi Varun-B16395 wrote:
> 
> > -Original Message-
> > From: Alex Williamson [mailto:alex.william...@redhat.com]
> > Sent: Wednesday, April 03, 2013 11:32 PM
> > To: Joerg Roedel
> > Cc: Sethi Varun-B16395; Yoder Stuart-B08248; Wood Scott-B07421;
> > io...@lists.linux-foundation.org; linuxppc-dev@lists.ozlabs.org; linux-
> > ker...@vger.kernel.org; ga...@kernel.crashing.org;
> > b...@kernel.crashing.org
> > Subject: Re: [PATCH 5/5 v11] iommu/fsl: Freescale PAMU driver and iommu
> > implementation.
> > 
> > On Tue, 2013-04-02 at 18:18 +0200, Joerg Roedel wrote:
> > > Cc'ing Alex Williamson
> > >
> > > Alex, can you please review the iommu-group part of this patch?
> > 
> > Sure, it looks pretty reasonable.  AIUI, all PCI devices are below some
> > kind of host bridge that is either new and supports partitioning or old
> > and doesn't.  I don't know if that's a visibility or isolation
> > requirement, perhaps PCI ACS-ish.  In the new host bridge case, each
> > device gets a group.  This seems not to have any quirks for multifunction
> > devices though.  On AMD and Intel IOMMUs we test multifunction device ACS
> > support to determine whether all the functions should be in the same
> > group.  Is there any reason to trust multifunction devices on PAMU?
> > 
> [Sethi Varun-B16395] In the case where we can partition endpoints we
> can distinguish transactions based on the bus,device,function number
> combination. This support is available in the PCIe controller (host
> bridge).

So can x86 IOMMUs, that's the visibility aspect of IOMMU groups.
Visibility alone doesn't necessarily imply that a device is isolated
though.  A multifunction PCI device that doesn't expose ACS support may
not isolate functions from each other.  For example a peer-to-peer DMA
between functions may not be translated by the upstream IOMMU.  IOMMU
groups should encompass both visibility and isolation.

> > I also find it curious what happens to the iommu group of the host
> > bridge.  In the partitionable case the host bridge group is removed, in
> > the non-partitionable case the host bridge group becomes the group for
> > the children, removing the host bridge.  It's unique to PAMU so far that
> > these host bridges are even in an iommu group (x86 only adds pci
> > devices), but I don't see it as necessarily wrong leaving it in either
> > scenario.  Does it solve some problem to remove them from the groups?
> > Thanks,
> [Sethi Varun-B16395] The PCIe controller isn't a partitionable entity,
> it would always be owned by the host.

Ownership of a device shouldn't play into the group context.  An IOMMU
group should be defined by its visibility and isolation from other
devices.  Whether the PCIe controller is allowed to be handed to
userspace is a question for VFIO.  Thanks,

Alex

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH 5/5 v11] iommu/fsl: Freescale PAMU driver and iommu implementation.

2013-04-04 Thread Alex Williamson
On Thu, 2013-04-04 at 16:35 +0000, Sethi Varun-B16395 wrote:
> 
> > -Original Message-
> > From: Alex Williamson [mailto:alex.william...@redhat.com]
> > Sent: Thursday, April 04, 2013 8:52 PM
> > To: Sethi Varun-B16395
> > Cc: Joerg Roedel; Yoder Stuart-B08248; Wood Scott-B07421;
> > io...@lists.linux-foundation.org; linuxppc-dev@lists.ozlabs.org; linux-
> > ker...@vger.kernel.org; ga...@kernel.crashing.org;
> > b...@kernel.crashing.org
> > Subject: Re: [PATCH 5/5 v11] iommu/fsl: Freescale PAMU driver and iommu
> > implementation.
> > 
> > On Thu, 2013-04-04 at 13:00 +, Sethi Varun-B16395 wrote:
> > >
> > > > -Original Message-
> > > > From: Alex Williamson [mailto:alex.william...@redhat.com]
> > > > Sent: Wednesday, April 03, 2013 11:32 PM
> > > > To: Joerg Roedel
> > > > Cc: Sethi Varun-B16395; Yoder Stuart-B08248; Wood Scott-B07421;
> > > > io...@lists.linux-foundation.org; linuxppc-dev@lists.ozlabs.org;
> > > > linux- ker...@vger.kernel.org; ga...@kernel.crashing.org;
> > > > b...@kernel.crashing.org
> > > > Subject: Re: [PATCH 5/5 v11] iommu/fsl: Freescale PAMU driver and
> > > > iommu implementation.
> > > >
> > > > On Tue, 2013-04-02 at 18:18 +0200, Joerg Roedel wrote:
> > > > > Cc'ing Alex Williamson
> > > > >
> > > > > Alex, can you please review the iommu-group part of this patch?
> > > >
> > > > Sure, it looks pretty reasonable.  AIUI, all PCI devices are below
> > > > some kind of host bridge that is either new and supports
> > > > partitioning or old and doesn't.  I don't know if that's a
> > > > visibility or isolation requirement, perhaps PCI ACS-ish.  In the
> > > > new host bridge case, each device gets a group.  This seems not to
> > > > have any quirks for multifunction devices though.  On AMD and Intel
> > > > IOMMUs we test multifunction device ACS support to determine whether
> > > > all the functions should be in the same group.  Is there any reason
> > > > > to trust multifunction devices on PAMU?
> > > >
> > > [Sethi Varun-B16395] In the case where we can partition endpoints we
> > > can distinguish transactions based on the bus,device,function number
> > > combination. This support is available in the PCIe controller (host
> > > bridge).
> > 
> > So can x86 IOMMUs, that's the visibility aspect of IOMMU groups.
> > Visibility alone doesn't necessarily imply that a device is isolated
> > though.  A multifunction PCI device that doesn't expose ACS support may
> > not isolate functions from each other.  For example a peer-to-peer DMA
> > between functions may not be translated by the upstream IOMMU.  IOMMU
> > groups should encompass both visibility and isolation.
> [Sethi Varun-B16395] We can isolate the DMA access to the host based
> on the to the pci bus,device,function number.

The IOMMU can only isolate DMA that it can see.  A multifunction device
may never expose peer-to-peer DMA to the upstream device; it's
implementation specific.  The ACS flags allow that possibility to be
controlled and prevented.

> I thought that was enough to put devices in to separate iommu groups.
> This is a PCIe controller property which allows us to partition PCIe
> devices. But, what I can understand from your point is that we also
> need to consider isolation at PCIe device level as well. I will check
> for the case of multifunction devices.
> 
> > 
> > > > I also find it curious what happens to the iommu group of the host
> > > > bridge.  In the partitionable case the host bridge group is removed,
> > > > in the non-partitionable case the host bridge group becomes the
> > > > group for the children, removing the host bridge.  It's unique to
> > > > PAMU so far that these host bridges are even in an iommu group (x86
> > > > only adds pci devices), but I don't see it as necessarily wrong
> > > > leaving it in either scenario.  Does it solve some problem to remove
> > > > > them from the groups?
> > > > Thanks,
> > > [Sethi Varun-B16395] The PCIe controller isn't a partitionable entity,
> > > it would always be owned by the host.
> > 
> > Ownership of a device shouldn't play into the group context.  An IOMMU
> > group should be defined by it's visibility and isolation from other
> > devices.  Whether the PCIe controller is allowed to be handed to
> > userspace is a question for

Re: [PATCH RFC 00/22] EEH Support for VFIO PCI devices on PowerKVM guest

2014-05-05 Thread Alex Williamson
On Mon, 2014-05-05 at 13:56 +0200, Alexander Graf wrote:
> On 05/05/2014 03:27 AM, Gavin Shan wrote:
> > The series of patches intends to support EEH for PCI devices, which have 
> > been
> > passed through to PowerKVM based guest via VFIO. The implementation is
> > straightforward based on the issues or problems we have to resolve to 
> > support
> > EEH for PowerKVM based guest.
> >
> > - Emulation for EEH RTAS requests. Thankfully, we already have 
> > infrastructure
> >to emulate XICS. Without introducing new mechanism, we just extend that
> >existing infrastructure to support EEH RTAS emulation. EEH RTAS requests
> >initiated from guest are posted to host where the requests get handled or
> >delivered to underlying firmware for further handling. For that, the host 
> > kernel
> >has to maintain the PCI address (host domain/bus/slot/function to guest's
> >PHB BUID/bus/slot/function) mapping via KVM VFIO device. The address 
> > mapping
> >will be built when initializing VFIO device in QEMU and destroyed when 
> > the
> >VFIO device in QEMU is going to offline, or VM is destroyed.
> 
> Do you also expose all those interfaces to user space? VFIO is as much 
> about user space device drivers as it is about device assignment.
> 
> I would like to first see an implementation that doesn't touch KVM 
> emulation code at all but instead routes everything through QEMU. As a 
> second step we can then accelerate performance critical paths inside of KVM.
> 
> That way we ensure that user space device drivers have all the power 
> over a device they need to drive it.

+1



___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH 3/8] drivers/vfio: New IOCTL command VFIO_EEH_INFO

2014-05-19 Thread Alex Williamson
On Wed, 2014-05-14 at 14:11 +1000, Gavin Shan wrote:
> The patch adds new IOCTL command VFIO_EEH_INFO to VFIO container
> to support EEH functionality for PCI devices, which have been
> passed from host to guest via VFIO.

Some comments throughout, but overall this seems to forgo every bit of
the device ownership and protection model used by VFIO and lets the user
pick arbitrary host devices and do various operations, mostly unchecked.
That's not acceptable.

> Signed-off-by: Gavin Shan 
> ---
>  arch/powerpc/platforms/powernv/Makefile   |   1 +
>  arch/powerpc/platforms/powernv/eeh-vfio.c | 593 
> ++
>  drivers/vfio/vfio_iommu_spapr_tce.c   |  12 +
>  include/uapi/linux/vfio.h |  57 +++
>  4 files changed, 663 insertions(+)
>  create mode 100644 arch/powerpc/platforms/powernv/eeh-vfio.c
> 
> diff --git a/arch/powerpc/platforms/powernv/Makefile 
> b/arch/powerpc/platforms/powernv/Makefile
> index 63cebb9..2b15a03 100644
> --- a/arch/powerpc/platforms/powernv/Makefile
> +++ b/arch/powerpc/platforms/powernv/Makefile
> @@ -6,5 +6,6 @@ obj-y += opal-msglog.o
>  obj-$(CONFIG_SMP)+= smp.o
>  obj-$(CONFIG_PCI)+= pci.o pci-p5ioc2.o pci-ioda.o
>  obj-$(CONFIG_EEH)+= eeh-ioda.o eeh-powernv.o
> +obj-$(CONFIG_VFIO_EEH)   += eeh-vfio.o
>  obj-$(CONFIG_PPC_SCOM)   += opal-xscom.o
>  obj-$(CONFIG_MEMORY_FAILURE) += opal-memory-errors.o
> diff --git a/arch/powerpc/platforms/powernv/eeh-vfio.c 
> b/arch/powerpc/platforms/powernv/eeh-vfio.c
> new file mode 100644
> index 000..69d5f2d
> --- /dev/null
> +++ b/arch/powerpc/platforms/powernv/eeh-vfio.c
> @@ -0,0 +1,593 @@
> +/*
> +  * The file intends to support EEH funtionality for those PCI devices,
> +  * which have been passed through from host to guest via VFIO. So this
> +  * file is naturally part of VFIO implementation on PowerNV platform.
> +  *
> +  * Copyright Benjamin Herrenschmidt & Gavin Shan, IBM Corporation 2014.
> +  *
> +  * This program is free software; you can redistribute it and/or modify
> +  * it under the terms of the GNU General Public License as published by
> +  * the Free Software Foundation; either version 2 of the License, or
> +  * (at your option) any later version.
> +  */
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +#include "powernv.h"
> +#include "pci.h"
> +
> +static int powernv_eeh_vfio_map(struct vfio_eeh_info *info)
> +{
> + struct pci_bus *bus, *pe_bus;
> + struct pci_dev *pdev;
> + struct eeh_dev *edev;
> + struct eeh_pe *pe;
> + int domain, bus_no, devfn;
> +
> + /* Host address */
> + domain = info->map.host_domain;
> + bus_no = (info->map.host_cfg_addr >> 8) & 0xff;
> + devfn = info->map.host_cfg_addr & 0xff;

Where are we validating that the user has any legitimate claim to be
touching this device?

> + /* Find PCI bus */
> + bus = pci_find_bus(domain, bus_no);
> + if (!bus) {
> + pr_warn("%s: PCI bus %04x:%02x not found\n",
> + __func__, domain, bus_no);
> + return -ENODEV;
> + }
> +
> + /* Find PCI device */
> + pdev = pci_get_slot(bus, devfn);
> + if (!pdev) {
> + pr_warn("%s: PCI device %04x:%02x:%02x.%01x not found\n",
> + __func__, domain, bus_no,
> + PCI_SLOT(devfn), PCI_FUNC(devfn));
> + return -ENODEV;
> + }
> +
> + /* No EEH device - almost impossible */
> + edev = pci_dev_to_eeh_dev(pdev);
> + if (unlikely(!edev)) {
> + pci_dev_put(pdev);
> + pr_warn("%s: No EEH dev for PCI device %s\n",
> + __func__, pci_name(pdev));
> + return -ENODEV;
> + }
> +
> + /* Doesn't support PE migration between different PHBs */
> + pe = edev->pe;
> + if (!eeh_pe_passed(pe)) {
> + pe_bus = eeh_pe_bus_get(pe);
> + BUG_ON(!pe_bus);

Can a user trigger this maliciously?

> +
> + /* PE# has format 00BBSS00 */
> + pe->guest_addr.buid= info->map.guest_buid;
> + pe->guest_addr.pe_addr = pe_bus->number << 16;
> + eeh_pe_set_passed(pe, true);
> + } else if (pe->guest_addr.buid != info->map.guest_buid) {
> + pci_dev_put(pdev);
> + pr_warn("%s: Mismatched PHB BUID (0x%llx, 0x%llx)\n",
> + __func__, pe->guest_addr.buid, info->map.guest_buid);
> + return -EINVAL;
> + }
> +
> + edev->guest_addr.buid = info->map.guest_buid;
> + edev->guest_addr.config_addr = info->map.guest_cfg_addr;
> + eeh_dev_set_passed(edev, true);
> +
> + pr_debug("EEH: Host PCI dev %s to %llx-%02x:%02x.%01x\n",
> +  pci_name(pdev), info->map.guest_buid,
> +

Re: [PATCH 3/8] drivers/vfio: New IOCTL command VFIO_EEH_INFO

2014-05-19 Thread Alex Williamson
On Tue, 2014-05-20 at 10:22 +1000, Gavin Shan wrote:
> On Mon, May 19, 2014 at 04:33:10PM -0600, Alex Williamson wrote:
> >On Wed, 2014-05-14 at 14:11 +1000, Gavin Shan wrote:
> >> The patch adds new IOCTL command VFIO_EEH_INFO to VFIO container
> >> to support EEH functionality for PCI devices, which have been
> >> passed from host to guest via VFIO.
> 
> Thanks for your comments, Alex.W :-)
> 
> >
> >Some comments throughout, but overall this seems to forgo every bit of
> >the device ownership and protection model used by VFIO and lets the user
> >pick arbitrary host devices and do various operations, mostly unchecked.
> >That's not acceptable.
> >
> 
> As what I replied to patch[2], I'm going to let VFIO-PCI-dev fd handle
> the newly introduced IOCTL command. That way, we should follow the VFIO
> design principles (ownership and protection) because VFIO-PCI-dev fd
> is owned by QEMU process usually.
> 
> Also, the address mapping maintained in EEH will be removed.
> 
> >> Signed-off-by: Gavin Shan 
> >> ---
> >>  arch/powerpc/platforms/powernv/Makefile   |   1 +
> >>  arch/powerpc/platforms/powernv/eeh-vfio.c | 593 
> >> ++
> >>  drivers/vfio/vfio_iommu_spapr_tce.c   |  12 +
> >>  include/uapi/linux/vfio.h |  57 +++
> >>  4 files changed, 663 insertions(+)
> >>  create mode 100644 arch/powerpc/platforms/powernv/eeh-vfio.c
> >> 
> >> diff --git a/arch/powerpc/platforms/powernv/Makefile 
> >> b/arch/powerpc/platforms/powernv/Makefile
> >> index 63cebb9..2b15a03 100644
> >> --- a/arch/powerpc/platforms/powernv/Makefile
> >> +++ b/arch/powerpc/platforms/powernv/Makefile
> >> @@ -6,5 +6,6 @@ obj-y  += opal-msglog.o
> >>  obj-$(CONFIG_SMP) += smp.o
> >>  obj-$(CONFIG_PCI) += pci.o pci-p5ioc2.o pci-ioda.o
> >>  obj-$(CONFIG_EEH) += eeh-ioda.o eeh-powernv.o
> >> +obj-$(CONFIG_VFIO_EEH)+= eeh-vfio.o
> >>  obj-$(CONFIG_PPC_SCOM)+= opal-xscom.o
> >>  obj-$(CONFIG_MEMORY_FAILURE)  += opal-memory-errors.o
> >> diff --git a/arch/powerpc/platforms/powernv/eeh-vfio.c 
> >> b/arch/powerpc/platforms/powernv/eeh-vfio.c
> >> new file mode 100644
> >> index 000..69d5f2d
> >> --- /dev/null
> >> +++ b/arch/powerpc/platforms/powernv/eeh-vfio.c
> >> @@ -0,0 +1,593 @@
> >> +/*
> >> +  * The file intends to support EEH funtionality for those PCI devices,
> >> +  * which have been passed through from host to guest via VFIO. So this
> >> +  * file is naturally part of VFIO implementation on PowerNV platform.
> >> +  *
> >> +  * Copyright Benjamin Herrenschmidt & Gavin Shan, IBM Corporation 2014.
> >> +  *
> >> +  * This program is free software; you can redistribute it and/or modify
> >> +  * it under the terms of the GNU General Public License as published by
> >> +  * the Free Software Foundation; either version 2 of the License, or
> >> +  * (at your option) any later version.
> >> +  */
> >> +
> >> +#include 
> >> +#include 
> >> +#include 
> >> +#include 
> >> +#include 
> >> +#include 
> >> +#include 
> >> +#include 
> >> +#include 
> >> +
> >> +#include 
> >> +#include 
> >> +#include 
> >> +#include 
> >> +#include 
> >> +#include 
> >> +#include 
> >> +#include 
> >> +#include 
> >> +#include 
> >> +
> >> +#include "powernv.h"
> >> +#include "pci.h"
> >> +
> >> +static int powernv_eeh_vfio_map(struct vfio_eeh_info *info)
> >> +{
> >> +  struct pci_bus *bus, *pe_bus;
> >> +  struct pci_dev *pdev;
> >> +  struct eeh_dev *edev;
> >> +  struct eeh_pe *pe;
> >> +  int domain, bus_no, devfn;
> >> +
> >> +  /* Host address */
> >> +  domain = info->map.host_domain;
> >> +  bus_no = (info->map.host_cfg_addr >> 8) & 0xff;
> >> +  devfn = info->map.host_cfg_addr & 0xff;
> >
> >Where are we validating that the user has any legitimate claim to be
> >touching this device?
> >
> 
> I'll let VFIO-PCI-dev fd handle the IOCTL command. With that, we shouldn't
> have the problem.
> 
> >> +  /* Find PCI bus */
> >> +  bus = pci_find_bus(domain, bus_no);
> >> +  if (!bus) {
> >> +  pr_warn("%s: PCI bus %04x:%02x no

Re: [PATCH v6 2/3] drivers/vfio: EEH support for VFIO PCI device

2014-05-22 Thread Alex Williamson
On Thu, 2014-05-22 at 18:23 +1000, Gavin Shan wrote:
> The patch adds new IOCTL commands for VFIO PCI device to support
> EEH functionality for PCI devices, which have been passed through
> from host to somebody else via VFIO.
> 
> Signed-off-by: Gavin Shan 
> ---
>  Documentation/vfio.txt |  88 ++-
>  arch/powerpc/include/asm/eeh.h |  17 +++
>  arch/powerpc/kernel/eeh.c  | 321 
> +
>  drivers/vfio/pci/vfio_pci.c| 131 -
>  include/uapi/linux/vfio.h  |  53 +++
>  5 files changed, 603 insertions(+), 7 deletions(-)

Maybe a chicken and egg problem, but it seems like we could split the
platform code and vfio code into separate patches.

> 
> diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
> index b9ca023..dd13db6 100644
> --- a/Documentation/vfio.txt
> +++ b/Documentation/vfio.txt
> @@ -305,7 +305,10 @@ faster, the map/unmap handling has been implemented in 
> real mode which provides
>  an excellent performance which has limitations such as inability to do
>  locked pages accounting in real time.
>  
> -So 3 additional ioctls have been added:
> +4) PPC64 guests detect PCI errors and recover from them via EEH RTAS 
> services,
> +which works on the basis of additional ioctl commands.
> +
> +So 8 additional ioctls have been added:
>  
>   VFIO_IOMMU_SPAPR_TCE_GET_INFO - returns the size and the start
>   of the DMA window on the PCI bus.
> @@ -316,6 +319,20 @@ So 3 additional ioctls have been added:
>  
>   VFIO_IOMMU_DISABLE - disables the container.
>  
> + VFIO_EEH_PE_SET_OPTION - enables or disables EEH functinality on the
> + specified device. Also, it can be used to remove IO or DMA
> + stopped state on the frozen PE.
> +
> + VFIO_EEH_PE_GET_ADDR - retrieve the unique address of the specified
> + PE or query PE sharing mode.
> +
> + VFIO_EEH_PE_GET_STATE - retrieve PE's state: frozen or normal state.
> +
> + VFIO_EEH_PE_RESET - do PE reset, which is one of the major steps for
> + error recovering.
> +
> + VFIO_EEH_PE_CONFIGURE - configure the PCI bridges after PE reset. It's
> + one of the major steps for error recoverying.
>  
>  The code flow from the example above should be slightly changed:
>  
> @@ -346,6 +363,75 @@ The code flow from the example above should be slightly 
> changed:
>   ioctl(container, VFIO_IOMMU_MAP_DMA, &dma_map);
>   .
>  
> +Based on the initial example we have, the following piece of code could be
> +reference for EEH setup and error handling:
> +
> + struct vfio_eeh_pe_set_option option = { .argsz = sizeof(option) };
> + struct vfio_eeh_pe_get_addr addr = { .argsz = sizeof(addr) };
> + struct vfio_eeh_pe_get_state state = { .argsz = sizeof(state) };
> + struct vfio_eeh_pe_reset reset = { .argsz = sizeof(reset) };
> + struct vfio_eeh_pe_configure config = { .argsz = sizeof(config) };
> +
> + 
> +
> + /* Get a file descriptor for the device */
> + device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, ":06:0d.0");
> +
> + /* Enable the EEH functionality on the device */
> + option.option = EEH_OPT_ENABLE;
> + ioctl(device, VFIO_EEH_PE_SET_OPTION, &option);
> +
> + /* Retrieve PE address and create and maintain PE by yourself */
> + addr.option = EEH_OPT_GET_PE_ADDR;
> + ioctl(device, VFIO_EEH_PE_GET_ADDR, &addr);
> +
> + /* Assure EEH is supported on the PE and make PE functional */
> + ioctl(device, VFIO_EEH_PE_GET_STATE, &state);
> +
> + /* Save device's state. pci_save_state() would be good enough
> +  * as an example.
> +  */
> +
> + /* Test and setup the device */
> + ioctl(device, VFIO_DEVICE_GET_INFO, &device_info);
> +
> + 
> +
> + /* When 0xFF's returned from reading PCI config space or IO BARs
> +  * of the PCI device. Check the PE state to see if that has been
> +  * frozen.
> +  */
> + ioctl(device, VFIO_EEH_PE_GET_STATE, &state);

There's no notification, the user needs to observe the return value and
poll?  Should we be enabling an eventfd to notify the user of the state
change?

> +
> + /* Waiting for pending PCI transactions to be completed and don't
> +  * produce any more PCI traffic from/to the affected PE until
> +  * recovery is finished.
> +  */
> +
> + /* Enable IO for the affected PE and collect logs. Usually, the
> +  * standard part of PCI config space, AER registers are dumped
> +  * as logs for further analysis.
> +  */
> + option.option = EEH_OPT_THAW_MMIO;
> + ioctl(device, VFIO_EEH_PE_SET_OPTION, &option);

How does the guest learn about the error?  Does it need to?
> +
> + /* Issue PE reset */
> + reset.option = EEH_RESET_HOT;
> + ioctl(device, VFIO_EEH_PE_RESET, &reset);
> +
> + /* Configure the PCI bridges for the affected PE */
> + ioctl(device, VFIO_EEH_PE_CONFIGURE

Re: [PATCH v6 2/3] drivers/vfio: EEH support for VFIO PCI device

2014-05-22 Thread Alex Williamson
On Fri, 2014-05-23 at 10:37 +1000, Gavin Shan wrote:
> On Fri, May 23, 2014 at 10:17:30AM +1000, Gavin Shan wrote:
> >On Thu, May 22, 2014 at 11:55:29AM +0200, Alexander Graf wrote:
> >>On 22.05.14 10:23, Gavin Shan wrote:
> 
> .../...
> 
> >>>diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> >>>index cb9023d..ef55682 100644
> >>>--- a/include/uapi/linux/vfio.h
> >>>+++ b/include/uapi/linux/vfio.h
> >>>@@ -455,6 +455,59 @@ struct vfio_iommu_spapr_tce_info {
> >>>  #define VFIO_IOMMU_SPAPR_TCE_GET_INFO_IO(VFIO_TYPE, VFIO_BASE + 12)
> >>>+/*
> >>>+ * EEH functionality can be enabled or disabled on one specific device.
> >>>+ * Also, the DMA or IO frozen state can be removed from the frozen PE
> >>>+ * if required.
> >>>+ */
> >>>+struct vfio_eeh_pe_set_option {
> >>>+  __u32 argsz;
> >>
> >>What is this argsz thing? Is this your way of maintaining backwards
> >>compatibility when we introduce new fields? A new field will change
> >>the ioctl number, so I don't think that makes a lot of sense :).
> >>
> >>Just make the ioctl have a u32 as incoming argument. No fancy
> >>structs, no complicated code.
> >>
> >>The same applies for a number of structs below.
> >>
> >
> >ok. Will do in next revision.
> >
> 
> Rechecked include/uapi/linux/vfio.h, the data struct for each ioctl command
> always has "argsz". I guess it was used as checker by Alex.W. Do you really
> want remove "argsz" ?


IIRC, this was actually a suggestion incorporated from David Gibson, but
using _IO with an argsz and flags field we can maintain compatibility
without bumping the ioctl number.  It really only makes sense if we have
a flags field so we can identify what additional information is being
provided.  Flags can be used as a bitmap of trailing structures or as
revision if we want a set of trailing structures that may change over
time.  Unless you can come up with a good argument against it that would
prevent us inventing a new ioctl as soon as we need a minor tweak, I'd
prefer to keep it.  As I noted in a previous comment, the one ioctl we
have for reset that doesn't take any options is likely going to be the
first ioctl that we need to entirely replace.  If we don't keep argsz,
it seems like we probably need a flags field and reserved structures.

> >>>+  __u32 option;
> >>>+};
> >>>+
> >>>+#define VFIO_EEH_PE_SET_OPTION_IO(VFIO_TYPE, VFIO_BASE + 21)
> >>>+
> >>>+/*
> >>>+ * Each EEH PE should have unique address to be identified. The command
> >>>+ * helps to retrieve the address and the sharing mode of the PE.
> >>>+ */
> >>>+struct vfio_eeh_pe_get_addr {
> >>>+  __u32 argsz;
> >>>+  __u32 option;
> >>>+  __u32 info;
> >>
> >>Any particular reason you need the info field? Can't the return value
> >>of the ioctl hold this? Then you only have a single u32 argument left
> >>to the ioctl again.
> >>
> >
> >ok. Will do in next revision.
> >
> 
> If we eventually remove "argsz" and let ioctl() return value to hold
> information (or negative number for errors), we don't need any data
> struct because the 3rd parameter of ioctl() would be used as input
> and I only need one input parameter. Do you want see this ?
> 
> Hopefully, Alex.W saw this and hasn't objections :)

I'm not sure why we're pushing for the minimal data set to pass to an
ioctl.  Seems like a recipe for dead, useless ioctls.  Thanks,

Alex


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v6 2/3] drivers/vfio: EEH support for VFIO PCI device

2014-05-23 Thread Alex Williamson
On Fri, 2014-05-23 at 08:52 +0200, Alexander Graf wrote:
> 
> > Am 23.05.2014 um 05:23 schrieb Alex Williamson :
> > 
> >> On Fri, 2014-05-23 at 10:37 +1000, Gavin Shan wrote:
> >>> On Fri, May 23, 2014 at 10:17:30AM +1000, Gavin Shan wrote:
> >>>> On Thu, May 22, 2014 at 11:55:29AM +0200, Alexander Graf wrote:
> >>>> On 22.05.14 10:23, Gavin Shan wrote:
> >> 
> >> .../...
> >> 
> >>>>> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> >>>>> index cb9023d..ef55682 100644
> >>>>> --- a/include/uapi/linux/vfio.h
> >>>>> +++ b/include/uapi/linux/vfio.h
> >>>>> @@ -455,6 +455,59 @@ struct vfio_iommu_spapr_tce_info {
> >>>>> #define VFIO_IOMMU_SPAPR_TCE_GET_INFO_IO(VFIO_TYPE, VFIO_BASE + 12)
> >>>>> +/*
> >>>>> + * EEH functionality can be enabled or disabled on one specific device.
> >>>>> + * Also, the DMA or IO frozen state can be removed from the frozen PE
> >>>>> + * if required.
> >>>>> + */
> >>>>> +struct vfio_eeh_pe_set_option {
> >>>>> +__u32 argsz;
> >>>> 
> >>>> What is this argsz thing? Is this your way of maintaining backwards
> >>>> compatibility when we introduce new fields? A new field will change
> >>>> the ioctl number, so I don't think that makes a lot of sense :).
> >>>> 
> >>>> Just make the ioctl have a u32 as incoming argument. No fancy
> >>>> structs, no complicated code.
> >>>> 
> >>>> The same applies for a number of structs below.
> >>> 
> >>> ok. Will do in next revision.
> >> 
> >> Rechecked include/uapi/linux/vfio.h, the data struct for each ioctl command
> >> always has "argsz". I guess it was used as checker by Alex.W. Do you really
> >> want remove "argsz" ?
> > 
> > 
> > IIRC, this was actually a suggestion incorporated from David Gibson, but
> > using _IO with an argsz and flags field we can maintain compatibility
> > without bumping the ioctl number.  It really only makes sense if we have
> > a flags field so we can identify what additional information is being
> > provided.  Flags can be used as a bitmap of trailing structures or as
> > revision if we want a set of trailing structures that may change over
> > time.  Unless you can come up with a good argument against it that would
> > prevent us inventing a new ioctl as soon as we need a minor tweak, I'd
> > prefer to keep it.  As I noted in a previous comment, the one ioctl we
> > have for reset that doesn't take any options is likely going to be the
> > first ioctl that we need to entirely replace.  If we don't keep argsz,
> > it seems like we probably need a flags field and reserved structures.
> > 
> >>>>> +__u32 option;
> >>>>> +};
> >>>>> +
> >>>>> +#define VFIO_EEH_PE_SET_OPTION_IO(VFIO_TYPE, VFIO_BASE + 21)
> >>>>> +
> >>>>> +/*
> >>>>> + * Each EEH PE should have unique address to be identified. The command
> >>>>> + * helps to retrieve the address and the sharing mode of the PE.
> >>>>> + */
> >>>>> +struct vfio_eeh_pe_get_addr {
> >>>>> +__u32 argsz;
> >>>>> +__u32 option;
> >>>>> +__u32 info;
> >>>> 
> >>>> Any particular reason you need the info field? Can't the return value
> >>>> of the ioctl hold this? Then you only have a single u32 argument left
> >>>> to the ioctl again.
> >>> 
> >>> ok. Will do in next revision.
> >> 
> >> If we eventually remove "argsz" and let ioctl() return value to hold
> >> information (or negative number for errors), we don't need any data
> >> struct because the 3rd parameter of ioctl() would be used as input
> >> and I only need one input parameter. Do you want see this ?
> >> 
> >> Hopefully, Alex.W saw this and hasn't objections :)
> > 
> > I'm not sure why we're pushing for the minimal data set to pass to an
> > ioctl.  Seems like a recipe for dead, useless ioctls.  Thanks,
> > 
> 
> The ioctl number includes sizeof(payload). So if a new parameter gets
> added, that would be a different ioctl number.

Not when we use _IO

> If you want to maintain backwards compatibility ioctl number wise in
> the kernel, you'll have to have a "flags" field to indicate whether
> new data is available and a "pad" field, prefarably in a union, that
> ensures the size of the struct doesn't change.
> 
> I'm not sure it's really necessary here to have identical ioctl
> numbers if we add parameters, since we can always just define a new
> ioctl with a bigger payload that can then become the default handler
> and a shim backwards compatible handler with the old number.
> 
> But if you think it is important, let's do it for real, not just
> halfway.
> 
> 
> Alex
> 



___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v6 2/3] drivers/vfio: EEH support for VFIO PCI device

2014-05-23 Thread Alex Williamson
On Fri, 2014-05-23 at 14:37 +1000, Gavin Shan wrote:
> On Thu, May 22, 2014 at 09:10:53PM -0600, Alex Williamson wrote:
> >On Thu, 2014-05-22 at 18:23 +1000, Gavin Shan wrote:
> >> The patch adds new IOCTL commands for VFIO PCI device to support
> >> EEH functionality for PCI devices, which have been passed through
> >> from host to somebody else via VFIO.
> >> 
> >> Signed-off-by: Gavin Shan 
> >> ---
> >>  Documentation/vfio.txt |  88 ++-
> >>  arch/powerpc/include/asm/eeh.h |  17 +++
> >>  arch/powerpc/kernel/eeh.c  | 321 
> >> +
> >>  drivers/vfio/pci/vfio_pci.c| 131 -
> >>  include/uapi/linux/vfio.h  |  53 +++
> >>  5 files changed, 603 insertions(+), 7 deletions(-)
> >
> >Maybe a chicken and egg problem, but it seems like we could split the
> >platform code and vfio code into separate patches.
> >
> 
> Ok. I'll keep egg/chicken separated in next revision.
> 
> >> 
> >> diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
> >> index b9ca023..dd13db6 100644
> >> --- a/Documentation/vfio.txt
> >> +++ b/Documentation/vfio.txt
> >> @@ -305,7 +305,10 @@ faster, the map/unmap handling has been implemented 
> >> in real mode which provides
> >>  an excellent performance which has limitations such as inability to do
> >>  locked pages accounting in real time.
> >>  
> >> -So 3 additional ioctls have been added:
> >> +4) PPC64 guests detect PCI errors and recover from them via EEH RTAS 
> >> services,
> >> +which works on the basis of additional ioctl commands.
> >> +
> >> +So 8 additional ioctls have been added:
> >>  
> >>VFIO_IOMMU_SPAPR_TCE_GET_INFO - returns the size and the start
> >>of the DMA window on the PCI bus.
> >> @@ -316,6 +319,20 @@ So 3 additional ioctls have been added:
> >>  
> >>VFIO_IOMMU_DISABLE - disables the container.
> >>  
> >> +  VFIO_EEH_PE_SET_OPTION - enables or disables EEH functinality on the
> >> +  specified device. Also, it can be used to remove IO or DMA
> >> +  stopped state on the frozen PE.
> >> +
> >> +  VFIO_EEH_PE_GET_ADDR - retrieve the unique address of the specified
> >> +  PE or query PE sharing mode.
> >> +
> >> +  VFIO_EEH_PE_GET_STATE - retrieve PE's state: frozen or normal state.
> >> +
> >> +  VFIO_EEH_PE_RESET - do PE reset, which is one of the major steps for
> >> +  error recovering.
> >> +
> >> +  VFIO_EEH_PE_CONFIGURE - configure the PCI bridges after PE reset. It's
> >> +  one of the major steps for error recoverying.
> >>  
> >>  The code flow from the example above should be slightly changed:
> >>  
> >> @@ -346,6 +363,75 @@ The code flow from the example above should be 
> >> slightly changed:
> >>ioctl(container, VFIO_IOMMU_MAP_DMA, &dma_map);
> >>.
> >>  
> >> +Based on the initial example we have, the following piece of code could be
> >> +reference for EEH setup and error handling:
> >> +
> >> +  struct vfio_eeh_pe_set_option option = { .argsz = sizeof(option) };
> >> +  struct vfio_eeh_pe_get_addr addr = { .argsz = sizeof(addr) };
> >> +  struct vfio_eeh_pe_get_state state = { .argsz = sizeof(state) };
> >> +  struct vfio_eeh_pe_reset reset = { .argsz = sizeof(reset) };
> >> +  struct vfio_eeh_pe_configure config = { .argsz = sizeof(config) };
> >> +
> >> +  
> >> +
> >> +  /* Get a file descriptor for the device */
> >> +  device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, ":06:0d.0");
> >> +
> >> +  /* Enable the EEH functionality on the device */
> >> +  option.option = EEH_OPT_ENABLE;
> >> +  ioctl(device, VFIO_EEH_PE_SET_OPTION, &option);
> >> +
> >> +  /* Retrieve PE address and create and maintain PE by yourself */
> >> +  addr.option = EEH_OPT_GET_PE_ADDR;
> >> +  ioctl(device, VFIO_EEH_PE_GET_ADDR, &addr);
> >> +
> >> +  /* Assure EEH is supported on the PE and make PE functional */
> >> +  ioctl(device, VFIO_EEH_PE_GET_STATE, &state);
> >> +
> >> +  /* Save device's state. pci_save_state() would be good enough
> >> +   * as an example.
> >> +   */
> >> +
> >> +  /* Test and setup the device */
> &

Re: [PATCH v6 2/3] drivers/vfio: EEH support for VFIO PCI device

2014-05-23 Thread Alex Williamson
On Fri, 2014-05-23 at 15:00 +1000, Benjamin Herrenschmidt wrote:
> On Fri, 2014-05-23 at 14:37 +1000, Gavin Shan wrote:
> > >There's no notification, the user needs to observe the return value an
> > >poll?  Should we be enabling an eventfd to notify the user of the state
> > >change?
> > >
> > 
> > Yes. The user needs to monitor the return value. we should have one 
> > notification,
> > but it's for later as we discussed :-)
> 
>  ../..
> 
> > >How does the guest learn about the error?  Does it need to?
> > 
> > When guest detects 0xFF's from reading PCI config space or IO, it's going
> > check the device (PE) state. If the device (PE) has been put into frozen
> > state, the recovery will be started.
> 
> Quick recap for Alex W (we discussed that with Alex G).
> 
> While a notification looks like a worthwhile addition in the long run, it
> is not sufficient and not used today and I prefer that we keep that as 
> something
> to add later for those two main reasons:
> 
>  - First, the kernel itself isn't always notified. For example, if we 
> implement
> on top of an RTAS backend (PR KVM under pHyp) or if we are on top of PowerNV 
> but
> the error is a PHB "fence" (the entire PCI Host bridge gets fenced out in 
> hardware
> due to an internal error), then we get no notification. Only polling of the
> hardware or firmware will tell us. Since we don't want to have a polling timer
> in the kernel, that means that the userspace client of VFIO (or alternatively
> the KVM guest) is the one that polls.
> 
>  - Second, this is how our primary user expects it: The primary (and only 
> initial)
> user of this will be qemu/KVM for PAPR guests and they don't have a 
> notification
> mechanism. Instead they query the EEH state after detecting an all 1's return 
> from
> MMIO or config space. This is how PAPR specifies it so we are just 
> implementing the
> spec here :-)
> 
> Because of these, I think we shouldn't worry too much about notification at
> this stage.

Ok, I was asking more about an error log that indicates what error
occurred to freeze the hardware so that the user can make a more
educated guess whether recovery is an option.  Given that you have cases
where there may be no notification and your guest/user already handles
this, the plan to start with polling makes sense.  Thanks,

Alex

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v6 2/3] drivers/vfio: EEH support for VFIO PCI device

2014-05-23 Thread Alex Williamson
On Fri, 2014-05-23 at 14:30 +0200, Alexander Graf wrote:
> On 23.05.14 13:58, Gavin Shan wrote:
> > On Fri, May 23, 2014 at 08:52:23AM +0200, Alexander Graf wrote:
> >>
> >>> Am 23.05.2014 um 05:23 schrieb Alex Williamson 
> >>> :
> >>>
> >>>> On Fri, 2014-05-23 at 10:37 +1000, Gavin Shan wrote:
> >>>>> On Fri, May 23, 2014 at 10:17:30AM +1000, Gavin Shan wrote:
> >>>>>> On Thu, May 22, 2014 at 11:55:29AM +0200, Alexander Graf wrote:
> >>>>>> On 22.05.14 10:23, Gavin Shan wrote:
> >>>> .../...
> >>>>
> >>>>>>> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> >>>>>>> index cb9023d..ef55682 100644
> >>>>>>> --- a/include/uapi/linux/vfio.h
> >>>>>>> +++ b/include/uapi/linux/vfio.h
> >>>>>>> @@ -455,6 +455,59 @@ struct vfio_iommu_spapr_tce_info {
> >>>>>>> #define VFIO_IOMMU_SPAPR_TCE_GET_INFO_IO(VFIO_TYPE, VFIO_BASE + 
> >>>>>>> 12)
> >>>>>>> +/*
> >>>>>>> + * EEH functionality can be enabled or disabled on one specific 
> >>>>>>> device.
> >>>>>>> + * Also, the DMA or IO frozen state can be removed from the frozen PE
> >>>>>>> + * if required.
> >>>>>>> + */
> >>>>>>> +struct vfio_eeh_pe_set_option {
> >>>>>>> +__u32 argsz;
> >>>>>> What is this argsz thing? Is this your way of maintaining backwards
> >>>>>> compatibility when we introduce new fields? A new field will change
> >>>>>> the ioctl number, so I don't think that makes a lot of sense :).
> >>>>>>
> >>>>>> Just make the ioctl have a u32 as incoming argument. No fancy
> >>>>>> structs, no complicated code.
> >>>>>>
> >>>>>> The same applies for a number of structs below.
> >>>>> ok. Will do in next revision.
> >>>> Rechecked include/uapi/linux/vfio.h, the data struct for each ioctl 
> >>>> command
> >>>> always has "argsz". I guess it was used as checker by Alex.W. Do you 
> >>>> really
> >>>> want to remove "argsz" ?
> >>>
> >>> IIRC, this was actually a suggestion incorporated from David Gibson, but
> >>> using _IO with an argsz and flags field we can maintain compatibility
> >>> without bumping the ioctl number.  It really only makes sense if we have
> >>> a flags field so we can identify what additional information is being
> >>> provided.  Flags can be used as a bitmap of trailing structures or as
> >>> revision if we want a set of trailing structures that may change over
> >>> time.  Unless you can come up with a good argument against it that would
> >>> prevent us inventing a new ioctl as soon as we need a minor tweak, I'd
> >>> prefer to keep it.  As I noted in a previous comment, the one ioctl we
> >>> have for reset that doesn't take any options is likely going to be the
> >>> first ioctl that we need to entirely replace.  If we don't keep argsz,
> >>> it seems like we probably need a flags field and reserved structures.
> >>>
> >>>>>>> +__u32 option;
> >>>>>>> +};
> >>>>>>> +
> >>>>>>> +#define VFIO_EEH_PE_SET_OPTION_IO(VFIO_TYPE, VFIO_BASE + 21)
> >>>>>>> +
> >>>>>>> +/*
> >>>>>>> + * Each EEH PE should have unique address to be identified. The 
> >>>>>>> command
> >>>>>>> + * helps to retrieve the address and the sharing mode of the PE.
> >>>>>>> + */
> >>>>>>> +struct vfio_eeh_pe_get_addr {
> >>>>>>> +__u32 argsz;
> >>>>>>> +__u32 option;
> >>>>>>> +__u32 info;
> >>>>>> Any particular reason you need the info field? Can't the return value
> >>>>>> of the ioctl hold this? Then you only have a single u32 argument left
> >>>>>> to the ioctl again.
> >>>>> ok. Will do in next revision.
> >>>> If we eventually remove "argsz" and let ioctl() return value to hold
> >>>>

Re: [PATCH v6 2/3] drivers/vfio: EEH support for VFIO PCI device

2014-05-27 Thread Alex Williamson
On Sat, 2014-05-24 at 12:06 +1000, Gavin Shan wrote:
> On Fri, May 23, 2014 at 08:29:59AM -0600, Alex Williamson wrote:
> >On Fri, 2014-05-23 at 14:37 +1000, Gavin Shan wrote:
> >> On Thu, May 22, 2014 at 09:10:53PM -0600, Alex Williamson wrote:
> >> >On Thu, 2014-05-22 at 18:23 +1000, Gavin Shan wrote:
> 
> .../...
> 
> >No, sorry, I mean how does the user get information about the error?
> >The interface we have here is:
> >a) find that something bad has happened
> >b) kick it into working again
> >c) continue
> >
> >How does the user figure out what happened and if it makes sense to
> >attempt to recover?  Where does the user learn that their disk is on
> >fire?
> >
> 
> When 0xFF's returned from config or IO read, user should check the
> device (PE)'s state with ioctl command VFIO_EEH_PE_GET_STATE. If the
> device (PE) has been put into "frozen" state, it's confirmed the device
> ("disk" you mentioned) is on fire.

No, this only confirms that something bad happened, not _what_ bad thing
happened.

>  User should kick off recovery, which
> includes:

And here you're just describing the kick operation again...

> 
> - User stops any operations (config, IO, DMA) on the device because any
>   PCI traffic to "frozen" device will be dropped from software or hardware
>   level. Also, we don't expect DMA traffic during recovery. Otherwise,
>   we will bump into recursive errors and the recovery should fail.
> - VFIO_EEH_PE_SET_OPTION to enable I/O path ("DMA" path is still under frozen
>   state). EEH_VFIO_PE_CONFIGURE to reconfigure affected PCI bridges and then
>   do error log retrieval.

These logs, where do they go?  How does the user get access?  That's
what I'm trying to ask about.

> - VFIO_EEH_PE_RESET to reset the affected device (PE). EEH_VFIO_PE_CONFIGURE
>   to restore BARs.
> - User resumes the device to start PCI traffic and device is brought to
>   functional state.
> 
> .../...
> 
> >
> >No, I prefer to stay consistent with the rest of the VFIO API and use
> >argsz + flags.
> >
> 
> Here's the recap for previous reply: I have several cases for ioctl().
> 
> - ioctl(fd, cmd, NULL):   I needn't any input info.
> - ioctl(fd, cmd, &data):  I need input info
> 
> For all the cases, should I simply have a data struct to include 
> "argsz+flags"?

Anything that requires data should have argsz+flags, if it doesn't
require data, it doesn't need them, but think long and hard about whether
there's any possibility that we'll need parameters in the future.

> For return value from ioctl(), can we simply have an additional field in the
> above data struct to carry it? "0" is the information I have to return for
> some of the cases.

If for instance your ioctl is returning something like "number of
errors", then it's perfectly fine to use that as the ioctl return.  <0
is error, >= zero is a success with value.

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v7 3/3] drivers/vfio: EEH support for VFIO PCI device

2014-05-27 Thread Alex Williamson
On Tue, 2014-05-27 at 18:40 +1000, Gavin Shan wrote:
> The patch adds new IOCTL commands for sPAPR VFIO container device
> to support EEH functionality for PCI devices, which have been passed
> through from host to somebody else via VFIO.
> 
> Signed-off-by: Gavin Shan 
> ---
>  Documentation/vfio.txt  | 92 
> -
>  drivers/vfio/pci/Makefile   |  1 +
>  drivers/vfio/pci/vfio_pci.c | 20 +---
>  drivers/vfio/pci/vfio_pci_eeh.c | 46 +++
>  drivers/vfio/pci/vfio_pci_private.h |  5 ++
>  drivers/vfio/vfio_iommu_spapr_tce.c | 85 ++
>  include/uapi/linux/vfio.h   | 66 ++
>  7 files changed, 308 insertions(+), 7 deletions(-)
>  create mode 100644 drivers/vfio/pci/vfio_pci_eeh.c
> 
> diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
> index b9ca023..d890fed 100644
> --- a/Documentation/vfio.txt
> +++ b/Documentation/vfio.txt
> @@ -305,7 +305,15 @@ faster, the map/unmap handling has been implemented in 
> real mode which provides
>  an excellent performance which has limitations such as inability to do
>  locked pages accounting in real time.
>  
> -So 3 additional ioctls have been added:
> +4) According to sPAPR specification, A Partitionable Endpoint (PE) is an I/O
> +subtree that can be treated as a unit for the purposes of partitioning and
> +error recovery. A PE may be a single or multi-function IOA (IO Adapter), a
> +function of a multi-function IOA, or multiple IOAs (possibly including switch
> +and bridge structures above the multiple IOAs). PPC64 guests detect PCI 
> errors
> +and recover from them via EEH RTAS services, which works on the basis of
> +additional ioctl commands.
> +
> +So 7 additional ioctls have been added:
>  
>   VFIO_IOMMU_SPAPR_TCE_GET_INFO - returns the size and the start
>   of the DMA window on the PCI bus.
> @@ -316,6 +324,17 @@ So 3 additional ioctls have been added:
>  
>   VFIO_IOMMU_DISABLE - disables the container.
>  
> + VFIO_EEH_PE_SET_OPTION - enables or disables EEH functionality on the
> + specified device. Also, it can be used to remove IO or DMA
> + stopped state on the frozen PE.
> +
> + VFIO_EEH_PE_GET_STATE - retrieve PE's state: frozen or normal state.
> +
> + VFIO_EEH_PE_RESET - do PE reset, which is one of the major steps for
> + error recovering.
> +
> + VFIO_EEH_PE_CONFIGURE - configure the PCI bridges after PE reset. It's
> + one of the major steps for error recoverying.
>  
>  The code flow from the example above should be slightly changed:
>  
> @@ -346,6 +365,77 @@ The code flow from the example above should be slightly 
> changed:
>   ioctl(container, VFIO_IOMMU_MAP_DMA, &dma_map);
>   .
>  
> +Based on the initial example we have, the following piece of code could be
> +reference for EEH setup and error handling:
> +
> + struct vfio_eeh_pe_set_option option = { .argsz = sizeof(option) };
> + struct vfio_eeh_pe_get_state state = { .argsz = sizeof(state) };
> + struct vfio_eeh_pe_reset reset = { .argsz = sizeof(reset) };
> + struct vfio_eeh_pe_configure configure = { .argsz = sizeof(configure) };
> +
> + 
> +
> + /* Get a file descriptor for the device */
> + device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, ":06:0d.0");
> +
> + /* Enable the EEH functionality on the device */
> + option.option = VFIO_EEH_PE_SET_OPT_ENABLE;
> + ioctl(container, VFIO_EEH_PE_SET_OPTION, &option);
> +
> + /* You're suggested to create additional data struct to represent
> +  * PE, and put child devices belonging to same IOMMU group to the
> +  * PE instance for later reference.
> +  */
> +
> + /* Check the PE's state and make sure it's in functional state */
> + ioctl(container, VFIO_EEH_PE_GET_STATE, &state);
> +
> + /* Save device's state. pci_save_state() would be good enough
> +  * as an example.
> +  */
> +
> + /* Test and setup the device */
> + ioctl(device, VFIO_DEVICE_GET_INFO, &device_info);
> +
> + 
> +
> + /* When 0xFF's returned from reading PCI config space or IO BARs
> +  * of the PCI device. Check the PE state to see if that has been
> +  * frozen.
> +  */
> + ioctl(container, VFIO_EEH_PE_GET_STATE, &state);
> +
> + /* Waiting for pending PCI transactions to be completed and don't
> +  * produce any more PCI traffic from/to the affected PE until
> +  * recovery is finished.
> +  */
> +
> + /* Enable IO for the affected PE and collect logs. Usually, the
> +  * standard part of PCI config space, AER registers are dumped
> +  * as logs for further analysis.
> +  */
> + option.option = VFIO_EEH_PE_SET_OPT_IO;
> + ioctl(container, VFIO_EEH_PE_SET_OPTION, &option);
> +
> + /* Issue PE reset */
> + reset.option = VFIO_EEH_PE_RESET_HOT;
> + ioctl(con

Re: [PATCH v7 3/3] drivers/vfio: EEH support for VFIO PCI device

2014-05-27 Thread Alex Williamson
On Wed, 2014-05-28 at 06:30 +1000, Benjamin Herrenschmidt wrote:
> On Tue, 2014-05-27 at 12:15 -0600, Alex Williamson wrote:
> 
> > > +/*
> > > + * Reset is the major step to recover problematic PE. The following
> > > + * command helps on that.
> > > + */
> > > +struct vfio_eeh_pe_reset {
> > > + __u32 argsz;
> > > + __u32 flags;
> > > + __u32 option;
> > > +#define VFIO_EEH_PE_RESET_DEACTIVATE 0   /* Deactivate reset 
> > > */
> > > +#define VFIO_EEH_PE_RESET_HOT1   /* Hot reset
> > > */
> > > +#define VFIO_EEH_PE_RESET_FUNDAMENTAL3   /* Fundamental reset
> > > */
> > 
> > How does a user know which of these to use?
> 
> The usual way is the driver asks for one or the other, this plumbs back
> into the guest EEH code which itself plumbs into the PCIe error recovery
> framework in Linux.

So magic?

> 
> However I do have a question for Gavin here: Why do we expose an
> explicit "deactivate" ? The reset functions should do the whole
> reset sequence (assertion, delay, deassertion). In fact the firmware
> doesn't really give you a choice for PERST right ? Or do we have
> a requirement to expose both phases for RTAS? (In that case I'm
> happy to ignore the deassertion there too).
> 
> Cheers,
> Ben.
> 



___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v7 3/3] drivers/vfio: EEH support for VFIO PCI device

2014-05-27 Thread Alex Williamson
On Wed, 2014-05-28 at 00:49 +0200, Alexander Graf wrote:
> On 27.05.14 20:15, Alex Williamson wrote:
> > On Tue, 2014-05-27 at 18:40 +1000, Gavin Shan wrote:
> >> The patch adds new IOCTL commands for sPAPR VFIO container device
> >> to support EEH functionality for PCI devices, which have been passed
> >> through from host to somebody else via VFIO.
> >>
> >> Signed-off-by: Gavin Shan 
> >> ---
> >>   Documentation/vfio.txt  | 92 
> >> -
> >>   drivers/vfio/pci/Makefile   |  1 +
> >>   drivers/vfio/pci/vfio_pci.c | 20 +---
> >>   drivers/vfio/pci/vfio_pci_eeh.c | 46 +++
> >>   drivers/vfio/pci/vfio_pci_private.h |  5 ++
> >>   drivers/vfio/vfio_iommu_spapr_tce.c | 85 
> >> ++
> >>   include/uapi/linux/vfio.h   | 66 ++
> >>   7 files changed, 308 insertions(+), 7 deletions(-)
> >>   create mode 100644 drivers/vfio/pci/vfio_pci_eeh.c
> 
> [...]
> 
> >> +
> >> +  return ret;
> >> +}
> >> +
> >>   static long tce_iommu_ioctl(void *iommu_data,
> >> unsigned int cmd, unsigned long arg)
> >>   {
> >> @@ -283,6 +363,11 @@ static long tce_iommu_ioctl(void *iommu_data,
> >>tce_iommu_disable(container);
> >>mutex_unlock(&container->lock);
> >>return 0;
> >> +  case VFIO_EEH_PE_SET_OPTION:
> >> +  case VFIO_EEH_PE_GET_STATE:
> >> +  case VFIO_EEH_PE_RESET:
> >> +  case VFIO_EEH_PE_CONFIGURE:
> >> +  return tce_iommu_eeh_ioctl(iommu_data, cmd, arg);
> > This is where it would have really made sense to have a single
> > VFIO_EEH_OP ioctl with a data structure passed to indicate the sub-op.
> > AlexG, are you really attached to splitting these out into separate
> > ioctls?
> 
> I don't see the problem. We need to forward 4 ioctls to a separate piece 
> of code, so we forward 4 ioctls to a separate piece of code :). Putting 
> them into one ioctl just moves the switch() into another function.

And uses an extra 3 ioctl numbers and gives us extra things to update if
we ever need to add more ioctls, etc.  ioctl numbers are an address
space, how much address space do we really want to give to EEH?  It's
not a big difference, but I don't think it's completely even either.
Thanks,

Alex


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v7 3/3] drivers/vfio: EEH support for VFIO PCI device

2014-05-27 Thread Alex Williamson
On Wed, 2014-05-28 at 02:44 +0200, Alexander Graf wrote:
> On 28.05.14 02:39, Alex Williamson wrote:
> > On Wed, 2014-05-28 at 00:49 +0200, Alexander Graf wrote:
> >> On 27.05.14 20:15, Alex Williamson wrote:
> >>> On Tue, 2014-05-27 at 18:40 +1000, Gavin Shan wrote:
> >>>> The patch adds new IOCTL commands for sPAPR VFIO container device
> >>>> to support EEH functionality for PCI devices, which have been passed
> >>>> through from host to somebody else via VFIO.
> >>>>
> >>>> Signed-off-by: Gavin Shan 
> >>>> ---
> >>>>Documentation/vfio.txt  | 92 
> >>>> -
> >>>>drivers/vfio/pci/Makefile   |  1 +
> >>>>drivers/vfio/pci/vfio_pci.c | 20 +---
> >>>>drivers/vfio/pci/vfio_pci_eeh.c | 46 +++
> >>>>drivers/vfio/pci/vfio_pci_private.h |  5 ++
> >>>>drivers/vfio/vfio_iommu_spapr_tce.c | 85 
> >>>> ++
> >>>>include/uapi/linux/vfio.h   | 66 ++
> >>>>7 files changed, 308 insertions(+), 7 deletions(-)
> >>>>create mode 100644 drivers/vfio/pci/vfio_pci_eeh.c
> >> [...]
> >>
> >>>> +
> >>>> +return ret;
> >>>> +}
> >>>> +
> >>>>static long tce_iommu_ioctl(void *iommu_data,
> >>>>   unsigned int cmd, unsigned long arg)
> >>>>{
> >>>> @@ -283,6 +363,11 @@ static long tce_iommu_ioctl(void *iommu_data,
> >>>>  tce_iommu_disable(container);
> >>>>  mutex_unlock(&container->lock);
> >>>>  return 0;
> >>>> +case VFIO_EEH_PE_SET_OPTION:
> >>>> +case VFIO_EEH_PE_GET_STATE:
> >>>> +case VFIO_EEH_PE_RESET:
> >>>> +case VFIO_EEH_PE_CONFIGURE:
> >>>> +return tce_iommu_eeh_ioctl(iommu_data, cmd, arg);
> >>> This is where it would have really made sense to have a single
> >>> VFIO_EEH_OP ioctl with a data structure passed to indicate the sub-op.
> >>> AlexG, are you really attached to splitting these out into separate
> >>> ioctls?
> >> I don't see the problem. We need to forward 4 ioctls to a separate piece
> >> of code, so we forward 4 ioctls to a separate piece of code :). Putting
> >> them into one ioctl just moves the switch() into another function.
> > And uses an extra 3 ioctl numbers and gives us extra things to update if
> > we ever need to add more ioctls, etc.  ioctl numbers are an address
> > space, how much address space do we really want to give to EEH?  It's
> > not a big difference, but I don't think it's completely even either.
> > Thanks,
> 
> Yes, that's the point. I by far prefer to have you push back on anyone 
> who introduces useless ioctls rather than have a separate EEH number 
> space that people can just throw anything in they like ;).

Well, I appreciate that, but having them as separate ioctls doesn't
really prevent that either.  Any one of these 4 could be set to take a
sub-option to extend and contort the EEH interface.  The only way to
prevent that would be to avoid the argsz+flags hack that makes the ioctl
extendable.  Thanks,

Alex


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v7 3/3] drivers/vfio: EEH support for VFIO PCI device

2014-05-28 Thread Alex Williamson
On Wed, 2014-05-28 at 13:37 +0200, Alexander Graf wrote:
> On 28.05.14 02:57, Alex Williamson wrote:
> > On Wed, 2014-05-28 at 02:44 +0200, Alexander Graf wrote:
> >> On 28.05.14 02:39, Alex Williamson wrote:
> >>> On Wed, 2014-05-28 at 00:49 +0200, Alexander Graf wrote:
> >>>> On 27.05.14 20:15, Alex Williamson wrote:
> >>>>> On Tue, 2014-05-27 at 18:40 +1000, Gavin Shan wrote:
> >>>>>> The patch adds new IOCTL commands for sPAPR VFIO container device
> >>>>>> to support EEH functionality for PCI devices, which have been passed
> >>>>>> through from host to somebody else via VFIO.
> >>>>>>
> >>>>>> Signed-off-by: Gavin Shan 
> >>>>>> ---
> >>>>>> Documentation/vfio.txt  | 92 
> >>>>>> -
> >>>>>> drivers/vfio/pci/Makefile   |  1 +
> >>>>>> drivers/vfio/pci/vfio_pci.c | 20 +---
> >>>>>> drivers/vfio/pci/vfio_pci_eeh.c | 46 +++
> >>>>>> drivers/vfio/pci/vfio_pci_private.h |  5 ++
> >>>>>> drivers/vfio/vfio_iommu_spapr_tce.c | 85 
> >>>>>> ++
> >>>>>> include/uapi/linux/vfio.h   | 66 ++
> >>>>>> 7 files changed, 308 insertions(+), 7 deletions(-)
> >>>>>> create mode 100644 drivers/vfio/pci/vfio_pci_eeh.c
> >>>> [...]
> >>>>
> >>>>>> +
> >>>>>> +  return ret;
> >>>>>> +}
> >>>>>> +
> >>>>>> static long tce_iommu_ioctl(void *iommu_data,
> >>>>>> unsigned int cmd, unsigned long arg)
> >>>>>> {
> >>>>>> @@ -283,6 +363,11 @@ static long tce_iommu_ioctl(void *iommu_data,
> >>>>>>tce_iommu_disable(container);
> >>>>>>mutex_unlock(&container->lock);
> >>>>>>return 0;
> >>>>>> +  case VFIO_EEH_PE_SET_OPTION:
> >>>>>> +  case VFIO_EEH_PE_GET_STATE:
> >>>>>> +  case VFIO_EEH_PE_RESET:
> >>>>>> +  case VFIO_EEH_PE_CONFIGURE:
> >>>>>> +  return tce_iommu_eeh_ioctl(iommu_data, cmd, arg);
> >>>>> This is where it would have really made sense to have a single
> >>>>> VFIO_EEH_OP ioctl with a data structure passed to indicate the sub-op.
> >>>>> AlexG, are you really attached to splitting these out into separate
> >>>>> ioctls?
> >>>> I don't see the problem. We need to forward 4 ioctls to a separate piece
> >>>> of code, so we forward 4 ioctls to a separate piece of code :). Putting
> >>>> them into one ioctl just moves the switch() into another function.
> >>> And uses an extra 3 ioctl numbers and gives us extra things to update if
> >>> we ever need to add more ioctls, etc.  ioctl numbers are an address
> >>> space, how much address space do we really want to give to EEH?  It's
> >>> not a big difference, but I don't think it's completely even either.
> >>> Thanks,
> >> Yes, that's the point. I by far prefer to have you push back on anyone
> >> who introduces useless ioctls rather than have a separate EEH number
> >> space that people can just throw anything in they like ;).
> > Well, I appreciate that, but having them as separate ioctls doesn't
> > really prevent that either.  Any one of these 4 could be set to take a
> > sub-option to extend and contort the EEH interface.  The only way to
> > prevent that would be to avoid the argsz+flags hack that makes the ioctl
> > extendable.  Thanks,
> 
> Sure, that's what patch review is about. I'm really more concerned about 
> whose court the number space is in - you or Gavin. If we're talking 
> about top level ioctls you will care a lot more.
> 
> But I'm not religious about this. You're the VFIO maintainer, so it's 
> your call. I just personally cringe when I see an ioctl that gets an 
> "opcode" and a "parameter" argument where the "parameter" argument is a 
> union with one struct for each opcode.

Well, what would it look like...

struct vfio_eeh_pe_op {
__u32 argsz;
__u32 flags;
__u32 op;
};

Couldn't every single one of these be a separate "op"?  Are there any
cases where we can't use the ioctl return value?

VFIO_EEH_PE_DISABLE
VFIO_EEH_PE_ENABLE
VFIO_EEH_PE_UNFREEZE_IO
VFIO_EEH_PE_UNFREEZE_DMA
VFIO_EEH_PE_GET_MODE
VFIO_EEH_PE_RESET_DEACTIVATE
VFIO_EEH_PE_RESET_HOT
VFIO_EEH_PE_RESET_FUNDAMENTAL
VFIO_EEH_PE_CONFIGURE

It doesn't look that bad to me, what am I missing?  Thanks,

Alex

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v7 3/3] drivers/vfio: EEH support for VFIO PCI device

2014-05-28 Thread Alex Williamson
On Wed, 2014-05-28 at 10:55 +1000, Gavin Shan wrote:
> On Tue, May 27, 2014 at 12:15:27PM -0600, Alex Williamson wrote:
> >On Tue, 2014-05-27 at 18:40 +1000, Gavin Shan wrote:
> >> The patch adds new IOCTL commands for sPAPR VFIO container device
> >> to support EEH functionality for PCI devices, which have been passed
> >> through from host to somebody else via VFIO.
> >> 
> >> Signed-off-by: Gavin Shan 
> >> ---
> >>  Documentation/vfio.txt  | 92 
> >> -
> >>  drivers/vfio/pci/Makefile   |  1 +
> >>  drivers/vfio/pci/vfio_pci.c | 20 +---
> >>  drivers/vfio/pci/vfio_pci_eeh.c | 46 +++
> >>  drivers/vfio/pci/vfio_pci_private.h |  5 ++
> >>  drivers/vfio/vfio_iommu_spapr_tce.c | 85 
> >> ++
> >>  include/uapi/linux/vfio.h   | 66 ++
> >>  7 files changed, 308 insertions(+), 7 deletions(-)
> >>  create mode 100644 drivers/vfio/pci/vfio_pci_eeh.c
> >> 
> >> diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
> >> index b9ca023..d890fed 100644
> >> --- a/Documentation/vfio.txt
> >> +++ b/Documentation/vfio.txt
> >> @@ -305,7 +305,15 @@ faster, the map/unmap handling has been implemented 
> >> in real mode which provides
> >>  an excellent performance which has limitations such as inability to do
> >>  locked pages accounting in real time.
> >>  
> >> -So 3 additional ioctls have been added:
> >> +4) According to sPAPR specification, A Partitionable Endpoint (PE) is an 
> >> I/O
> >> +subtree that can be treated as a unit for the purposes of partitioning and
> >> +error recovery. A PE may be a single or multi-function IOA (IO Adapter), a
> >> +function of a multi-function IOA, or multiple IOAs (possibly including 
> >> switch
> >> +and bridge structures above the multiple IOAs). PPC64 guests detect PCI 
> >> errors
> >> +and recover from them via EEH RTAS services, which works on the basis of
> >> +additional ioctl commands.
> >> +
> >> +So 7 additional ioctls have been added:
> >>  
> >>VFIO_IOMMU_SPAPR_TCE_GET_INFO - returns the size and the start
> >>of the DMA window on the PCI bus.
> >> @@ -316,6 +324,17 @@ So 3 additional ioctls have been added:
> >>  
> >>VFIO_IOMMU_DISABLE - disables the container.
> >>  
> >> +  VFIO_EEH_PE_SET_OPTION - enables or disables EEH functionality on the
> >> +  specified device. Also, it can be used to remove IO or DMA
> >> +  stopped state on the frozen PE.
> >> +
> >> +  VFIO_EEH_PE_GET_STATE - retrieve PE's state: frozen or normal state.
> >> +
> >> +  VFIO_EEH_PE_RESET - do PE reset, which is one of the major steps for
> >> +  error recovering.
> >> +
> >> +  VFIO_EEH_PE_CONFIGURE - configure the PCI bridges after PE reset. It's
> >> +  one of the major steps for error recoverying.
> >>  
> >>  The code flow from the example above should be slightly changed:
> >>  
> >> @@ -346,6 +365,77 @@ The code flow from the example above should be 
> >> slightly changed:
> >>ioctl(container, VFIO_IOMMU_MAP_DMA, &dma_map);
> >>.
> >>  
> >> +Based on the initial example we have, the following piece of code could be
> >> +reference for EEH setup and error handling:
> >> +
> >> +  struct vfio_eeh_pe_set_option option = { .argsz = sizeof(option) };
> >> +  struct vfio_eeh_pe_get_state state = { .argsz = sizeof(state) };
> >> +  struct vfio_eeh_pe_reset reset = { .argsz = sizeof(reset) };
> >> +  struct vfio_eeh_pe_configure configure = { .argsz = sizeof(configure) };
> >> +
> >> +  
> >> +
> >> +  /* Get a file descriptor for the device */
> >> +  device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, ":06:0d.0");
> >> +
> >> +  /* Enable the EEH functionality on the device */
> >> +  option.option = VFIO_EEH_PE_SET_OPT_ENABLE;
> >> +  ioctl(container, VFIO_EEH_PE_SET_OPTION, &option);
> >> +
> >> +  /* You're suggested to create additional data struct to represent
> >> +   * PE, and put child devices belonging to same IOMMU group to the
> >> +   * PE instance for later reference.
> >> +   */
> >> +
> >> +  /* Check the PE's

Re: [PATCH v8 3/3] drivers/vfio: EEH support for VFIO PCI device

2014-06-05 Thread Alex Williamson
On Thu, 2014-06-05 at 16:36 +1000, Gavin Shan wrote:
> The patch adds new IOCTL commands for sPAPR VFIO container device
> to support EEH functionality for PCI devices, which have been passed
> through from host to somebody else via VFIO.
> 
> Signed-off-by: Gavin Shan 
> ---
>  Documentation/vfio.txt  | 87 ++--
>  drivers/vfio/Makefile   |  1 +
>  drivers/vfio/pci/vfio_pci.c | 20 ++---
>  drivers/vfio/vfio_iommu_spapr_tce.c | 17 ++-
>  drivers/vfio/vfio_spapr_eeh.c   | 89 
> +
>  include/linux/vfio.h| 23 ++
>  include/uapi/linux/vfio.h   | 35 +++
>  7 files changed, 262 insertions(+), 10 deletions(-)
>  create mode 100644 drivers/vfio/vfio_spapr_eeh.c
> 
> diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
> index b9ca023..3fa4538 100644
> --- a/Documentation/vfio.txt
> +++ b/Documentation/vfio.txt
> @@ -305,7 +305,15 @@ faster, the map/unmap handling has been implemented in 
> real mode which provides
>  an excellent performance which has limitations such as inability to do
>  locked pages accounting in real time.
>  
> -So 3 additional ioctls have been added:
> +4) According to sPAPR specification, A Partitionable Endpoint (PE) is an I/O
> +subtree that can be treated as a unit for the purposes of partitioning and
> +error recovery. A PE may be a single or multi-function IOA (IO Adapter), a
> +function of a multi-function IOA, or multiple IOAs (possibly including switch
> +and bridge structures above the multiple IOAs). PPC64 guests detect PCI 
> errors
> +and recover from them via EEH RTAS services, which works on the basis of
> +additional ioctl commands.
> +
> +So 4 additional ioctls have been added:
>  
>   VFIO_IOMMU_SPAPR_TCE_GET_INFO - returns the size and the start
>   of the DMA window on the PCI bus.
> @@ -316,9 +324,12 @@ So 3 additional ioctls have been added:
>  
>   VFIO_IOMMU_DISABLE - disables the container.
>  
> + VFIO_EEH_PE_OP - provides an API for EEH setup, error detection and 
> recovery.
>  
>  The code flow from the example above should be slightly changed:
>  
> + struct vfio_eeh_pe_op pe_op = { .argsz = sizeof(pe_op) };
> +
>   .
>   /* Add the group to the container */
>   ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
> @@ -342,9 +353,79 @@ The code flow from the example above should be slightly 
> changed:
>   dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
>  
>   /* Check here is .iova/.size are within DMA window from 
> spapr_iommu_info */
> -
>   ioctl(container, VFIO_IOMMU_MAP_DMA, &dma_map);
> - .
> +
> + /* Get a file descriptor for the device */
> + device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, ":06:0d.0");
> +
> + 
> +
> + /* Gratuitous device reset and go... */
> + ioctl(device, VFIO_DEVICE_RESET);
> +
> + /* Make sure EEH is supported */
> + ioctl(container, VFIO_CHECK_EXTENSION, VFIO_EEH);
> +
> + /* Enable the EEH functionality on the device */
> + pe_op.op = VFIO_EEH_PE_ENABLE;
> + ioctl(container, VFIO_EEH_PE_OP, &pe_op);
> +
> + /* You're suggested to create additional data struct to represent
> +  * PE, and put child devices belonging to same IOMMU group to the
> +  * PE instance for later reference.
> +  */
> +
> + /* Check the PE's state and make sure it's in functional state */
> + pe_op.op = VFIO_EEH_PE_GET_STATE;
> + ioctl(container, VFIO_EEH_PE_OP, &pe_op);
> +
> + /* Save device state using pci_save_state().
> +  * EEH should be enabled on the specified device.
> +  */
> +
> + 
> +
> + /* When 0xFF's returned from reading PCI config space or IO BARs
> +  * of the PCI device. Check the PE's state to see if that has been
> +  * frozen.
> +  */
> + ioctl(container, VFIO_EEH_PE_OP, &pe_op);
> +
> + /* Waiting for pending PCI transactions to be completed and don't
> +  * produce any more PCI traffic from/to the affected PE until
> +  * recovery is finished.
> +  */
> +
> + /* Enable IO for the affected PE and collect logs. Usually, the
> +  * standard part of PCI config space, AER registers are dumped
> +  * as logs for further analysis.
> +  */
> + pe_op.op = VFIO_EEH_PE_UNFREEZE_IO;
> + ioctl(container, VFIO_EEH_PE_OP, &pe_op);
> +
> + /*
> +  * Issue PE reset: hot or fundamental reset. Usually, hot reset
> +  * is enough. However, the firmware of some PCI adapters would
> +  * require fundamental reset.
> +  */
> + pe_op.op = VFIO_EEH_PE_RESET_HOT;
> + ioctl(container, VFIO_EEH_PE_OP, &pe_op);
> + pe_op.op = VFIO_EEH_PE_RESET_DEACTIVATE;
> + ioctl(container, VFIO_EEH_PE_OP, &pe_op);
> +
> + /* Configure the PCI bridges for the affected PE */
> + pe_op.op = VFIO_EEH_PE_CONFIGURE;
> + ioctl(container, VF

  1   2   3   4   >