I have tested the non-cxl specific parts
(mm_context_add_copro/mm_context_remove_copro) with this series -
https://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=1681 - and it
works well for npu.

Tested-by: Alistair Popple <alist...@popple.id.au>

On Sun, 3 Sep 2017 08:15:13 PM Frederic Barrat wrote:
> The PSL and nMMU need to see all TLB invalidations for the memory
> contexts used on the adapter. For the hash memory model, it is done by
> making all TLBIs global as soon as the cxl driver is in use. For
> radix, we need something similar, but we can refine and only convert
> to global the invalidations for contexts actually used by the device.
> 
> The new mm_context_add_copro() API increments the 'active_cpus' count
> for the contexts attached to the cxl adapter. As soon as there's more
> than 1 active cpu, the TLBIs for the context become global. Active cpu
> count must be decremented when detaching to restore locality if
> possible and to avoid overflowing the counter.
> 
> The hash memory model support is somewhat limited, as we can't
> decrement the active cpus count when mm_context_remove_copro() is
> called, because we can't flush the TLB for a mm on hash. So TLBIs
> remain global on hash.
> 
> Signed-off-by: Frederic Barrat <fbar...@linux.vnet.ibm.com>
> Fixes: f24be42aab37 ("cxl: Add psl9 specific code")
> ---
> Changelog:
> v3: don't decrement active cpus count with hash, as we don't know how to flush
> v2: Replace flush_tlb_mm() by the new flush_all_mm() to flush the TLBs
> and PWCs (thanks to Ben)
> 
>  arch/powerpc/include/asm/mmu_context.h | 46 ++++++++++++++++++++++++++++++++++
>  arch/powerpc/mm/mmu_context.c          |  9 -------
>  drivers/misc/cxl/api.c                 | 22 +++++++++++++---
>  drivers/misc/cxl/context.c             |  3 +++
>  drivers/misc/cxl/file.c                | 19 ++++++++++++--
>  5 files changed, 85 insertions(+), 14 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
> index 309592589e30..a0d7145d6cd2 100644
> --- a/arch/powerpc/include/asm/mmu_context.h
> +++ b/arch/powerpc/include/asm/mmu_context.h
> @@ -77,6 +77,52 @@ extern void switch_cop(struct mm_struct *next);
>  extern int use_cop(unsigned long acop, struct mm_struct *mm);
>  extern void drop_cop(unsigned long acop, struct mm_struct *mm);
>  
> +#ifdef CONFIG_PPC_BOOK3S_64
> +static inline void inc_mm_active_cpus(struct mm_struct *mm)
> +{
> +     atomic_inc(&mm->context.active_cpus);
> +}
> +
> +static inline void dec_mm_active_cpus(struct mm_struct *mm)
> +{
> +     atomic_dec(&mm->context.active_cpus);
> +}
> +
> +static inline void mm_context_add_copro(struct mm_struct *mm)
> +{
> +     /*
> +      * On hash, should only be called once over the lifetime of
> +      * the context, as we can't decrement the active cpus count
> +      * and flush properly for the time being.
> +      */
> +     inc_mm_active_cpus(mm);
> +}
> +
> +static inline void mm_context_remove_copro(struct mm_struct *mm)
> +{
> +     /*
> +      * Need to broadcast a global flush of the full mm before
> +      * decrementing active_cpus count, as the next TLBI may be
> +      * local and the nMMU and/or PSL need to be cleaned up.
> +      * Should be rare enough so that it's acceptable.
> +      *
> +      * Skip on hash, as we don't know how to do the proper flush
> +      * for the time being. Invalidations will remain global if
> +      * used on hash.
> +      */
> +     if (radix_enabled()) {
> +             flush_all_mm(mm);
> +             dec_mm_active_cpus(mm);
> +     }
> +}
> +#else
> +static inline void inc_mm_active_cpus(struct mm_struct *mm) { }
> +static inline void dec_mm_active_cpus(struct mm_struct *mm) { }
> +static inline void mm_context_add_copro(struct mm_struct *mm) { }
> +static inline void mm_context_remove_copro(struct mm_struct *mm) { }
> +#endif
> +
> +
>  extern void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
>                              struct task_struct *tsk);
>  
> diff --git a/arch/powerpc/mm/mmu_context.c b/arch/powerpc/mm/mmu_context.c
> index 0f613bc63c50..d60a62bf4fc7 100644
> --- a/arch/powerpc/mm/mmu_context.c
> +++ b/arch/powerpc/mm/mmu_context.c
> @@ -34,15 +34,6 @@ static inline void switch_mm_pgdir(struct task_struct *tsk,
>                                  struct mm_struct *mm) { }
>  #endif
>  
> -#ifdef CONFIG_PPC_BOOK3S_64
> -static inline void inc_mm_active_cpus(struct mm_struct *mm)
> -{
> -     atomic_inc(&mm->context.active_cpus);
> -}
> -#else
> -static inline void inc_mm_active_cpus(struct mm_struct *mm) { }
> -#endif
> -
>  void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
>                       struct task_struct *tsk)
>  {
> diff --git a/drivers/misc/cxl/api.c b/drivers/misc/cxl/api.c
> index a0c44d16bf30..1137a2cc1d3e 100644
> --- a/drivers/misc/cxl/api.c
> +++ b/drivers/misc/cxl/api.c
> @@ -15,6 +15,7 @@
>  #include <linux/module.h>
>  #include <linux/mount.h>
>  #include <linux/sched/mm.h>
> +#include <linux/mmu_context.h>
>  
>  #include "cxl.h"
>  
> @@ -331,9 +332,12 @@ int cxl_start_context(struct cxl_context *ctx, u64 wed,
>               /* ensure this mm_struct can't be freed */
>               cxl_context_mm_count_get(ctx);
>  
> -             /* decrement the use count */
> -             if (ctx->mm)
> +             if (ctx->mm) {
> +                     /* decrement the use count from above */
>                       mmput(ctx->mm);
> +                     /* make TLBIs for this context global */
> +                     mm_context_add_copro(ctx->mm);
> +             }
>       }
>  
>       /*
> @@ -342,13 +346,25 @@ int cxl_start_context(struct cxl_context *ctx, u64 wed,
>        */
>       cxl_ctx_get();
>  
> +     /*
> +      * Barrier is needed to make sure all TLBIs are global before
> +      * we attach and the context starts being used by the adapter.
> +      *
> +      * Needed after mm_context_add_copro() for radix and
> +      * cxl_ctx_get() for hash/p8
> +      */
> +     smp_mb();
> +
>       if ((rc = cxl_ops->attach_process(ctx, kernel, wed, 0))) {
>               put_pid(ctx->pid);
>               ctx->pid = NULL;
>               cxl_adapter_context_put(ctx->afu->adapter);
>               cxl_ctx_put();
> -             if (task)
> +             if (task) {
>                       cxl_context_mm_count_put(ctx);
> +                     if (ctx->mm)
> +                             mm_context_remove_copro(ctx->mm);
> +             }
>               goto out;
>       }
>  
> diff --git a/drivers/misc/cxl/context.c b/drivers/misc/cxl/context.c
> index 8c32040b9c09..12a41b2753f0 100644
> --- a/drivers/misc/cxl/context.c
> +++ b/drivers/misc/cxl/context.c
> @@ -18,6 +18,7 @@
>  #include <linux/slab.h>
>  #include <linux/idr.h>
>  #include <linux/sched/mm.h>
> +#include <linux/mmu_context.h>
>  #include <asm/cputable.h>
>  #include <asm/current.h>
>  #include <asm/copro.h>
> @@ -267,6 +268,8 @@ int __detach_context(struct cxl_context *ctx)
>  
>       /* Decrease the mm count on the context */
>       cxl_context_mm_count_put(ctx);
> +     if (ctx->mm)
> +             mm_context_remove_copro(ctx->mm);
>       ctx->mm = NULL;
>  
>       return 0;
> diff --git a/drivers/misc/cxl/file.c b/drivers/misc/cxl/file.c
> index 4bfad9f6dc9f..84b801b5d0e5 100644
> --- a/drivers/misc/cxl/file.c
> +++ b/drivers/misc/cxl/file.c
> @@ -19,6 +19,7 @@
>  #include <linux/mm.h>
>  #include <linux/slab.h>
>  #include <linux/sched/mm.h>
> +#include <linux/mmu_context.h>
>  #include <asm/cputable.h>
>  #include <asm/current.h>
>  #include <asm/copro.h>
> @@ -220,9 +221,12 @@ static long afu_ioctl_start_work(struct cxl_context *ctx,
>       /* ensure this mm_struct can't be freed */
>       cxl_context_mm_count_get(ctx);
>  
> -     /* decrement the use count */
> -     if (ctx->mm)
> +     if (ctx->mm) {
> +             /* decrement the use count from above */
>               mmput(ctx->mm);
> +             /* make TLBIs for this context global */
> +             mm_context_add_copro(ctx->mm);
> +     }
>  
>       /*
>        * Increment driver use count. Enables global TLBIs for hash
> @@ -230,6 +234,15 @@ static long afu_ioctl_start_work(struct cxl_context *ctx,
>        */
>       cxl_ctx_get();
>  
> +     /*
> +      * Barrier is needed to make sure all TLBIs are global before
> +      * we attach and the context starts being used by the adapter.
> +      *
> +      * Needed after mm_context_add_copro() for radix and
> +      * cxl_ctx_get() for hash/p8
> +      */
> +     smp_mb();
> +
>       trace_cxl_attach(ctx, work.work_element_descriptor, work.num_interrupts, amr);
>  
>       if ((rc = cxl_ops->attach_process(ctx, false, work.work_element_descriptor,
> @@ -240,6 +253,8 @@ static long afu_ioctl_start_work(struct cxl_context *ctx,
>               ctx->pid = NULL;
>               cxl_ctx_put();
>               cxl_context_mm_count_put(ctx);
> +             if (ctx->mm)
> +                     mm_context_remove_copro(ctx->mm);
>               goto out;
>       }
>  
> 

Reply via email to