Nick,

On Tuesday, 3 September 2019 1:29:31 AM AEST Nicholas Piggin wrote:
> Introduce two options to control the use of the tlbie instruction. A
> boot time option which completely disables the kernel using the
> instruction, this is currently incompatible with HASH MMU, KVM, and
> coherent accelerators.
Some accelerators (e.g. cxl, ocxl, npu) call mm_context_add_copro() to force
global TLB invalidations:

static inline void mm_context_add_copro(struct mm_struct *mm)
{
	/*
	 * If any copro is in use, increment the active CPU count
	 * in order to force TLB invalidations to be global as to
	 * propagate to the Nest MMU.
	 */
	if (atomic_inc_return(&mm->context.copros) == 1)
		inc_mm_active_cpus(mm);
}

Admittedly I haven't dug into all the details of this patch, but it sounds
like it might break the above if TLBIE is disabled. Do you think we should
add a WARN_ON if mm_context_add_copro() is called with TLBIE disabled? Or
perhaps even force TLBIE to be re-enabled if it is called with it disabled?
(A rough sketch of what I mean is below, after the quoted patch.)

- Alistair

> And a debugfs option can be switched at runtime and avoids using tlbie
> for invalidating CPU TLBs for normal process and kernel address
> mappings. Coherent accelerators are still managed with tlbie, as will
> KVM partition scope translations.
>
> Cross-CPU TLB flushing is implemented with IPIs and tlbiel. This is a
> basic implementation which does not attempt to make any optimisation
> beyond the tlbie implementation.
>
> This is useful for performance testing among other things. For example
> in certain situations on large systems, using IPIs may be faster than
> tlbie as they can be directed rather than broadcast. Later we may also
> take advantage of the IPIs to do more interesting things such as trim
> the mm cpumask more aggressively.
>
> Signed-off-by: Nicholas Piggin <npig...@gmail.com>
> ---
>  .../admin-guide/kernel-parameters.txt         |   4 +
>  arch/powerpc/include/asm/book3s/64/tlbflush.h |   9 +
>  arch/powerpc/kvm/book3s_hv.c                  |   6 +
>  arch/powerpc/mm/book3s64/pgtable.c            |  47 +++++
>  arch/powerpc/mm/book3s64/radix_tlb.c          | 190 ++++++++++++++++--
>  drivers/misc/cxl/main.c                       |   4 +
>  drivers/misc/ocxl/main.c                      |   4 +
>  7 files changed, 246 insertions(+), 18 deletions(-)
>
> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
> index d3cbb3ae62b6..65ae16549aa3 100644
> --- a/Documentation/admin-guide/kernel-parameters.txt
> +++ b/Documentation/admin-guide/kernel-parameters.txt
> @@ -860,6 +860,10 @@
>  	disable_radix	[PPC]
>  			Disable RADIX MMU mode on POWER9
>
> +	disable_tlbie	[PPC]
> +			Disable TLBIE instruction. Currently does not work
> +			with KVM, with HASH MMU, or with coherent accelerators.
> +
>  	disable_cpu_apicid= [X86,APIC,SMP]
>  			Format: <int>
>  			The number of initial APIC ID for the
> diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h b/arch/powerpc/include/asm/book3s/64/tlbflush.h
> index ebf572ea621e..7aa8195b6cff 100644
> --- a/arch/powerpc/include/asm/book3s/64/tlbflush.h
> +++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h
> @@ -162,4 +162,13 @@ static inline void flush_tlb_pgtable(struct mmu_gather *tlb, unsigned long addre
>
>  	radix__flush_tlb_pwc(tlb, address);
>  }
> +
> +extern bool tlbie_capable;
> +extern bool tlbie_enabled;
> +
> +static inline bool cputlb_use_tlbie(void)
> +{
> +	return tlbie_enabled;
> +}
> +
>  #endif /* _ASM_POWERPC_BOOK3S_64_TLBFLUSH_H */
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index cde3f5a4b3e4..3cdaa2a09a19 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -5462,6 +5462,12 @@ static int kvmppc_radix_possible(void)
>  static int kvmppc_book3s_init_hv(void)
>  {
>  	int r;
> +
> +	if (!tlbie_capable) {
> +		pr_err("KVM-HV: Host does not support TLBIE\n");
> +		return -ENODEV;
> +	}
> +
>  	/*
>  	 * FIXME!! Do we need to check on all cpus ?
>  	 */
> diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c
> index 351eb78eed55..75483b40fcb1 100644
> --- a/arch/powerpc/mm/book3s64/pgtable.c
> +++ b/arch/powerpc/mm/book3s64/pgtable.c
> @@ -8,6 +8,7 @@
>  #include <linux/memblock.h>
>  #include <misc/cxl-base.h>
>
> +#include <asm/debugfs.h>
>  #include <asm/pgalloc.h>
>  #include <asm/tlb.h>
>  #include <asm/trace.h>
> @@ -469,3 +470,49 @@ int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl,
>
>  	return true;
>  }
> +
> +/*
> + * Does the CPU support tlbie?
> + */
> +bool tlbie_capable __read_mostly = true;
> +EXPORT_SYMBOL(tlbie_capable);
> +
> +/*
> + * Should tlbie be used for management of CPU TLBs, for kernel and process
> + * address spaces? tlbie may still be used for nMMU accelerators, and for KVM
> + * guest address spaces.
> + */
> +bool tlbie_enabled __read_mostly = true;
> +
> +static int __init setup_disable_tlbie(char *str)
> +{
> +	if (!radix_enabled()) {
> +		pr_err("disable_tlbie: Unable to disable TLBIE with Hash MMU.\n");
> +		return 1;
> +	}
> +
> +	tlbie_capable = false;
> +	tlbie_enabled = false;
> +
> +	return 1;
> +}
> +__setup("disable_tlbie", setup_disable_tlbie);
> +
> +static int __init pgtable_debugfs_setup(void)
> +{
> +	if (!tlbie_capable)
> +		return 0;
> +
> +	/*
> +	 * There is no locking vs tlb flushing when changing this value.
> +	 * The tlb flushers will see one value or another, and use either
> +	 * tlbie or tlbiel with IPIs. In both cases the TLBs will be
> +	 * invalidated as expected.
> +	 */
> +	debugfs_create_bool("tlbie_enabled", 0600,
> +			powerpc_debugfs_root,
> +			&tlbie_enabled);
> +
> +	return 0;
> +}
> +arch_initcall(pgtable_debugfs_setup);
> diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
> index f9cf8ae59831..631be42abd33 100644
> --- a/arch/powerpc/mm/book3s64/radix_tlb.c
> +++ b/arch/powerpc/mm/book3s64/radix_tlb.c
> @@ -270,6 +270,39 @@ static inline void _tlbie_pid(unsigned long pid, unsigned long ric)
>  	asm volatile("eieio; tlbsync; ptesync": : :"memory");
>  }
>
> +struct tlbiel_pid {
> +	unsigned long pid;
> +	unsigned long ric;
> +};
> +
> +static void do_tlbiel_pid(void *info)
> +{
> +	struct tlbiel_pid *t = info;
> +
> +	if (t->ric == RIC_FLUSH_TLB)
> +		_tlbiel_pid(t->pid, RIC_FLUSH_TLB);
> +	else if (t->ric == RIC_FLUSH_PWC)
> +		_tlbiel_pid(t->pid, RIC_FLUSH_PWC);
> +	else
> +		_tlbiel_pid(t->pid, RIC_FLUSH_ALL);
> +}
> +
> +static inline void _tlbiel_pid_multicast(struct mm_struct *mm,
> +				unsigned long pid, unsigned long ric)
> +{
> +	struct cpumask *cpus = mm_cpumask(mm);
> +	struct tlbiel_pid t = { .pid = pid, .ric = ric };
> +
> +	on_each_cpu_mask(cpus, do_tlbiel_pid, &t, 1);
> +	/*
> +	 * Always want the CPU translations to be invalidated with tlbiel in
> +	 * these paths, so while coprocessors must use tlbie, we can not
> +	 * optimise away the tlbiel component.
> +	 */
> +	if (atomic_read(&mm->context.copros) > 0)
> +		_tlbie_pid(pid, RIC_FLUSH_ALL);
> +}
> +
>  static inline void _tlbie_lpid(unsigned long lpid, unsigned long ric)
>  {
>  	asm volatile("ptesync": : :"memory");
> @@ -370,6 +403,53 @@ static __always_inline void _tlbie_va(unsigned long va, unsigned long pid,
>  	asm volatile("eieio; tlbsync; ptesync": : :"memory");
>  }
>
> +struct tlbiel_va {
> +	unsigned long pid;
> +	unsigned long va;
> +	unsigned long psize;
> +	unsigned long ric;
> +};
> +
> +static void do_tlbiel_va(void *info)
> +{
> +	struct tlbiel_va *t = info;
> +
> +	if (t->ric == RIC_FLUSH_TLB)
> +		_tlbiel_va(t->va, t->pid, t->psize, RIC_FLUSH_TLB);
> +	else if (t->ric == RIC_FLUSH_PWC)
> +		_tlbiel_va(t->va, t->pid, t->psize, RIC_FLUSH_PWC);
> +	else
> +		_tlbiel_va(t->va, t->pid, t->psize, RIC_FLUSH_ALL);
> +}
> +
> +static inline void _tlbiel_va_multicast(struct mm_struct *mm,
> +				unsigned long va, unsigned long pid,
> +				unsigned long psize, unsigned long ric)
> +{
> +	struct cpumask *cpus = mm_cpumask(mm);
> +	struct tlbiel_va t = { .va = va, .pid = pid, .psize = psize, .ric = ric };
> +	on_each_cpu_mask(cpus, do_tlbiel_va, &t, 1);
> +	if (atomic_read(&mm->context.copros) > 0)
> +		_tlbie_va(va, pid, psize, RIC_FLUSH_TLB);
> +}
> +
> +struct tlbiel_va_range {
> +	unsigned long pid;
> +	unsigned long start;
> +	unsigned long end;
> +	unsigned long page_size;
> +	unsigned long psize;
> +	bool also_pwc;
> +};
> +
> +static void do_tlbiel_va_range(void *info)
> +{
> +	struct tlbiel_va_range *t = info;
> +
> +	_tlbiel_va_range(t->start, t->end, t->pid, t->page_size,
> +			 t->psize, t->also_pwc);
> +}
> +
>  static __always_inline void _tlbie_lpid_va(unsigned long va, unsigned long lpid,
>  					   unsigned long psize, unsigned long ric)
>  {
> @@ -393,6 +473,21 @@ static inline void _tlbie_va_range(unsigned long start, unsigned long end,
>  	asm volatile("eieio; tlbsync; ptesync": : :"memory");
>  }
>
> +static inline void _tlbiel_va_range_multicast(struct mm_struct *mm,
> +				unsigned long start, unsigned long end,
> +				unsigned long pid, unsigned long page_size,
> +				unsigned long psize, bool also_pwc)
> +{
> +	struct cpumask *cpus = mm_cpumask(mm);
> +	struct tlbiel_va_range t = { .start = start, .end = end,
> +			.pid = pid, .page_size = page_size,
> +			.psize = psize, .also_pwc = also_pwc };
> +
> +	on_each_cpu_mask(cpus, do_tlbiel_va_range, &t, 1);
> +	if (atomic_read(&mm->context.copros) > 0)
> +		_tlbie_va_range(start, end, pid, page_size, psize, also_pwc);
> +}
> +
>  /*
>   * Base TLB flushing operations:
>   *
> @@ -530,10 +625,14 @@ void radix__flush_tlb_mm(struct mm_struct *mm)
>  			goto local;
>  		}
>
> -		if (mm_needs_flush_escalation(mm))
> -			_tlbie_pid(pid, RIC_FLUSH_ALL);
> -		else
> -			_tlbie_pid(pid, RIC_FLUSH_TLB);
> +		if (cputlb_use_tlbie()) {
> +			if (mm_needs_flush_escalation(mm))
> +				_tlbie_pid(pid, RIC_FLUSH_ALL);
> +			else
> +				_tlbie_pid(pid, RIC_FLUSH_TLB);
> +		} else {
> +			_tlbiel_pid_multicast(mm, pid, RIC_FLUSH_TLB);
> +		}
>  	} else {
>  local:
>  		_tlbiel_pid(pid, RIC_FLUSH_TLB);
> @@ -559,7 +658,10 @@ static void __flush_all_mm(struct mm_struct *mm, bool fullmm)
>  				goto local;
>  			}
>  		}
> -		_tlbie_pid(pid, RIC_FLUSH_ALL);
> +		if (cputlb_use_tlbie())
> +			_tlbie_pid(pid, RIC_FLUSH_ALL);
> +		else
> +			_tlbiel_pid_multicast(mm, pid, RIC_FLUSH_ALL);
>  	} else {
>  local:
>  		_tlbiel_pid(pid, RIC_FLUSH_ALL);
> @@ -594,7 +696,10 @@ void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
>  			exit_flush_lazy_tlbs(mm);
>  			goto local;
>  		}
> -		_tlbie_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
> +		if (cputlb_use_tlbie())
> +			_tlbie_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
> +		else
> +			_tlbiel_va_multicast(mm, vmaddr, pid, psize, RIC_FLUSH_TLB);
>  	} else {
>  local:
>  		_tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
> @@ -616,6 +721,24 @@ EXPORT_SYMBOL(radix__flush_tlb_page);
>  #define radix__flush_all_mm radix__local_flush_all_mm
>  #endif /* CONFIG_SMP */
>
> +static void do_tlbiel_kernel(void *info)
> +{
> +	_tlbiel_pid(0, RIC_FLUSH_ALL);
> +}
> +
> +static inline void _tlbiel_kernel_broadcast(void)
> +{
> +	on_each_cpu(do_tlbiel_kernel, NULL, 1);
> +	if (tlbie_capable) {
> +		/*
> +		 * Coherent accelerators don't refcount kernel memory mappings,
> +		 * so have to always issue a tlbie for them. This is quite a
> +		 * slow path anyway.
> +		 */
> +		_tlbie_pid(0, RIC_FLUSH_ALL);
> +	}
> +}
> +
>  /*
>   * If kernel TLBIs ever become local rather than global, then
>   * drivers/misc/ocxl/link.c:ocxl_link_add_pe will need some work, as it
> @@ -623,7 +746,10 @@ EXPORT_SYMBOL(radix__flush_tlb_page);
>   */
>  void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end)
>  {
> -	_tlbie_pid(0, RIC_FLUSH_ALL);
> +	if (cputlb_use_tlbie())
> +		_tlbie_pid(0, RIC_FLUSH_ALL);
> +	else
> +		_tlbiel_kernel_broadcast();
>  }
>  EXPORT_SYMBOL(radix__flush_tlb_kernel_range);
>
> @@ -679,10 +805,14 @@ static inline void __radix__flush_tlb_range(struct mm_struct *mm,
>  		if (local) {
>  			_tlbiel_pid(pid, RIC_FLUSH_TLB);
>  		} else {
> -			if (mm_needs_flush_escalation(mm))
> -				_tlbie_pid(pid, RIC_FLUSH_ALL);
> -			else
> -				_tlbie_pid(pid, RIC_FLUSH_TLB);
> +			if (cputlb_use_tlbie()) {
> +				if (mm_needs_flush_escalation(mm))
> +					_tlbie_pid(pid, RIC_FLUSH_ALL);
> +				else
> +					_tlbie_pid(pid, RIC_FLUSH_TLB);
> +			} else {
> +				_tlbiel_pid_multicast(mm, pid, RIC_FLUSH_TLB);
> +			}
>  		}
>  	} else {
>  		bool hflush = flush_all_sizes;
> @@ -707,8 +837,8 @@ static inline void __radix__flush_tlb_range(struct mm_struct *mm,
>  			gflush = false;
>  		}
>
> -		asm volatile("ptesync": : :"memory");
>  		if (local) {
> +			asm volatile("ptesync": : :"memory");
>  			__tlbiel_va_range(start, end, pid, page_size,
>  					  mmu_virtual_psize);
>  			if (hflush)
>  				__tlbiel_va_range(hstart, hend, pid,
>  						PMD_SIZE, MMU_PAGE_2M);
>  			if (gflush)
>  				__tlbiel_va_range(gstart, gend, pid,
>  						PUD_SIZE, MMU_PAGE_1G);
>  			asm volatile("ptesync": : :"memory");
> -		} else {
> +		} else if (cputlb_use_tlbie()) {
> +			asm volatile("ptesync": : :"memory");
>  			__tlbie_va_range(start, end, pid, page_size,
>  					 mmu_virtual_psize);
>  			if (hflush)
>  				__tlbie_va_range(hstart, hend, pid,
>  						PMD_SIZE, MMU_PAGE_2M);
>  			if (gflush)
>  				__tlbie_va_range(gstart, gend, pid,
>  						PUD_SIZE, MMU_PAGE_1G);
>  			fixup_tlbie();
>  			asm volatile("eieio; tlbsync; ptesync": : :"memory");
> +		} else {
> +			_tlbiel_va_range_multicast(mm,
> +					start, end, pid, page_size, mmu_virtual_psize, false);
> +			if (hflush)
> +				_tlbiel_va_range_multicast(mm,
> +					hstart, hend, pid, PMD_SIZE, MMU_PAGE_2M, false);
> +			if (gflush)
> +				_tlbiel_va_range_multicast(mm,
> +					gstart, gend, pid, PUD_SIZE, MMU_PAGE_1G, false);
>  		}
>  	}
>  	preempt_enable();
> @@ -903,16 +1043,26 @@ static __always_inline void __radix__flush_tlb_range_psize(struct mm_struct *mm,
>  		if (local) {
>  			_tlbiel_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB);
>  		} else {
> -			if (mm_needs_flush_escalation(mm))
> -				also_pwc = true;
> +			if (cputlb_use_tlbie()) {
> +				if (mm_needs_flush_escalation(mm))
> +					also_pwc = true;
> +
> +				_tlbie_pid(pid,
> +					also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB);
> +			} else {
> +				_tlbiel_pid_multicast(mm, pid,
> +					also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB);
> +			}
>
> -			_tlbie_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB);
>  		}
>  	} else {
>  		if (local)
>  			_tlbiel_va_range(start, end, pid, page_size, psize, also_pwc);
> -		else
> +		else if (cputlb_use_tlbie())
>  			_tlbie_va_range(start, end, pid, page_size, psize, also_pwc);
> +		else
> +			_tlbiel_va_range_multicast(mm,
> +					start, end, pid, page_size, psize, also_pwc);
>  	}
>  	preempt_enable();
>  }
> @@ -954,7 +1104,11 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
>  			exit_flush_lazy_tlbs(mm);
>  			goto local;
>  		}
> -		_tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
> +		if (cputlb_use_tlbie())
> +			_tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
> +		else
> +			_tlbiel_va_range_multicast(mm,
> +					addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
>  	} else {
>  local:
>  		_tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
> diff --git a/drivers/misc/cxl/main.c b/drivers/misc/cxl/main.c
> index 482a2c1b340a..43b312d06e3e 100644
> --- a/drivers/misc/cxl/main.c
> +++ b/drivers/misc/cxl/main.c
> @@ -18,6 +18,7 @@
>  #include <linux/sched/task.h>
>
>  #include <asm/cputable.h>
> +#include <asm/mmu.h>
>  #include <misc/cxl-base.h>
>
>  #include "cxl.h"
> @@ -315,6 +316,9 @@ static int __init init_cxl(void)
>  {
>  	int rc = 0;
>
> +	if (!tlbie_capable)
> +		return -EINVAL;
> +
>  	if ((rc = cxl_file_init()))
>  		return rc;
>
> diff --git a/drivers/misc/ocxl/main.c b/drivers/misc/ocxl/main.c
> index 7210d9e059be..ef73cf35dda2 100644
> --- a/drivers/misc/ocxl/main.c
> +++ b/drivers/misc/ocxl/main.c
> @@ -2,12 +2,16 @@
>  // Copyright 2017 IBM Corp.
>  #include <linux/module.h>
>  #include <linux/pci.h>
> +#include <asm/mmu.h>
>  #include "ocxl_internal.h"
>
>  static int __init init_ocxl(void)
>  {
>  	int rc = 0;
>
> +	if (!tlbie_capable)
> +		return -EINVAL;
> +
>  	rc = ocxl_file_init();
>  	if (rc)
>  		return rc;
>
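To make the WARN_ON question above concrete, here is the rough, untested
sketch I had in mind, against mm_context_add_copro() — this assumes
tlbie_capable and tlbie_enabled are visible wherever that helper lives,
which I haven't actually checked:

static inline void mm_context_add_copro(struct mm_struct *mm)
{
	/*
	 * Coprocessors can only be kept coherent via global (tlbie)
	 * invalidations, so complain loudly if the boot option has
	 * removed tlbie entirely.
	 */
	WARN_ON(!tlbie_capable);

	/*
	 * Alternatively, if only the runtime debugfs switch turned
	 * tlbie off, we could force it back on here rather than warn:
	 *
	 *	if (tlbie_capable)
	 *		tlbie_enabled = true;
	 */

	/*
	 * If any copro is in use, increment the active CPU count
	 * in order to force TLB invalidations to be global as to
	 * propagate to the Nest MMU.
	 */
	if (atomic_inc_return(&mm->context.copros) == 1)
		inc_mm_active_cpus(mm);
}

Either way it seems like the accelerator case needs some kind of guard once
tlbie can be disabled.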