Nick,

On Tuesday, 3 September 2019 1:29:31 AM AEST Nicholas Piggin wrote:
> Introduce two options to control the use of the tlbie instruction. A
> boot time option which completely disables the kernel using the
> instruction, this is currently incompatible with HASH MMU, KVM, and
> coherent accelerators.
Some accelerators (e.g. cxl, ocxl, npu) call mm_context_add_copro() to force
global TLB invalidations:

static inline void mm_context_add_copro(struct mm_struct *mm)
{
	/*
	 * If any copro is in use, increment the active CPU count
	 * in order to force TLB invalidations to be global as to
	 * propagate to the Nest MMU.
	 */
	if (atomic_inc_return(&mm->context.copros) == 1)
		inc_mm_active_cpus(mm);
}

Admittedly I haven't dug into all the details of this patch, but it sounds
like it might break the above if TLBIE is disabled. Do you think we should
add a WARN_ON if mm_context_add_copro() is called with TLBIE disabled? Or
perhaps even force TLBIE to be re-enabled if it is called with it disabled?
(A rough sketch of what I mean is below, after the quoted patch.)

- Alistair

> And a debugfs option can be switched at runtime and avoids using tlbie
> for invalidating CPU TLBs for normal process and kernel address
> mappings. Coherent accelerators are still managed with tlbie, as will
> KVM partition scope translations.
>
> Cross-CPU TLB flushing is implemented with IPIs and tlbiel. This is a
> basic implementation which does not attempt to make any optimisation
> beyond the tlbie implementation.
>
> This is useful for performance testing among other things. For example
> in certain situations on large systems, using IPIs may be faster than
> tlbie as they can be directed rather than broadcast. Later we may also
> take advantage of the IPIs to do more interesting things such as trim
> the mm cpumask more aggressively.
>
> Signed-off-by: Nicholas Piggin <npig...@gmail.com>
> ---
>  .../admin-guide/kernel-parameters.txt         |   4 +
>  arch/powerpc/include/asm/book3s/64/tlbflush.h |   9 +
>  arch/powerpc/kvm/book3s_hv.c                  |   6 +
>  arch/powerpc/mm/book3s64/pgtable.c            |  47 +++++
>  arch/powerpc/mm/book3s64/radix_tlb.c          | 190 ++++++++++++++++--
>  drivers/misc/cxl/main.c                       |   4 +
>  drivers/misc/ocxl/main.c                      |   4 +
>  7 files changed, 246 insertions(+), 18 deletions(-)
>
> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
> index d3cbb3ae62b6..65ae16549aa3 100644
> --- a/Documentation/admin-guide/kernel-parameters.txt
> +++ b/Documentation/admin-guide/kernel-parameters.txt
> @@ -860,6 +860,10 @@
>  	disable_radix	[PPC]
>  			Disable RADIX MMU mode on POWER9
>
> +	disable_tlbie	[PPC]
> +			Disable TLBIE instruction. Currently does not work
> +			with KVM, with HASH MMU, or with coherent accelerators.
> +
>  	disable_cpu_apicid= [X86,APIC,SMP]
>  			Format: <int>
>  			The number of initial APIC ID for the
> diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h b/arch/powerpc/include/asm/book3s/64/tlbflush.h
> index ebf572ea621e..7aa8195b6cff 100644
> --- a/arch/powerpc/include/asm/book3s/64/tlbflush.h
> +++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h
> @@ -162,4 +162,13 @@ static inline void flush_tlb_pgtable(struct mmu_gather *tlb, unsigned long addre
>
>  	radix__flush_tlb_pwc(tlb, address);
>  }
> +
> +extern bool tlbie_capable;
> +extern bool tlbie_enabled;
> +
> +static inline bool cputlb_use_tlbie(void)
> +{
> +	return tlbie_enabled;
> +}
> +
>  #endif /* _ASM_POWERPC_BOOK3S_64_TLBFLUSH_H */
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index cde3f5a4b3e4..3cdaa2a09a19 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -5462,6 +5462,12 @@ static int kvmppc_radix_possible(void)
>  static int kvmppc_book3s_init_hv(void)
>  {
>  	int r;
> +
> +	if (!tlbie_capable) {
> +		pr_err("KVM-HV: Host does not support TLBIE\n");
> +		return -ENODEV;
> +	}
> +
>  	/*
>  	 * FIXME!! Do we need to check on all cpus ?
>  	 */
> diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c
> index 351eb78eed55..75483b40fcb1 100644
> --- a/arch/powerpc/mm/book3s64/pgtable.c
> +++ b/arch/powerpc/mm/book3s64/pgtable.c
> @@ -8,6 +8,7 @@
>  #include <linux/memblock.h>
>  #include <misc/cxl-base.h>
>
> +#include <asm/debugfs.h>
>  #include <asm/pgalloc.h>
>  #include <asm/tlb.h>
>  #include <asm/trace.h>
> @@ -469,3 +470,49 @@ int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl,
>
>  	return true;
>  }
> +
> +/*
> + * Does the CPU support tlbie?
> + */
> +bool tlbie_capable __read_mostly = true;
> +EXPORT_SYMBOL(tlbie_capable);
> +
> +/*
> + * Should tlbie be used for management of CPU TLBs, for kernel and process
> + * address spaces? tlbie may still be used for nMMU accelerators, and for KVM
> + * guest address spaces.
> + */
> +bool tlbie_enabled __read_mostly = true;
> +
> +static int __init setup_disable_tlbie(char *str)
> +{
> +	if (!radix_enabled()) {
> +		pr_err("disable_tlbie: Unable to disable TLBIE with Hash MMU.\n");
> +		return 1;
> +	}
> +
> +	tlbie_capable = false;
> +	tlbie_enabled = false;
> +
> +	return 1;
> +}
> +__setup("disable_tlbie", setup_disable_tlbie);
> +
> +static int __init pgtable_debugfs_setup(void)
> +{
> +	if (!tlbie_capable)
> +		return 0;
> +
> +	/*
> +	 * There is no locking vs tlb flushing when changing this value.
> +	 * The tlb flushers will see one value or another, and use either
> +	 * tlbie or tlbiel with IPIs. In both cases the TLBs will be
> +	 * invalidated as expected.
> +	 */
> +	debugfs_create_bool("tlbie_enabled", 0600,
> +			powerpc_debugfs_root,
> +			&tlbie_enabled);
> +
> +	return 0;
> +}
> +arch_initcall(pgtable_debugfs_setup);
> diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
> index f9cf8ae59831..631be42abd33 100644
> --- a/arch/powerpc/mm/book3s64/radix_tlb.c
> +++ b/arch/powerpc/mm/book3s64/radix_tlb.c
> @@ -270,6 +270,39 @@ static inline void _tlbie_pid(unsigned long pid, unsigned long ric)
>  	asm volatile("eieio; tlbsync; ptesync": : :"memory");
>  }
>
> +struct tlbiel_pid {
> +	unsigned long pid;
> +	unsigned long ric;
> +};
> +
> +static void do_tlbiel_pid(void *info)
> +{
> +	struct tlbiel_pid *t = info;
> +
> +	if (t->ric == RIC_FLUSH_TLB)
> +		_tlbiel_pid(t->pid, RIC_FLUSH_TLB);
> +	else if (t->ric == RIC_FLUSH_PWC)
> +		_tlbiel_pid(t->pid, RIC_FLUSH_PWC);
> +	else
> +		_tlbiel_pid(t->pid, RIC_FLUSH_ALL);
> +}
> +
> +static inline void _tlbiel_pid_multicast(struct mm_struct *mm,
> +				unsigned long pid, unsigned long ric)
> +{
> +	struct cpumask *cpus = mm_cpumask(mm);
> +	struct tlbiel_pid t = { .pid = pid, .ric = ric };
> +
> +	on_each_cpu_mask(cpus, do_tlbiel_pid, &t, 1);
> +	/*
> +	 * Always want the CPU translations to be invalidated with tlbiel in
> +	 * these paths, so while coprocessors must use tlbie, we can not
> +	 * optimise away the tlbiel component.
> +	 */
> +	if (atomic_read(&mm->context.copros) > 0)
> +		_tlbie_pid(pid, RIC_FLUSH_ALL);
> +}
> +
>  static inline void _tlbie_lpid(unsigned long lpid, unsigned long ric)
>  {
>  	asm volatile("ptesync": : :"memory");
> @@ -370,6 +403,53 @@ static __always_inline void _tlbie_va(unsigned long va, unsigned long pid,
>  	asm volatile("eieio; tlbsync; ptesync": : :"memory");
>  }
>
> +struct tlbiel_va {
> +	unsigned long pid;
> +	unsigned long va;
> +	unsigned long psize;
> +	unsigned long ric;
> +};
> +
> +static void do_tlbiel_va(void *info)
> +{
> +	struct tlbiel_va *t = info;
> +
> +	if (t->ric == RIC_FLUSH_TLB)
> +		_tlbiel_va(t->va, t->pid, t->psize, RIC_FLUSH_TLB);
> +	else if (t->ric == RIC_FLUSH_PWC)
> +		_tlbiel_va(t->va, t->pid, t->psize, RIC_FLUSH_PWC);
> +	else
> +		_tlbiel_va(t->va, t->pid, t->psize, RIC_FLUSH_ALL);
> +}
> +
> +static inline void _tlbiel_va_multicast(struct mm_struct *mm,
> +				unsigned long va, unsigned long pid,
> +				unsigned long psize, unsigned long ric)
> +{
> +	struct cpumask *cpus = mm_cpumask(mm);
> +	struct tlbiel_va t = { .va = va, .pid = pid, .psize = psize, .ric = ric };
> +	on_each_cpu_mask(cpus, do_tlbiel_va, &t, 1);
> +	if (atomic_read(&mm->context.copros) > 0)
> +		_tlbie_va(va, pid, psize, RIC_FLUSH_TLB);
> +}
> +
> +struct tlbiel_va_range {
> +	unsigned long pid;
> +	unsigned long start;
> +	unsigned long end;
> +	unsigned long page_size;
> +	unsigned long psize;
> +	bool also_pwc;
> +};
> +
> +static void do_tlbiel_va_range(void *info)
> +{
> +	struct tlbiel_va_range *t = info;
> +
> +	_tlbiel_va_range(t->start, t->end, t->pid, t->page_size,
> +			 t->psize, t->also_pwc);
> +}
> +
>  static __always_inline void _tlbie_lpid_va(unsigned long va, unsigned long lpid,
>  					   unsigned long psize, unsigned long ric)
>  {
> @@ -393,6 +473,21 @@ static inline void _tlbie_va_range(unsigned long start, unsigned long end,
>  	asm volatile("eieio; tlbsync; ptesync": : :"memory");
>  }
>
> +static inline void _tlbiel_va_range_multicast(struct mm_struct *mm,
> +				unsigned long start, unsigned long end,
> +				unsigned long pid, unsigned long page_size,
> +				unsigned long psize, bool also_pwc)
> +{
> +	struct cpumask *cpus = mm_cpumask(mm);
> +	struct tlbiel_va_range t = { .start = start, .end = end,
> +			.pid = pid, .page_size = page_size,
> +			.psize = psize, .also_pwc = also_pwc };
> +
> +	on_each_cpu_mask(cpus, do_tlbiel_va_range, &t, 1);
> +	if (atomic_read(&mm->context.copros) > 0)
> +		_tlbie_va_range(start, end, pid, page_size, psize, also_pwc);
> +}
> +
>  /*
>   * Base TLB flushing operations:
>   *
> @@ -530,10 +625,14 @@ void radix__flush_tlb_mm(struct mm_struct *mm)
>  			goto local;
>  		}
>
> -		if (mm_needs_flush_escalation(mm))
> -			_tlbie_pid(pid, RIC_FLUSH_ALL);
> -		else
> -			_tlbie_pid(pid, RIC_FLUSH_TLB);
> +		if (cputlb_use_tlbie()) {
> +			if (mm_needs_flush_escalation(mm))
> +				_tlbie_pid(pid, RIC_FLUSH_ALL);
> +			else
> +				_tlbie_pid(pid, RIC_FLUSH_TLB);
> +		} else {
> +			_tlbiel_pid_multicast(mm, pid, RIC_FLUSH_TLB);
> +		}
>  	} else {
>  local:
>  		_tlbiel_pid(pid, RIC_FLUSH_TLB);
> @@ -559,7 +658,10 @@ static void __flush_all_mm(struct mm_struct *mm, bool fullmm)
>  				goto local;
>  			}
>  		}
> -		_tlbie_pid(pid, RIC_FLUSH_ALL);
> +		if (cputlb_use_tlbie())
> +			_tlbie_pid(pid, RIC_FLUSH_ALL);
> +		else
> +			_tlbiel_pid_multicast(mm, pid, RIC_FLUSH_ALL);
>  	} else {
>  local:
>  		_tlbiel_pid(pid, RIC_FLUSH_ALL);
> @@ -594,7 +696,10 @@ void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
>  			exit_flush_lazy_tlbs(mm);
>  			goto local;
>  		}
> -		_tlbie_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
> +		if (cputlb_use_tlbie())
> +			_tlbie_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
> +		else
> +			_tlbiel_va_multicast(mm, vmaddr, pid, psize, RIC_FLUSH_TLB);
>  	} else {
>  local:
>  		_tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
> @@ -616,6 +721,24 @@ EXPORT_SYMBOL(radix__flush_tlb_page);
>  #define radix__flush_all_mm radix__local_flush_all_mm
>  #endif /* CONFIG_SMP */
>
> +static void do_tlbiel_kernel(void *info)
> +{
> +	_tlbiel_pid(0, RIC_FLUSH_ALL);
> +}
> +
> +static inline void _tlbiel_kernel_broadcast(void)
> +{
> +	on_each_cpu(do_tlbiel_kernel, NULL, 1);
> +	if (tlbie_capable) {
> +		/*
> +		 * Coherent accelerators don't refcount kernel memory mappings,
> +		 * so have to always issue a tlbie for them. This is quite a
> +		 * slow path anyway.
> +		 */
> +		_tlbie_pid(0, RIC_FLUSH_ALL);
> +	}
> +}
> +
>  /*
>   * If kernel TLBIs ever become local rather than global, then
>   * drivers/misc/ocxl/link.c:ocxl_link_add_pe will need some work, as it
> @@ -623,7 +746,10 @@ EXPORT_SYMBOL(radix__flush_tlb_page);
>   */
>  void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end)
>  {
> -	_tlbie_pid(0, RIC_FLUSH_ALL);
> +	if (cputlb_use_tlbie())
> +		_tlbie_pid(0, RIC_FLUSH_ALL);
> +	else
> +		_tlbiel_kernel_broadcast();
>  }
>  EXPORT_SYMBOL(radix__flush_tlb_kernel_range);
>
> @@ -679,10 +805,14 @@ static inline void __radix__flush_tlb_range(struct mm_struct *mm,
>  		if (local) {
>  			_tlbiel_pid(pid, RIC_FLUSH_TLB);
>  		} else {
> -			if (mm_needs_flush_escalation(mm))
> -				_tlbie_pid(pid, RIC_FLUSH_ALL);
> -			else
> -				_tlbie_pid(pid, RIC_FLUSH_TLB);
> +			if (cputlb_use_tlbie()) {
> +				if (mm_needs_flush_escalation(mm))
> +					_tlbie_pid(pid, RIC_FLUSH_ALL);
> +				else
> +					_tlbie_pid(pid, RIC_FLUSH_TLB);
> +			} else {
> +				_tlbiel_pid_multicast(mm, pid, RIC_FLUSH_TLB);
> +			}
>  		}
>  	} else {
>  		bool hflush = flush_all_sizes;
> @@ -707,8 +837,8 @@ static inline void __radix__flush_tlb_range(struct mm_struct *mm,
>  			gflush = false;
>  		}
>
> -		asm volatile("ptesync": : :"memory");
>  		if (local) {
> +			asm volatile("ptesync": : :"memory");
>  			__tlbiel_va_range(start, end, pid, page_size,
>  					  mmu_virtual_psize);
>  			if (hflush)
>  				__tlbiel_va_range(hstart, hend, pid,
>  						PMD_SIZE, MMU_PAGE_2M);
>  			if (gflush)
>  				__tlbiel_va_range(gstart, gend, pid,
>  						PUD_SIZE, MMU_PAGE_1G);
>  			asm volatile("ptesync": : :"memory");
> -		} else {
> +		} else if (cputlb_use_tlbie()) {
> +			asm volatile("ptesync": : :"memory");
>  			__tlbie_va_range(start, end, pid, page_size,
>  					 mmu_virtual_psize);
>  			if (hflush)
>  				__tlbie_va_range(hstart, hend, pid,
>  						PMD_SIZE, MMU_PAGE_2M);
>  			if (gflush)
>  				__tlbie_va_range(gstart, gend, pid,
>  						PUD_SIZE, MMU_PAGE_1G);
>  			fixup_tlbie();
>  			asm volatile("eieio; tlbsync; ptesync": : :"memory");
> +		} else {
> +			_tlbiel_va_range_multicast(mm,
> +					start, end, pid, page_size, mmu_virtual_psize, false);
> +			if (hflush)
> +				_tlbiel_va_range_multicast(mm,
> +					hstart, hend, pid, PMD_SIZE, MMU_PAGE_2M, false);
> +			if (gflush)
> +				_tlbiel_va_range_multicast(mm,
> +					gstart, gend, pid, PUD_SIZE, MMU_PAGE_1G, false);
>  		}
>  	}
>  	preempt_enable();
> @@ -903,16 +1043,26 @@ static __always_inline void __radix__flush_tlb_range_psize(struct mm_struct *mm,
>  		if (local) {
>  			_tlbiel_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB);
>  		} else {
> -			if (mm_needs_flush_escalation(mm))
> -				also_pwc = true;
> +			if (cputlb_use_tlbie()) {
> +				if (mm_needs_flush_escalation(mm))
> +					also_pwc = true;
> +
> +				_tlbie_pid(pid,
> +					also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB);
> +			} else {
> +				_tlbiel_pid_multicast(mm, pid,
> +					also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB);
> +			}
>
> -			_tlbie_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB);
>  		}
>  	} else {
>  		if (local)
>  			_tlbiel_va_range(start, end, pid, page_size, psize, also_pwc);
> -		else
> +		else if (cputlb_use_tlbie())
>  			_tlbie_va_range(start, end, pid, page_size, psize, also_pwc);
> +		else
> +			_tlbiel_va_range_multicast(mm,
> +					start, end, pid, page_size, psize, also_pwc);
>  	}
>  	preempt_enable();
>  }
> @@ -954,7 +1104,11 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
>  			exit_flush_lazy_tlbs(mm);
>  			goto local;
>  		}
> -		_tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
> +		if (cputlb_use_tlbie())
> +			_tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
> +		else
> +			_tlbiel_va_range_multicast(mm,
> +					addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
>  	} else {
>  local:
>  		_tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
> diff --git a/drivers/misc/cxl/main.c b/drivers/misc/cxl/main.c
> index 482a2c1b340a..43b312d06e3e 100644
> --- a/drivers/misc/cxl/main.c
> +++ b/drivers/misc/cxl/main.c
> @@ -18,6 +18,7 @@
>  #include <linux/sched/task.h>
>
>  #include <asm/cputable.h>
> +#include <asm/mmu.h>
>  #include <misc/cxl-base.h>
>
>  #include "cxl.h"
> @@ -315,6 +316,9 @@ static int __init init_cxl(void)
>  {
>  	int rc = 0;
>
> +	if (!tlbie_capable)
> +		return -EINVAL;
> +
>  	if ((rc = cxl_file_init()))
>  		return rc;
>
> diff --git a/drivers/misc/ocxl/main.c b/drivers/misc/ocxl/main.c
> index 7210d9e059be..ef73cf35dda2 100644
> --- a/drivers/misc/ocxl/main.c
> +++ b/drivers/misc/ocxl/main.c
> @@ -2,12 +2,16 @@
>  // Copyright 2017 IBM Corp.
>  #include <linux/module.h>
>  #include <linux/pci.h>
> +#include <asm/mmu.h>
>  #include "ocxl_internal.h"
>
>  static int __init init_ocxl(void)
>  {
>  	int rc = 0;
>
> +	if (!tlbie_capable)
> +		return -EINVAL;
> +
>  	rc = ocxl_file_init();
>  	if (rc)
>  		return rc;
>
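To make the WARN_ON question above concrete, here is the rough, untested
sketch I had in mind, against mm_context_add_copro() — this assumes
tlbie_capable and tlbie_enabled are visible wherever that helper lives,
which I haven't actually checked:

static inline void mm_context_add_copro(struct mm_struct *mm)
{
	/*
	 * Coprocessors can only be kept coherent via global (tlbie)
	 * invalidations, so complain loudly if the boot option has
	 * removed tlbie entirely.
	 */
	WARN_ON(!tlbie_capable);

	/*
	 * Alternatively, if only the runtime debugfs switch turned
	 * tlbie off, we could force it back on here rather than warn:
	 *
	 *	if (tlbie_capable)
	 *		tlbie_enabled = true;
	 */

	/*
	 * If any copro is in use, increment the active CPU count
	 * in order to force TLB invalidations to be global as to
	 * propagate to the Nest MMU.
	 */
	if (atomic_inc_return(&mm->context.copros) == 1)
		inc_mm_active_cpus(mm);
}

Either way it seems like the accelerator case needs some kind of guard once
tlbie can be disabled.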