Nicholas Piggin <npig...@gmail.com> writes:

> Rather than partition the guest PID space and catch and flush a rogue
> guest, instead work around this issue by ensuring the MMU is always
> disabled in HV mode while the guest MMU context is switched in.
>
> This may be a bit less efficient, but it is a lot less complicated and
> allows the P9 path to trivially implement the workaround too. Newer CPUs
> are not subject to this issue.
>
> Signed-off-by: Nicholas Piggin <npig...@gmail.com>
> ---
>  arch/powerpc/include/asm/mmu_context.h   |  6 ----
>  arch/powerpc/kvm/book3s_hv.c             | 10 ++++--
>  arch/powerpc/kvm/book3s_hv_interrupt.c   | 14 ++++++--
>  arch/powerpc/kvm/book3s_hv_rmhandlers.S  | 34 ------------------
>  arch/powerpc/mm/book3s64/radix_pgtable.c | 27 +++++---------
>  arch/powerpc/mm/book3s64/radix_tlb.c     | 46 ------------------------
>  arch/powerpc/mm/mmu_context.c            |  4 +--
>  7 files changed, 28 insertions(+), 113 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/mmu_context.h 
> b/arch/powerpc/include/asm/mmu_context.h
> index 652ce85f9410..bb5c7e5e142e 100644
> --- a/arch/powerpc/include/asm/mmu_context.h
> +++ b/arch/powerpc/include/asm/mmu_context.h
> @@ -122,12 +122,6 @@ static inline bool need_extra_context(struct mm_struct 
> *mm, unsigned long ea)
>  }
>  #endif
>
> -#if defined(CONFIG_KVM_BOOK3S_HV_POSSIBLE) && defined(CONFIG_PPC_RADIX_MMU)
> -extern void radix_kvm_prefetch_workaround(struct mm_struct *mm);
> -#else
> -static inline void radix_kvm_prefetch_workaround(struct mm_struct *mm) { }
> -#endif
> -
>  extern void switch_cop(struct mm_struct *next);
>  extern int use_cop(unsigned long acop, struct mm_struct *mm);
>  extern void drop_cop(unsigned long acop, struct mm_struct *mm);
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index ad16331c3370..c3064075f1d7 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -806,6 +806,10 @@ static int kvmppc_h_set_mode(struct kvm_vcpu *vcpu, 
> unsigned long mflags,
>               /* KVM does not support mflags=2 (AIL=2) */
>               if (mflags != 0 && mflags != 3)
>                       return H_UNSUPPORTED_FLAG_START;
> +             /* Prefetch bug */
> +             if (cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG) &&
> +                             kvmhv_vcpu_is_radix(vcpu) && mflags == 3)
> +                     return H_UNSUPPORTED_FLAG_START;

So does this mean that if the host has the prefetch bug, all of its
guests will run with AIL=0 all the time? And what we're avoiding here is
a guest setting AIL=3 which would (since there's no HAIL) cause
hypervisor interrupts to be taken with MMU on, is that it?

Do we need to add this check to kvmppc_set_lpcr as well? QEMU
could in theory call the KVM_SET_ONE_REG ioctl and set AIL to any value.

>               return H_TOO_HARD;
>       default:
>               return H_TOO_HARD;
> @@ -4286,8 +4290,7 @@ static int kvmppc_vcpu_run_hv(struct kvm_vcpu *vcpu)
>                * The TLB prefetch bug fixup is only in the kvmppc_run_vcpu
>                * path, which also handles hash and dependent threads mode.
>                */
> -             if (kvm->arch.threads_indep && kvm_is_radix(kvm) &&
> -                 !cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG))
> +             if (kvm->arch.threads_indep && kvm_is_radix(kvm))
>                       r = kvmhv_run_single_vcpu(vcpu, ~(u64)0,
>                                                 vcpu->arch.vcore->lpcr);
>               else
> @@ -4914,6 +4917,9 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
>               if (!indep_threads_mode && !cpu_has_feature(CPU_FTR_HVMODE)) {
>                       pr_warn("KVM: Ignoring indep_threads_mode=N in nested 
> hypervisor\n");
>                       kvm->arch.threads_indep = true;
> +             } else if (!indep_threads_mode && 
> cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG)) {
> +                     pr_warn("KVM: Ignoring indep_threads_mode=N on 
> pre-DD2.2 POWER9\n");
> +                     kvm->arch.threads_indep = true;
>               } else {
>                       kvm->arch.threads_indep = indep_threads_mode;
>               }
> diff --git a/arch/powerpc/kvm/book3s_hv_interrupt.c 
> b/arch/powerpc/kvm/book3s_hv_interrupt.c
> index b93d861d8538..9784da3f8565 100644
> --- a/arch/powerpc/kvm/book3s_hv_interrupt.c
> +++ b/arch/powerpc/kvm/book3s_hv_interrupt.c
> @@ -223,6 +223,9 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 
> time_limit, unsigned long lpc
>
>       mtspr(SPRN_AMOR, ~0UL);
>
> +     if (cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG))
> +             __mtmsrd(msr & ~(MSR_IR|MSR_DR|MSR_RI), 0);
> +
>       switch_mmu_to_guest_radix(kvm, vcpu, lpcr);
>
>       /*
> @@ -231,7 +234,8 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 
> time_limit, unsigned long lpc
>        */
>       mtspr(SPRN_HDEC, hdec);
>
> -     __mtmsrd(0, 1); /* clear RI */
> +     if (!cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG))
> +             __mtmsrd(0, 1); /* clear RI */
>
>       mtspr(SPRN_DAR, vcpu->arch.shregs.dar);
>       mtspr(SPRN_DSISR, vcpu->arch.shregs.dsisr);
> @@ -338,8 +342,6 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 
> time_limit, unsigned long lpc
>
>       radix_clear_slb();
>
> -     __mtmsrd(msr, 0);
> -
>       accumulate_time(vcpu, &vcpu->arch.rm_exit);
>
>       /* Advance host PURR/SPURR by the amount used by guest */
> @@ -406,6 +408,12 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 
> time_limit, unsigned long lpc
>
>       switch_mmu_to_host_radix(kvm, host_pidr);
>
> +     /*
> +      * If we are in real mode, don't switch MMU on until the MMU is
> +      * switched to host, to avoid the P9 radix prefetch bug.
> +      */
> +     __mtmsrd(msr, 0);
> +
>       end_timing(vcpu);
>
>       return trap;
> diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S 
> b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> index 6118e8a97ddd..61f71a7df238 100644
> --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> @@ -1710,40 +1710,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
>       eieio
>       tlbsync
>       ptesync
> -
> -BEGIN_FTR_SECTION
> -     /* Radix: Handle the case where the guest used an illegal PID */
> -     LOAD_REG_ADDR(r4, mmu_base_pid)
> -     lwz     r3, VCPU_GUEST_PID(r9)
> -     lwz     r5, 0(r4)
> -     cmpw    cr0,r3,r5
> -     blt     2f
> -
> -     /*
> -      * Illegal PID, the HW might have prefetched and cached in the TLB
> -      * some translations for the  LPID 0 / guest PID combination which
> -      * Linux doesn't know about, so we need to flush that PID out of
> -      * the TLB. First we need to set LPIDR to 0 so tlbiel applies to
> -      * the right context.
> -     */
> -     li      r0,0
> -     mtspr   SPRN_LPID,r0
> -     isync
> -
> -     /* Then do a congruence class local flush */
> -     ld      r6,VCPU_KVM(r9)
> -     lwz     r0,KVM_TLB_SETS(r6)
> -     mtctr   r0
> -     li      r7,0x400                /* IS field = 0b01 */
> -     ptesync
> -     sldi    r0,r3,32                /* RS has PID */
> -1:   PPC_TLBIEL(7,0,2,1,1)           /* RIC=2, PRS=1, R=1 */
> -     addi    r7,r7,0x1000
> -     bdnz    1b
> -     ptesync
> -END_FTR_SECTION_IFSET(CPU_FTR_P9_RADIX_PREFETCH_BUG)
> -
> -2:
>  #endif /* CONFIG_PPC_RADIX_MMU */
>
>       /*
> diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c 
> b/arch/powerpc/mm/book3s64/radix_pgtable.c
> index 98f0b243c1ab..1ea95891a79e 100644
> --- a/arch/powerpc/mm/book3s64/radix_pgtable.c
> +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
> @@ -357,30 +357,19 @@ static void __init radix_init_pgtable(void)
>       }
>
>       /* Find out how many PID bits are supported */
> -     if (!cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG)) {
> -             if (!mmu_pid_bits)
> -                     mmu_pid_bits = 20;
> -             mmu_base_pid = 1;
> -     } else if (cpu_has_feature(CPU_FTR_HVMODE)) {
> -             if (!mmu_pid_bits)
> -                     mmu_pid_bits = 20;
> -#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
> +     if (!cpu_has_feature(CPU_FTR_HVMODE) &&
> +                     cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG)) {
>               /*
> -              * When KVM is possible, we only use the top half of the
> -              * PID space to avoid collisions between host and guest PIDs
> -              * which can cause problems due to prefetch when exiting the
> -              * guest with AIL=3
> +              * Older versions of KVM on these machines prefer if the
> +              * guest only uses the low 19 PID bits.
>                */
> -             mmu_base_pid = 1 << (mmu_pid_bits - 1);
> -#else
> -             mmu_base_pid = 1;
> -#endif
> -     } else {
> -             /* The guest uses the bottom half of the PID space */
>               if (!mmu_pid_bits)
>                       mmu_pid_bits = 19;
> -             mmu_base_pid = 1;
> +     } else {
> +             if (!mmu_pid_bits)
> +                     mmu_pid_bits = 20;
>       }
> +     mmu_base_pid = 1;
>
>       /*
>        * Allocate Partition table and process table for the
> diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c 
> b/arch/powerpc/mm/book3s64/radix_tlb.c
> index 409e61210789..312236a6b085 100644
> --- a/arch/powerpc/mm/book3s64/radix_tlb.c
> +++ b/arch/powerpc/mm/book3s64/radix_tlb.c
> @@ -1336,49 +1336,3 @@ void radix__flush_tlb_all(void)
>                    : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(0) : 
> "memory");
>       asm volatile("eieio; tlbsync; ptesync": : :"memory");
>  }
> -
> -#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
> -extern void radix_kvm_prefetch_workaround(struct mm_struct *mm)
> -{
> -     unsigned long pid = mm->context.id;
> -
> -     if (unlikely(pid == MMU_NO_CONTEXT))
> -             return;
> -
> -     if (!cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG))
> -             return;
> -
> -     /*
> -      * If this context hasn't run on that CPU before and KVM is
> -      * around, there's a slim chance that the guest on another
> -      * CPU just brought in obsolete translation into the TLB of
> -      * this CPU due to a bad prefetch using the guest PID on
> -      * the way into the hypervisor.
> -      *
> -      * We work around this here. If KVM is possible, we check if
> -      * any sibling thread is in KVM. If it is, the window may exist
> -      * and thus we flush that PID from the core.
> -      *
> -      * A potential future improvement would be to mark which PIDs
> -      * have never been used on the system and avoid it if the PID
> -      * is new and the process has no other cpumask bit set.
> -      */
> -     if (cpu_has_feature(CPU_FTR_HVMODE) && radix_enabled()) {
> -             int cpu = smp_processor_id();
> -             int sib = cpu_first_thread_sibling(cpu);
> -             bool flush = false;
> -
> -             for (; sib <= cpu_last_thread_sibling(cpu) && !flush; sib++) {
> -                     if (sib == cpu)
> -                             continue;
> -                     if (!cpu_possible(sib))
> -                             continue;
> -                     if (paca_ptrs[sib]->kvm_hstate.kvm_vcpu)
> -                             flush = true;
> -             }
> -             if (flush)
> -                     _tlbiel_pid(pid, RIC_FLUSH_ALL);
> -     }
> -}
> -EXPORT_SYMBOL_GPL(radix_kvm_prefetch_workaround);
> -#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
> diff --git a/arch/powerpc/mm/mmu_context.c b/arch/powerpc/mm/mmu_context.c
> index 18f20da0d348..7479d39976c9 100644
> --- a/arch/powerpc/mm/mmu_context.c
> +++ b/arch/powerpc/mm/mmu_context.c
> @@ -81,9 +81,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct 
> mm_struct *next,
>       if (cpu_has_feature(CPU_FTR_ALTIVEC))
>               asm volatile ("dssall");
>
> -     if (new_on_cpu)
> -             radix_kvm_prefetch_workaround(next);
> -     else
> +     if (!new_on_cpu)
>               membarrier_arch_switch_mm(prev, next, tsk);
>
>       /*

Reply via email to