On Tue, Sep 04, 2018 at 06:16:01PM +1000, Nicholas Piggin wrote:
> THP paths can defer splitting compound pages until after the actual
> remap and TLB flushes that split a huge PMD/PUD. This causes radix
> partition scope page table mappings to get out of sync with the host
> qemu page table mappings.
> 
> This results in random memory corruption in the guest when running
> with THP. The easiest way to reproduce is to use the KVM balloon to
> free up a lot of memory in the guest, then shrink the balloon to give
> the memory back, while some work is being done in the guest.
> 
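To illustrate the race being described: a minimal sketch of the pattern
the patch removes, reconstructed from the old hunk below (the names
mirror the fault path, but this fragment is an illustration, not the
kernel source):

	/*
	 * Old pattern: size the partition-scope mapping from the
	 * compound state of the page returned by gup.
	 */
	if (page && PageCompound(page)) {
		/*
		 * Racy: a THP split may already have remapped the range
		 * with small PTEs and flushed the TLB while the struct
		 * page is still compound (splitting the compound page
		 * itself is deferred), so compound_order() can justify
		 * a 2MB/1GB mapping that qemu no longer has.
		 */
		pte_size = PAGE_SIZE << compound_order(compound_head(page));
		if (pte_size >= PMD_SIZE)
			level = 1;	/* install a 2MB partition-scope PTE */
	}
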
> Cc: Paul Mackerras <pau...@ozlabs.org>
> Cc: David Gibson <da...@gibson.dropbear.id.au>
> Cc: "Aneesh Kumar K.V" <aneesh.ku...@linux.ibm.com>
> Cc: linuxppc-dev@lists.ozlabs.org
> Signed-off-by: Nicholas Piggin <npig...@gmail.com>

Seems to fix the problem on my test case.

Tested-by: David Gibson <da...@gibson.dropbear.id.au>

> ---
>  arch/powerpc/kvm/book3s_64_mmu_radix.c | 88 ++++++++++----------------
>  1 file changed, 34 insertions(+), 54 deletions(-)
> 
> diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
> index 0af1c0aea1fe..d8792445d95a 100644
> --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
> +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
> @@ -525,8 +525,8 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
>                                  unsigned long ea, unsigned long dsisr)
>  {
>       struct kvm *kvm = vcpu->kvm;
> -     unsigned long mmu_seq, pte_size;
> -     unsigned long gpa, gfn, hva, pfn;
> +     unsigned long mmu_seq;
> +     unsigned long gpa, gfn, hva;
>       struct kvm_memory_slot *memslot;
>       struct page *page = NULL;
>       long ret;
> @@ -623,9 +623,10 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
>        */
>       hva = gfn_to_hva_memslot(memslot, gfn);
>       if (upgrade_p && __get_user_pages_fast(hva, 1, 1, &page) == 1) {
> -             pfn = page_to_pfn(page);
>               upgrade_write = true;
>       } else {
> +             unsigned long pfn;
> +
>               /* Call KVM generic code to do the slow-path check */
>               pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
>                                          writing, upgrade_p);
> @@ -639,63 +640,42 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
>               }
>       }
>  
> -     /* See if we can insert a 1GB or 2MB large PTE here */
> -     level = 0;
> -     if (page && PageCompound(page)) {
> -             pte_size = PAGE_SIZE << compound_order(compound_head(page));
> -             if (pte_size >= PUD_SIZE &&
> -                 (gpa & (PUD_SIZE - PAGE_SIZE)) ==
> -                 (hva & (PUD_SIZE - PAGE_SIZE))) {
> -                     level = 2;
> -                     pfn &= ~((PUD_SIZE >> PAGE_SHIFT) - 1);
> -             } else if (pte_size >= PMD_SIZE &&
> -                        (gpa & (PMD_SIZE - PAGE_SIZE)) ==
> -                        (hva & (PMD_SIZE - PAGE_SIZE))) {
> -                     level = 1;
> -                     pfn &= ~((PMD_SIZE >> PAGE_SHIFT) - 1);
> -             }
> -     }
> -
>       /*
> -      * Compute the PTE value that we need to insert.
> +      * Read the PTE from the process' radix tree and use that
> +      * so we get the shift and attribute bits.
>        */
> -     if (page) {
> -             pgflags = _PAGE_READ | _PAGE_EXEC | _PAGE_PRESENT | _PAGE_PTE |
> -                     _PAGE_ACCESSED;
> -             if (writing || upgrade_write)
> -                     pgflags |= _PAGE_WRITE | _PAGE_DIRTY;
> -             pte = pfn_pte(pfn, __pgprot(pgflags));
> +     local_irq_disable();
> +     ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, &shift);
> +     pte = *ptep;
> +     local_irq_enable();
> +
> +     /* Get pte level from shift/size */
> +     if (shift == PUD_SHIFT &&
> +         (gpa & (PUD_SIZE - PAGE_SIZE)) ==
> +         (hva & (PUD_SIZE - PAGE_SIZE))) {
> +             level = 2;
> +     } else if (shift == PMD_SHIFT &&
> +                (gpa & (PMD_SIZE - PAGE_SIZE)) ==
> +                (hva & (PMD_SIZE - PAGE_SIZE))) {
> +             level = 1;
>       } else {
> -             /*
> -              * Read the PTE from the process' radix tree and use that
> -              * so we get the attribute bits.
> -              */
> -             local_irq_disable();
> -             ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, &shift);
> -             pte = *ptep;
> -             local_irq_enable();
> -             if (shift == PUD_SHIFT &&
> -                 (gpa & (PUD_SIZE - PAGE_SIZE)) ==
> -                 (hva & (PUD_SIZE - PAGE_SIZE))) {
> -                     level = 2;
> -             } else if (shift == PMD_SHIFT &&
> -                        (gpa & (PMD_SIZE - PAGE_SIZE)) ==
> -                        (hva & (PMD_SIZE - PAGE_SIZE))) {
> -                     level = 1;
> -             } else if (shift && shift != PAGE_SHIFT) {
> -                     /* Adjust PFN */
> -                     unsigned long mask = (1ul << shift) - PAGE_SIZE;
> -                     pte = __pte(pte_val(pte) | (hva & mask));
> -             }
> -             pte = __pte(pte_val(pte) | _PAGE_EXEC | _PAGE_ACCESSED);
> -             if (writing || upgrade_write) {
> -                     if (pte_val(pte) & _PAGE_WRITE)
> -                             pte = __pte(pte_val(pte) | _PAGE_DIRTY);
> -             } else {
> -             pte = __pte(pte_val(pte) & ~(_PAGE_WRITE | _PAGE_DIRTY));
> +             level = 0;
> +
> +             /* Cannot cope with an unknown page shift */
> +             if (shift && shift != PAGE_SHIFT) {
> +                     WARN_ON_ONCE(1);
> +                     return -EFAULT;
>               }
>       }
>  
> +     pte = __pte(pte_val(pte) | _PAGE_EXEC | _PAGE_ACCESSED);
> +     if (writing || upgrade_write) {
> +             if (pte_val(pte) & _PAGE_WRITE)
> +                     pte = __pte(pte_val(pte) | _PAGE_DIRTY);
> +     } else {
> +             pte = __pte(pte_val(pte) & ~(_PAGE_WRITE | _PAGE_DIRTY));
> +     }
> +
>       /* Allocate space in the tree and write the PTE */
>       ret = kvmppc_create_pte(kvm, pte, gpa, level, mmu_seq);
>  
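
For reference, the level selection the patch ends up with, written out
as a standalone helper (a hypothetical sketch, not part of the patch;
radix_level_for_fault() is a made-up name):

	/*
	 * Derive the partition-scope mapping level from the shift of
	 * the Linux PTE actually found in qemu's page table, so a
	 * deferred THP split can no longer leave a stale huge mapping.
	 */
	static int radix_level_for_fault(unsigned long gpa, unsigned long hva,
					 unsigned int shift)
	{
		/*
		 * A large mapping is only possible when gpa and hva are
		 * congruent modulo the large page size, i.e. they agree
		 * in all of the intermediate page-number bits.
		 */
		if (shift == PUD_SHIFT &&
		    (gpa & (PUD_SIZE - PAGE_SIZE)) ==
		    (hva & (PUD_SIZE - PAGE_SIZE)))
			return 2;	/* 1GB mapping */
		if (shift == PMD_SHIFT &&
		    (gpa & (PMD_SIZE - PAGE_SIZE)) ==
		    (hva & (PMD_SIZE - PAGE_SIZE)))
			return 1;	/* 2MB mapping */
		return 0;		/* base page size */
	}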

-- 
David Gibson                    | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
                                | _way_ _around_!
http://www.ozlabs.org/~dgibson
