On Wed, Feb 27, 2019 at 07:42:35PM +1100, Alexey Kardashevskiy wrote:
> We already allocate hardware TCE tables in multiple levels and skip
> intermediate levels when we can, now it is a turn of the KVM TCE tables.
> Thankfully these are allocated already in 2 levels.
> 
> This moves the table's last level allocation from the creating helper to
> kvmppc_tce_put() and kvm_spapr_tce_fault().
> 
> This adds kvmppc_rm_ioba_validate() to do an additional test if
> the consequent kvmppc_tce_put() needs a page which has not been allocated;
> if this is the case, we bail out to virtual mode handlers.
> 
> Signed-off-by: Alexey Kardashevskiy <[email protected]>

Reviewed-by: David Gibson <[email protected]>

> ---
> 
> For NVLink2 passthrough guests with 128TiB DMA windows (when we push GPU RAM
> above the PCI MMIO window in the guest) and very fragmented system RAM
> the difference is about 16GiB of RAM before and after this patch.
> 
> ---
>  arch/powerpc/kvm/book3s_64_vio.c    | 21 ++++------
>  arch/powerpc/kvm/book3s_64_vio_hv.c | 62 ++++++++++++++++++++++++++---
>  2 files changed, 64 insertions(+), 19 deletions(-)
> 
> diff --git a/arch/powerpc/kvm/book3s_64_vio.c 
> b/arch/powerpc/kvm/book3s_64_vio.c
> index f02b049..281b56b 100644
> --- a/arch/powerpc/kvm/book3s_64_vio.c
> +++ b/arch/powerpc/kvm/book3s_64_vio.c
> @@ -228,7 +228,8 @@ static void release_spapr_tce_table(struct rcu_head *head)
>       unsigned long i, npages = kvmppc_tce_pages(stt->size);
>  
>       for (i = 0; i < npages; i++)
> -             __free_page(stt->pages[i]);
> +             if (stt->pages[i])
> +                     __free_page(stt->pages[i]);
>  
>       kfree(stt);
>  }
> @@ -241,6 +242,12 @@ static vm_fault_t kvm_spapr_tce_fault(struct vm_fault 
> *vmf)
>       if (vmf->pgoff >= kvmppc_tce_pages(stt->size))
>               return VM_FAULT_SIGBUS;
>  
> +     if (!stt->pages[vmf->pgoff]) {
> +             stt->pages[vmf->pgoff] = alloc_page(GFP_KERNEL | __GFP_ZERO);
> +             if (!stt->pages[vmf->pgoff])
> +                     return VM_FAULT_OOM;
> +     }
> +
>       page = stt->pages[vmf->pgoff];
>       get_page(page);
>       vmf->page = page;
> @@ -296,7 +303,6 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
>       struct kvmppc_spapr_tce_table *siter;
>       unsigned long npages, size = args->size;
>       int ret = -ENOMEM;
> -     int i;
>  
>       if (!args->size || args->page_shift < 12 || args->page_shift > 34 ||
>               (args->offset + args->size > (ULLONG_MAX >> args->page_shift)))
> @@ -320,12 +326,6 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
>       stt->kvm = kvm;
>       INIT_LIST_HEAD_RCU(&stt->iommu_tables);
>  
> -     for (i = 0; i < npages; i++) {
> -             stt->pages[i] = alloc_page(GFP_KERNEL | __GFP_ZERO);
> -             if (!stt->pages[i])
> -                     goto fail;
> -     }
> -
>       mutex_lock(&kvm->lock);
>  
>       /* Check this LIOBN hasn't been previously allocated */
> @@ -352,11 +352,6 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
>       if (ret >= 0)
>               return ret;
>  
> - fail:
> -     for (i = 0; i < npages; i++)
> -             if (stt->pages[i])
> -                     __free_page(stt->pages[i]);
> -
>       kfree(stt);
>   fail_acct:
>       kvmppc_account_memlimit(kvmppc_stt_pages(npages), false);
> diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c 
> b/arch/powerpc/kvm/book3s_64_vio_hv.c
> index 2206bc7..fbb920d 100644
> --- a/arch/powerpc/kvm/book3s_64_vio_hv.c
> +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
> @@ -158,23 +158,69 @@ static u64 *kvmppc_page_address(struct page *page)
>       return (u64 *) page_address(page);
>  }
>  
> +/*
> + * TCEs pages are allocated in kvmppc_tce_put() which won't be able to do so
> + * in real mode.
> + * Check if kvmppc_tce_put() can succeed in real mode, i.e. a TCEs page is
> + * allocated or not required (when clearing a tce entry).
> + */
> +static long kvmppc_rm_ioba_validate(struct kvmppc_spapr_tce_table *stt,
> +             unsigned long ioba, unsigned long npages, bool clearing)
> +{
> +     unsigned long i, sttpage, sttpages;
> +     unsigned long ret = kvmppc_ioba_validate(stt, ioba, npages);
> +
> +     if (ret)
> +             return ret;
> +     /*
> +      * clearing==true says kvmppc_tce_put won't be allocating pages
> +      * for empty tces.
> +      */
> +     if (clearing)
> +             return H_SUCCESS;
> +
> +     sttpage = ((ioba >> stt->page_shift) - stt->offset) / TCES_PER_PAGE;
> +     sttpages = (npages + TCES_PER_PAGE - 1) / TCES_PER_PAGE;
> +     for (i = sttpage; i < sttpage + sttpages; ++i)
> +             if (!stt->pages[i])
> +                     return H_TOO_HARD;
> +
> +     return H_SUCCESS;
> +}
> +
>  /*
>   * Handles TCE requests for emulated devices.
>   * Puts guest TCE values to the table and expects user space to convert them.
>   * Called in both real and virtual modes.
>   * Cannot fail so kvmppc_tce_validate must be called before it.
>   *
> - * WARNING: This will be called in real-mode on HV KVM and virtual
> - *          mode on PR KVM
> + * WARNING: This will be called in real-mode on HV HPT KVM and virtual
> + *          mode on PR KVM or HV radix KVM
>   */
>  void kvmppc_tce_put(struct kvmppc_spapr_tce_table *stt,
>               unsigned long idx, unsigned long tce)
>  {
>       struct page *page;
>       u64 *tbl;
> +     unsigned long sttpage;
>  
>       idx -= stt->offset;
> -     page = stt->pages[idx / TCES_PER_PAGE];
> +     sttpage = idx / TCES_PER_PAGE;
> +     page = stt->pages[sttpage];
> +
> +     if (!page) {
> +             /* We allow any TCE, not just with read|write permissions */
> +             if (!tce)
> +                     return;
> +             /*
> +              * We must not end up here in real mode,
> +              * kvmppc_rm_ioba_validate() takes care of this.
> +              */
> +             page = alloc_page(GFP_KERNEL | __GFP_ZERO);
> +             if (WARN_ON_ONCE(!page))
> +                     return;
> +             stt->pages[sttpage] = page;
> +     }
>       tbl = kvmppc_page_address(page);
>  
>       tbl[idx % TCES_PER_PAGE] = tce;
> @@ -381,7 +427,7 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned 
> long liobn,
>       if (!stt)
>               return H_TOO_HARD;
>  
> -     ret = kvmppc_ioba_validate(stt, ioba, 1);
> +     ret = kvmppc_rm_ioba_validate(stt, ioba, 1, tce == 0);
>       if (ret != H_SUCCESS)
>               return ret;
>  
> @@ -480,7 +526,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
>       if (tce_list & (SZ_4K - 1))
>               return H_PARAMETER;
>  
> -     ret = kvmppc_ioba_validate(stt, ioba, npages);
> +     ret = kvmppc_rm_ioba_validate(stt, ioba, npages, false);
>       if (ret != H_SUCCESS)
>               return ret;
>  
> @@ -583,7 +629,7 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu,
>       if (!stt)
>               return H_TOO_HARD;
>  
> -     ret = kvmppc_ioba_validate(stt, ioba, npages);
> +     ret = kvmppc_rm_ioba_validate(stt, ioba, npages, tce_value == 0);
>       if (ret != H_SUCCESS)
>               return ret;
>  
> @@ -635,6 +681,10 @@ long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned 
> long liobn,
>  
>       idx = (ioba >> stt->page_shift) - stt->offset;
>       page = stt->pages[idx / TCES_PER_PAGE];
> +     if (!page) {
> +             vcpu->arch.regs.gpr[4] = 0;
> +             return H_SUCCESS;
> +     }
>       tbl = (u64 *)page_address(page);
>  
>       vcpu->arch.regs.gpr[4] = tbl[idx % TCES_PER_PAGE];

-- 
David Gibson                    | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
                                | _way_ _around_!
http://www.ozlabs.org/~dgibson

Attachment: signature.asc
Description: PGP signature

Reply via email to