On Fri, Dec 09, 2016 at 09:48:32AM -0700, Jan Beulich wrote:
> >>> On 30.11.16 at 17:49, <roger....@citrix.com> wrote:
> > @@ -302,7 +307,8 @@ static unsigned long __init compute_dom0_nr_pages(
> >          avail -= max_pdx >> s;
> >      }
> >  
> > -    need_paging = opt_dom0_shadow || (is_pvh_domain(d) && !iommu_hap_pt_share);
> > +    need_paging = opt_dom0_shadow || (has_hvm_container_domain(d) &&
> > +                  (!iommu_hap_pt_share || !paging_mode_hap(d)));
> 
> Indentation.

Fixed.

> > @@ -545,11 +552,12 @@ static __init void pvh_map_all_iomem(struct domain *d, unsigned long nr_pages)
> >          ASSERT(nr_holes == 0);
> >  }
> >  
> > -static __init void pvh_setup_e820(struct domain *d, unsigned long nr_pages)
> > +static __init void hvm_setup_e820(struct domain *d, unsigned long nr_pages)
> 
> Why?

So that afterwards I can remove all the pvh_ functions and leave only
the hvm_ ones. But seeing your response to the other patch, would you
prefer me to just use pvh_ for the new functions also?

> > @@ -577,8 +585,19 @@ static __init void pvh_setup_e820(struct domain *d, unsigned long nr_pages)
> >              continue;
> >          }
> >  
> > -        *entry_guest = *entry;
> > -        pages = PFN_UP(entry_guest->size);
> > +        /*
> > +         * Make sure the start and length are aligned to PAGE_SIZE, because
> > +         * that's the minimum granularity of the 2nd stage translation.
> > +         */
> > +        start = ROUNDUP(entry->addr, PAGE_SIZE);
> > +        end = (entry->addr + entry->size) & PAGE_MASK;
> 
> Taking the comment into consideration, I wonder whether you
> wouldn't better use PAGE_ORDER_4K here, as that's what the
> p2m code uses.

That's going to be more cumbersome, since PAGE_SIZE would become
1UL << PAGE_ORDER_4K << PAGE_SHIFT, and PAGE_MASK the ~ of that same
construct minus 1. But I see your point, maybe I should define
PAGE_SIZE_4K and PAGE_MASK_4K in xen/include/asm-x86/page.h?
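Just to make that concrete, a rough, untested sketch of what I have in
mind (the _4K names are only my suggestion):

/* 4K granularity of the 2nd stage translation, in p2m terms. */
#define PAGE_SIZE_4K (1UL << PAGE_ORDER_4K << PAGE_SHIFT)
#define PAGE_MASK_4K (~(PAGE_SIZE_4K - 1))

The alignment above would then become:

        start = ROUNDUP(entry->addr, PAGE_SIZE_4K);
        end = (entry->addr + entry->size) & PAGE_MASK_4K;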
> > @@ -1010,8 +1029,6 @@ static int __init construct_dom0_pv(
> >      BUG_ON(d->vcpu[0] == NULL);
> >      BUG_ON(v->is_initialised);
> >  
> > -    process_pending_softirqs();
> 
> Wouldn't this adjustment better fit into the previous patch, together
> with its companion below?

Yes, I guess I must have forgotten to move this.

> > +static int __init hvm_steal_ram(struct domain *d, unsigned long size,
> > +                                paddr_t limit, paddr_t *addr)
> > +{
> > +    unsigned int i = d->arch.nr_e820;
> > +
> > +    while ( i-- )
> > +    {
> > +        struct e820entry *entry = &d->arch.e820[i];
> > +
> > +        if ( entry->type != E820_RAM || entry->size < size )
> > +            continue;
> > +
> > +        /* Subtract from the beginning. */
> > +        if ( entry->addr + size < limit && entry->addr >= MB(1) )
> 
> <= I think (for the left comparison)?

Yes.

> > +static void __init hvm_steal_low_ram(struct domain *d, unsigned long start,
> > +                                     unsigned long nr_pages)
> > +{
> > +    unsigned long mfn;
> > +
> > +    ASSERT(start + nr_pages < PFN_DOWN(MB(1)));
> 
> <= again I think.

Right.

> > +static int __init hvm_setup_p2m(struct domain *d)
> > +{
> > +    struct vcpu *v = d->vcpu[0];
> > +    unsigned long nr_pages;
> > +    unsigned int i;
> > +    int rc;
> > +    bool preempted;
> > +#define MB1_PAGES PFN_DOWN(MB(1))
> > +
> > +    nr_pages = compute_dom0_nr_pages(d, NULL, 0);
> > +
> > +    hvm_setup_e820(d, nr_pages);
> > +    do {
> > +        preempted = false;
> > +        paging_set_allocation(d, dom0_paging_pages(d, nr_pages),
> > +                              &preempted);
> > +        process_pending_softirqs();
> > +    } while ( preempted );
> > +
> > +    /*
> > +     * Memory below 1MB is identity mapped.
> > +     * NB: this only makes sense when booted from legacy BIOS.
> > +     */
> > +    rc = modify_identity_mmio(d, 0, PFN_DOWN(MB(1)), true);
> > +    if ( rc )
> > +    {
> > +        printk("Failed to identity map low 1MB: %d\n", rc);
> > +        return rc;
> > +    }
> > +
> > +    /* Populate memory map. */
> > +    for ( i = 0; i < d->arch.nr_e820; i++ )
> > +    {
> > +        unsigned long addr, size;
> > +
> > +        if ( d->arch.e820[i].type != E820_RAM )
> > +            continue;
> > +
> > +        addr = PFN_DOWN(d->arch.e820[i].addr);
> > +        size = PFN_DOWN(d->arch.e820[i].size);
> > +
> > +        if ( addr >= MB1_PAGES )
> > +            rc = hvm_populate_memory_range(d, addr, size);
> > +        else if ( addr + size > MB1_PAGES )
> > +        {
> > +            hvm_steal_low_ram(d, addr, MB1_PAGES - addr);
> > +            rc = hvm_populate_memory_range(d, MB1_PAGES,
> > +                                           size - (MB1_PAGES - addr));
> 
> Is this case possible at all? All x86 systems have some form of
> BIOS right below the 1Mb boundary, and the E820 map for
> Dom0 is being derived from the host one.

Heh, I don't think so, but I wanted to cover all possible inputs. TBH,
I have no idea how broken e820 memory maps can really be. Would you be
fine with removing this case and adding an ASSERT instead?
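Something along these lines (an untested sketch; it assumes the final
else branch, not quoted above, simply hands RAM regions fully below
1MB to hvm_steal_low_ram):

        if ( addr >= MB1_PAGES )
            rc = hvm_populate_memory_range(d, addr, size);
        else
        {
            /* RAM regions are not expected to cross the 1MB boundary. */
            ASSERT(addr + size <= MB1_PAGES);
            hvm_steal_low_ram(d, addr, size);
        }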
> > --- a/xen/arch/x86/mm.c
> > +++ b/xen/arch/x86/mm.c
> > @@ -475,6 +475,43 @@ void share_xen_page_with_guest(
> >      spin_unlock(&d->page_alloc_lock);
> >  }
> >  
> > +int unshare_xen_page_with_guest(struct page_info *page, struct domain *d)
> 
> __init
> 
> And once it's __init, it may be possible to simplify it, as you don't
> need to fear races anymore. E.g. you wouldn't need a loop over
> cmpxchg().

Indeed.

> > +{
> > +    unsigned long y, x;
> > +    bool drop_dom_ref;
> > +
> > +    if ( page_get_owner(page) != d || !(page->count_info & PGC_xen_heap) )
> 
> Please don't open code is_xen_heap_page().

Right, I'm not very knowledgeable about the mm functions yet.

> > +        return -EINVAL;
> > +
> > +    spin_lock(&d->page_alloc_lock);
> > +
> > +    /* Drop the page reference so we can change the owner. */
> > +    y = page->count_info;
> > +    do {
> > +        x = y;
> > +        if ( (x & (PGC_count_mask|PGC_allocated)) != (1 | PGC_allocated) )
> > +        {
> > +            spin_unlock(&d->page_alloc_lock);
> > +            return -EINVAL;
> > +        }
> > +        y = cmpxchg(&page->count_info, x, PGC_xen_heap);
> > +    } while ( y != x );
> > +
> > +    /* Remove the page from the list of domain pages. */
> > +    page_list_del(page, &d->xenpage_list);
> > +    drop_dom_ref = (--d->xenheap_pages == 0);
> 
> Aren't you open coding
> 
>     if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
>         put_page(page);
> 
> here (except for the check on the initial value, which could be
> moved up)?

Yes, that's right.

> > +    /* Remove the owner and clear the flags. */
> > +    page_set_owner(page, NULL);
> > +    page->u.inuse.type_info = 0;
> 
> I think you'd better bail early if this is non-zero. Or else please use
> the order used elsewhere (clearing type info, then owner) - while
> it's benign, it avoids someone later wondering whether the order
> is wrong in either place.

It's certainly going to be non-zero, because share_xen_page_with_guest
sets it to:

    page->u.inuse.type_info  = (readonly ? PGT_none : PGT_writable_page);
    page->u.inuse.type_info |= PGT_validated | 1;

I've ended up coding it as:

int __init unshare_xen_page_with_guest(struct page_info *page,
                                       struct domain *d)
{
    if ( page_get_owner(page) != d || !is_xen_heap_page(page) )
        return -EINVAL;

    if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
        put_page(page);

    /* Remove the owner and clear the flags. */
    page->u.inuse.type_info = 0;
    page_set_owner(page, NULL);

    return 0;
}

(Note that put_page does indeed use a cmpxchg, but the benefits of not
open coding it far outweigh the penalty of using cmpxchg IMHO.)

Roger.

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel