Re: [RFC PATCH 07/86] Revert "livepatch,sched: Add livepatch task switching to cond_resched()"

2023-11-07 Thread Ankur Arora


Steven Rostedt  writes:

> On Tue,  7 Nov 2023 13:56:53 -0800
> Ankur Arora  wrote:
>
>> This reverts commit e3ff7c609f39671d1aaff4fb4a8594e14f3e03f8.
>>
>> Note that removing this commit reintroduces "live patches failing to
>> complete within a reasonable amount of time due to CPU-bound kthreads."
>>
>> Unfortunately this fix depends quite critically on PREEMPT_DYNAMIC and
>> existence of cond_resched() so this will need an alternate fix.
>>
>
> Then it would probably be a good idea to Cc the live patching maintainers!

Indeed. Could have sworn that I had. But clearly not.

Apologies and thanks for adding them.

>> Signed-off-by: Ankur Arora 
>> ---
>>  include/linux/livepatch.h   |   1 -
>>  include/linux/livepatch_sched.h |  29 -
>>  include/linux/sched.h   |  20 ++
>>  kernel/livepatch/core.c |   1 -
>>  kernel/livepatch/transition.c   | 107 +---
>>  kernel/sched/core.c |  64 +++
>>  6 files changed, 28 insertions(+), 194 deletions(-)
>>  delete mode 100644 include/linux/livepatch_sched.h
>>
>> diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h
>> index 9b9b38e89563..293e29960c6e 100644
>> --- a/include/linux/livepatch.h
>> +++ b/include/linux/livepatch.h
>> @@ -13,7 +13,6 @@
>>  #include 
>>  #include 
>>  #include 
>> -#include 
>>
>>  #if IS_ENABLED(CONFIG_LIVEPATCH)
>>
>> diff --git a/include/linux/livepatch_sched.h 
>> b/include/linux/livepatch_sched.h
>> deleted file mode 100644
>> index 013794fb5da0..
>> --- a/include/linux/livepatch_sched.h
>> +++ /dev/null
>> @@ -1,29 +0,0 @@
>> -/* SPDX-License-Identifier: GPL-2.0-or-later */
>> -#ifndef _LINUX_LIVEPATCH_SCHED_H_
>> -#define _LINUX_LIVEPATCH_SCHED_H_
>> -
>> -#include 
>> -#include 
>> -
>> -#ifdef CONFIG_LIVEPATCH
>> -
>> -void __klp_sched_try_switch(void);
>> -
>> -#if !defined(CONFIG_PREEMPT_DYNAMIC) || 
>> !defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
>> -
>> -DECLARE_STATIC_KEY_FALSE(klp_sched_try_switch_key);
>> -
>> -static __always_inline void klp_sched_try_switch(void)
>> -{
>> -if (static_branch_unlikely(&klp_sched_try_switch_key))
>> -__klp_sched_try_switch();
>> -}
>> -
>> -#endif /* !CONFIG_PREEMPT_DYNAMIC || !CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */
>> -
>> -#else /* !CONFIG_LIVEPATCH */
>> -static inline void klp_sched_try_switch(void) {}
>> -static inline void __klp_sched_try_switch(void) {}
>> -#endif /* CONFIG_LIVEPATCH */
>> -
>> -#endif /* _LINUX_LIVEPATCH_SCHED_H_ */
>> diff --git a/include/linux/sched.h b/include/linux/sched.h
>> index 5bdf80136e42..c5b0ef1ecfe4 100644
>> --- a/include/linux/sched.h
>> +++ b/include/linux/sched.h
>> @@ -36,7 +36,6 @@
>>  #include 
>>  #include 
>>  #include 
>> -#include 
>>  #include 
>>
>>  /* task_struct member predeclarations (sorted alphabetically): */
>> @@ -2087,9 +2086,6 @@ extern int __cond_resched(void);
>>
>>  #if defined(CONFIG_PREEMPT_DYNAMIC) && 
>> defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
>>
>> -void sched_dynamic_klp_enable(void);
>> -void sched_dynamic_klp_disable(void);
>> -
>>  DECLARE_STATIC_CALL(cond_resched, __cond_resched);
>>
>>  static __always_inline int _cond_resched(void)
>> @@ -2098,7 +2094,6 @@ static __always_inline int _cond_resched(void)
>>  }
>>
>>  #elif defined(CONFIG_PREEMPT_DYNAMIC) && 
>> defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
>> -
>>  extern int dynamic_cond_resched(void);
>>
>>  static __always_inline int _cond_resched(void)
>> @@ -2106,25 +2101,20 @@ static __always_inline int _cond_resched(void)
>>  return dynamic_cond_resched();
>>  }
>>
>> -#else /* !CONFIG_PREEMPTION */
>> +#else
>>
>>  static inline int _cond_resched(void)
>>  {
>> -klp_sched_try_switch();
>>  return __cond_resched();
>>  }
>>
>> -#endif /* PREEMPT_DYNAMIC && CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */
>> +#endif /* CONFIG_PREEMPT_DYNAMIC */
>>
>> -#else /* CONFIG_PREEMPTION && !CONFIG_PREEMPT_DYNAMIC */
>> +#else
>>
>> -static inline int _cond_resched(void)
>> -{
>> -klp_sched_try_switch();
>> -return 0;
>> -}
>> +static inline int _cond_resched(void) { return 0; }
>>
>> -#endif /* !CONFIG_PREEMPTION || CONFIG_PRE

Re: [RFC PATCH 07/86] Revert "livepatch,sched: Add livepatch task switching to cond_resched()"

2023-11-09 Thread Ankur Arora


Josh Poimboeuf  writes:

> On Thu, Nov 09, 2023 at 12:31:47PM -0500, Steven Rostedt wrote:
>> On Thu, 9 Nov 2023 09:26:37 -0800
>> Josh Poimboeuf  wrote:
>>
>> > On Tue, Nov 07, 2023 at 06:16:09PM -0500, Steven Rostedt wrote:
>> > > On Tue,  7 Nov 2023 13:56:53 -0800
>> > > Ankur Arora  wrote:
>> > >
>> > > > This reverts commit e3ff7c609f39671d1aaff4fb4a8594e14f3e03f8.
>> > > >
>> > > > Note that removing this commit reintroduces "live patches failing to
>> > > > complete within a reasonable amount of time due to CPU-bound kthreads."
>> > > >
>> > > > Unfortunately this fix depends quite critically on PREEMPT_DYNAMIC and
>> > > > existence of cond_resched() so this will need an alternate fix.
>> >
>> > We definitely don't want to introduce a regression, something will need
>> > to be figured out before removing cond_resched().
>> >
>> > We could hook into preempt_schedule_irq(), but that wouldn't work for
>> > non-ORC.
>> >
>> > Another option would be to hook into schedule().  Then livepatch could
>> > set TIF_NEED_RESCHED on remaining unpatched tasks.  But again if they go
>> > through the preemption path then we have the same problem for non-ORC.
>> >
>> > Worst case we'll need to sprinkle cond_livepatch() everywhere :-/
>> >
>>
>> I guess I'm not fully understanding what the cond rescheds are for. But
>> would an IPI to all CPUs setting NEED_RESCHED, fix it?

Yeah. We could just temporarily toggle to full preemption, where
NEED_RESCHED_LAZY is always upgraded to NEED_RESCHED, which will
then send IPIs.

> If all livepatch arches had the ORC unwinder, yes.
>
> The problem is that frame pointer (and similar) unwinders can't reliably
> unwind past an interrupt frame.

Ah, I wonder if we could just disable the preempt_schedule_irq() path
temporarily? Hooking into schedule() alongside something like this:

@@ -379,7 +379,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs 
*regs)

 void irqentry_exit_cond_resched(void)
 {
-   if (!preempt_count()) {
+   if (klp_cond_resched_disable() && !preempt_count()) {

The problem would be tasks that don't go through any preemptible
sections.
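
FWIW, what I had in mind for klp_cond_resched_disable() is just a static key
gate, roughly like the sketch below. To be clear, this is a sketch only:
klp_cond_resched_disable() is a name I made up in the hunk above, and the key
name and enable/disable helpers below are likewise assumptions, nothing that
exists in the tree. (The naming is also a bit inverted: the gate returns true
when the irqentry preemption path is allowed, i.e. no transition in flight.)

/* Sketch only. */
DEFINE_STATIC_KEY_FALSE(klp_transition_in_progress);

static __always_inline bool klp_cond_resched_disable(void)
{
        return !static_branch_unlikely(&klp_transition_in_progress);
}

/* Flipped by livepatch around the patch transition: */
static void klp_irqentry_preempt_gate_close(void)
{
        static_branch_enable(&klp_transition_in_progress);
}

static void klp_irqentry_preempt_gate_open(void)
{
        static_branch_disable(&klp_transition_in_progress);
}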

--
ankur



Re: [RFC PATCH 07/86] Revert "livepatch,sched: Add livepatch task switching to cond_resched()"

2023-11-09 Thread Ankur Arora


Josh Poimboeuf  writes:

> On Thu, Nov 09, 2023 at 02:50:48PM -0800, Ankur Arora wrote:
>> >> I guess I'm not fully understanding what the cond rescheds are for. But
>> >> would an IPI to all CPUs setting NEED_RESCHED, fix it?
>>
>> Yeah. We could just temporarily toggle to full preemption, when
>> NEED_RESCHED_LAZY is always upgraded to NEED_RESCHED which will
>> then send IPIs.
>>
>> > If all livepatch arches had the ORC unwinder, yes.
>> >
>> > The problem is that frame pointer (and similar) unwinders can't reliably
>> > unwind past an interrupt frame.
>>
>> Ah, I wonder if we could just disable the preempt_schedule_irq() path
>> temporarily? Hooking into schedule() alongside something like this:
>>
>> @@ -379,7 +379,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs 
>> *regs)
>>
>>  void irqentry_exit_cond_resched(void)
>>  {
>> -   if (!preempt_count()) {
>> +   if (klp_cond_resched_disable() && !preempt_count()) {
>>
>> The problem would be tasks that don't go through any preemptible
>> sections.
>
> Let me back up a bit and explain what klp is trying to do.
>
> When a livepatch is applied, klp needs to unwind all the tasks,
> preferably within a reasonable amount of time.
>
> We can't unwind task A from task B while task A is running, since task A
> could be changing the stack during the unwind.  So task A needs to be
> blocked or asleep.  The only exception to that is if the unwind happens
> in the context of task A itself.

> The problem we were seeing was CPU-bound kthreads (e.g., vhost_worker)
> not getting patched within a reasonable amount of time.  We fixed it by
> hooking the klp unwind into cond_resched() so it can unwind from the
> task itself.

Right, so the task calls schedule() itself via cond_resched() and that
works. If the task schedules out by calling preempt_enable() that
presumably works as well.

So, that leaves two paths where we can't unwind:

 1. a task never entering or leaving preemptible sections
 2. or, a task which gets preempted in irqentry_exit_cond_resched()
   This we could disable temporarily.

> It only worked because we had a non-preempted hook (because non-ORC
> unwinders can't unwind reliably through preemption) which called klp to
> unwind from the context of the task.
>
> Without something to hook into, we have a problem.  We could of course
> hook into schedule(), but if the kthread never calls schedule() from a
> non-preempted context then it still doesn't help.

Yeah agreed. The first one is a problem. And that's a problem with the
removal of cond_resched() generally, because the way to fix case 1 was
typically to add a cond_resched() when softlockups were seen, or during
code review.

--
ankur



Re: vfio-pci: protect remap_pfn_range() from simultaneous calls

2021-02-25 Thread Ankur Arora
Hi Bharat,

Can you test the patch below to see if it works for you?

Also could you add some more detail to your earlier description of
the bug?
In particular, AFAICS you are using ODP (-DPDK?) with multiple
threads touching this region. From your stack, it looks like the
fault was user-space generated, and I'm guessing you were not
using the VFIO_IOMMU_MAP_DMA.

Ankur

-- >8 --

Subject: [PATCH] vfio-pci: protect io_remap_pfn_range() from simultaneous calls

vfio_pci_mmap_fault() maps the complete VMA on fault. With concurrent
faults, this would result in multiple calls to io_remap_pfn_range(),
where it would hit a BUG_ON(!pte_none(*pte)) in remap_pte_range().
(It would also link the same VMA multiple times in vdev->vma_list
but given the BUG_ON that is less serious.)

Normally, however, this won't happen -- at least with vfio_iommu_type1 --
since the VFIO_IOMMU_MAP_DMA path is protected by iommu->lock.

If, however, we are using some kind of parallelization mechanism like
this one with ktask under discussion [1], we would hit this.
Even if we were doing this serially, given that vfio-pci remaps a larger
extent than strictly necessary it should internally enforce coherence of
its data structures.

Handle this by using the VMA's presence in the vdev->vma_list as
indicative of a fully mapped VMA and returning success early to
all but the first VMA fault. Note that this is clearly optimistic given
that the mapping is ongoing, and might mean that the caller sees
more faults until the remap is done.

[1] https://lore.kernel.org/linux-mm/20181105145141.6f9937f6@w520.home/

Signed-off-by: Ankur Arora 
---
 drivers/vfio/pci/vfio_pci.c | 25 -
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 65e7e6b44578..b9f509863db1 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -1573,6 +1573,11 @@ static int __vfio_pci_add_vma(struct vfio_pci_device 
*vdev,
 {
struct vfio_pci_mmap_vma *mmap_vma;
 
+   list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) {
+   if (mmap_vma->vma == vma)
+   return 1;
+   }
+
mmap_vma = kmalloc(sizeof(*mmap_vma), GFP_KERNEL);
if (!mmap_vma)
return -ENOMEM;
@@ -1613,6 +1618,7 @@ static vm_fault_t vfio_pci_mmap_fault(struct vm_fault 
*vmf)
struct vm_area_struct *vma = vmf->vma;
struct vfio_pci_device *vdev = vma->vm_private_data;
vm_fault_t ret = VM_FAULT_NOPAGE;
+   int vma_present;
 
mutex_lock(&vdev->vma_lock);
down_read(&vdev->memory_lock);
@@ -1623,7 +1629,21 @@ static vm_fault_t vfio_pci_mmap_fault(struct vm_fault 
*vmf)
goto up_out;
}
 
-   if (__vfio_pci_add_vma(vdev, vma)) {
+   /*
+* __vfio_pci_add_vma() either adds the vma to the vdev->vma_list
+* (vma_present == 0), or indicates that the vma is already present
+* on the list (vma_present == 1).
+*
+* Overload the meaning of this flag to also imply that the vma is
+* fully mapped. This allows us to serialize the mapping -- ensuring
+* that simultaneous faults will not both try to call
+* io_remap_pfn_range().
+*
+* However, this might mean that callers to which we returned success
+* optimistically will see more faults until the remap is complete.
+*/
+   vma_present = __vfio_pci_add_vma(vdev, vma);
+   if (vma_present < 0) {
ret = VM_FAULT_OOM;
mutex_unlock(&vdev->vma_lock);
goto up_out;
@@ -1631,6 +1651,9 @@ static vm_fault_t vfio_pci_mmap_fault(struct vm_fault 
*vmf)
 
mutex_unlock(&vdev->vma_lock);
 
+   if (vma_present)
+   goto up_out;
+
if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
   vma->vm_end - vma->vm_start, vma->vm_page_prot))
ret = VM_FAULT_SIGBUS;
-- 
2.29.2



Re: [PATCH RFC 10/39] KVM: x86/xen: support upcall vector

2020-12-08 Thread Ankur Arora

On 2020-12-08 8:08 a.m., David Woodhouse wrote:

On Wed, 2020-12-02 at 19:02 +, David Woodhouse wrote:



I feel we could just accommodate it as a subtype in
KVM_XEN_ATTR_TYPE_CALLBACK_VIA.
Don't see the advantage in having another xen attr type.


Yeah, fair enough.


But I kinda have mixed feelings about having the kernel handle the whole
event channel ABI, as opposed to only the ones userspace asked to offload.
It looks a tad unnecessary beyond the added gain for VMMs that then don't
need to care about the internals of event channels. Performance-wise it
wouldn't bring anything better. But maybe the former is reason enough to
consider it.


Yeah, we'll see. Especially when it comes to implementing FIFO event
channels, I'd rather just do it in one place — and if the kernel does
it anyway then it's hardly difficult to hook into that.

But I've been about as coherent as I can be in email, and I think we're
generally aligned on the direction. I'll do some more experiments and
see what I can get working, and what it looks like.



So... I did some more typing, and revived our completely userspace
based implementation of event channels. I wanted to declare that such
was *possible*, and that using the kernel for IPI and VIRQ was just a
very desirable optimisation.

It looks like Linux doesn't use the per-vCPU upcall vector that you
called 'KVM_XEN_CALLBACK_VIA_EVTCHN'. So I'm delivering interrupts via
KVM_INTERRUPT as if they were ExtINT

... except I'm not. Because the kernel really does expect that to be an
ExtINT from a legacy PIC, and kvm_apic_accept_pic_intr() only returns
true if LVT0 is set up for EXTINT and unmasked.

I messed around with this hack and increasingly desperate variations on
the theme (since this one doesn't cause the IRQ window to be opened to
userspace in the first place), but couldn't get anything working:


"Increasingly desperate variations" about sums up my process as well while
trying to get the upcall vector working.



--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -2380,6 +2380,9 @@ int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
 if ((lvt0 & APIC_LVT_MASKED) == 0 &&
 GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT)
 r = 1;
+   /* Shoot me. */
+   if (vcpu->arch.pending_external_vector == 243)
+   r = 1;
 return r;
  }
  


Eventually I resorted to delivering the interrupt through the lapic
*anyway* (through KVM_SIGNAL_MSI with an MSI message constructed for
the appropriate vCPU/vector) and the following hack to auto-EOI:

--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -2416,7 +2419,7 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
  */
  
 apic_clear_irr(vector, apic);

-   if (test_bit(vector, vcpu_to_synic(vcpu)->auto_eoi_bitmap)) {
+   if (vector == 243 || test_bit(vector, 
vcpu_to_synic(vcpu)->auto_eoi_bitmap)) {
 /*
  * For auto-EOI interrupts, there might be another pending
  * interrupt above PPR, so check whether to raise another


That works, and now my guest finishes the SMP bringup (and gets as far
as waiting on the XenStore implementation that I haven't put back yet).

So I think we need at least a tiny amount of support in-kernel for
delivering event channel interrupt vectors, even if we wanted to allow
for a completely userspace implementation.

Unless I'm missing something?


I did use the auto_eoi hack as well. So, yeah, I don't see any way of
getting around this.

Also, IIRC we had eventually gotten rid of the auto_eoi approach
because that wouldn't work with APICv. At that point we resorted to
direct queuing for vectored callbacks which was a hack that I never
grew fond of...
 

I will get on with implementing the in-kernel handling with IRQ routing
entries targeting a given { port, vcpu }. And I'm kind of vacillating
about whether the mode/vector should be separately configured, or
whether they might as well be in the IRQ routing table too, even if
it's kind of redundant because it's specified the same for *every* port
targeting the same vCPU. I *think* I prefer that redundancy over having
a separate configuration mechanism to set the vector for each vCPU. But
we'll see what happens when my fingers do the typing...



Good luck to your fingers!

Ankur


Re: [PATCH RFC 03/39] KVM: x86/xen: register shared_info page

2020-12-01 Thread Ankur Arora

On 2020-12-01 5:07 a.m., David Woodhouse wrote:

On Wed, 2019-02-20 at 20:15 +, Joao Martins wrote:

+static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
+{
+   struct shared_info *shared_info;
+   struct page *page;
+
+   page = gfn_to_page(kvm, gfn);
+   if (is_error_page(page))
+   return -EINVAL;
+
+   kvm->arch.xen.shinfo_addr = gfn;
+
+   shared_info = page_to_virt(page);
+   memset(shared_info, 0, sizeof(struct shared_info));
+   kvm->arch.xen.shinfo = shared_info;
+   return 0;
+}
+


Hm.

How come we get to pin the page and directly dereference it every time,
while kvm_setup_pvclock_page() has to use kvm_write_guest_cached()
instead?


So looking at my WIP trees from the time, this is something that
we went back and forth on as well with using just a pinned page or a
persistent kvm_vcpu_map().

I remember distinguishing shared_info/vcpu_info from kvm_setup_pvclock_page()
because shared_info is created early and is not expected to change during the
lifetime of the guest, which didn't seem true for MSR_KVM_SYSTEM_TIME (or
MSR_KVM_STEAL_TIME), so those would either need a kvm_vcpu_map()/
kvm_vcpu_unmap() dance or some kind of synchronization.

That said, I don't think this code explicitly disallows any updates
to shared_info.



If that was allowed, wouldn't it have been a much simpler fix for
CVE-2019-3016? What am I missing?


Agreed.

Perhaps, Paolo can chime in with why KVM never uses pinned page
and always prefers to do cached mappings instead?



Should I rework these to use kvm_write_guest_cached()?


kvm_vcpu_map() would be better. The event channel logic does RMW operations
on shared_info->vcpu_info.
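
To make that concrete, the kind of RMW I mean would look roughly like the
sketch below. Not code from the series: vcpu_info_gpa is a placeholder for
wherever the vcpu_info address ends up being stored, and the
kvm_vcpu_map()/kvm_vcpu_unmap() signatures may differ across kernel versions.

/* Sketch only: flag an upcall as pending in the guest's vcpu_info. */
static int kvm_xen_mark_upcall_pending(struct kvm_vcpu *vcpu, unsigned int port)
{
        struct kvm_host_map map;
        struct vcpu_info *vi;

        if (kvm_vcpu_map(vcpu, gpa_to_gfn(vcpu->arch.xen.vcpu_info_gpa), &map))
                return -EFAULT;

        vi = map.hva;

        /*
         * Atomic RMW on guest-visible state: select the pending word for
         * this port and flag the upcall. (The matching bit in
         * shared_info->evtchn_pending would be set similarly.)
         */
        set_bit(port / BITS_PER_LONG, (unsigned long *)&vi->evtchn_pending_sel);
        WRITE_ONCE(vi->evtchn_upcall_pending, 1);

        kvm_vcpu_unmap(vcpu, &map, true);
        return 0;
}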


Ankur






Re: [PATCH RFC 02/39] KVM: x86/xen: intercept xen hypercalls if enabled

2020-12-01 Thread Ankur Arora

On 2020-12-01 1:48 a.m., David Woodhouse wrote:

On Wed, 2019-02-20 at 20:15 +, Joao Martins wrote:

Add a new exit reason for emulator to handle Xen hypercalls.
Albeit these are injected only if guest has initialized the Xen
hypercall page


I've reworked this a little.

I didn't like the inconsistency of allowing userspace to provide the
hypercall pages even though the ABI is now defined by the kernel and it
*has* to be VMCALL/VMMCALL.

So I switched it to generate the hypercall page directly from the
kernel, just like we do for the Hyper-V hypercall page.

I introduced a new flag in the xen_hvm_config struct to enable this
behaviour, and advertised it in the KVM_CAP_XEN_HVM return value.

I also added the cpl and support for 6-argument hypercalls, and made it
check the guest RIP when completing the call as discussed (although I
still think that probably ought to be a generic thing).

I adjusted the test case from my version of the patch, and added
support for actually testing the hypercall page MSR.

https://git.infradead.org/users/dwmw2/linux.git/shortlog/refs/heads/xenpv

I'll go through and rebase your patch series at least up to patch 16
and collect them in that tree, then probably post them for real once
I've got everything working locally.


===
 From c037c329c8867b219afe2100e383c62e9db7b06d Mon Sep 17 00:00:00 2001
From: Joao Martins 
Date: Wed, 13 Jun 2018 09:55:44 -0400
Subject: [PATCH] KVM: x86/xen: intercept xen hypercalls if enabled

Add a new exit reason for emulator to handle Xen hypercalls.

Since this means KVM owns the ABI, dispense with the facility for the
VMM to provide its own copy of the hypercall pages; just fill them in
directly using VMCALL/VMMCALL as we do for the Hyper-V hypercall page.

This behaviour is enabled by a new INTERCEPT_HCALL flag in the
KVM_XEN_HVM_CONFIG ioctl structure, and advertised by the same flag
being returned from the KVM_CAP_XEN_HVM check.

Add a test case and shift xen_hvm_config() to the nascent xen.c while
we're at it.

Signed-off-by: Joao Martins 
Signed-off-by: David Woodhouse 
---
  arch/x86/include/asm/kvm_host.h   |   6 +
  arch/x86/kvm/Makefile |   2 +-
  arch/x86/kvm/trace.h  |  36 +
  arch/x86/kvm/x86.c|  46 +++---
  arch/x86/kvm/xen.c| 140 ++
  arch/x86/kvm/xen.h|  21 +++
  include/uapi/linux/kvm.h  |  19 +++
  tools/testing/selftests/kvm/Makefile  |   1 +
  tools/testing/selftests/kvm/lib/kvm_util.c|   1 +
  .../selftests/kvm/x86_64/xen_vmcall_test.c| 123 +++
  10 files changed, 365 insertions(+), 30 deletions(-)
  create mode 100644 arch/x86/kvm/xen.c
  create mode 100644 arch/x86/kvm/xen.h
  create mode 100644 tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c




diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
new file mode 100644
index ..6400a4bc8480
--- /dev/null
+++ b/arch/x86/kvm/xen.c
@@ -0,0 +1,140 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright © 2019 Oracle and/or its affiliates. All rights reserved.
+ * Copyright © 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * KVM Xen emulation
+ */
+
+#include "x86.h"
+#include "xen.h"
+
+#include 
+
+#include 
+
+#include "trace.h"
+
+int kvm_xen_hvm_config(struct kvm_vcpu *vcpu, u64 data)
+   {
+   struct kvm *kvm = vcpu->kvm;
+   u32 page_num = data & ~PAGE_MASK;
+   u64 page_addr = data & PAGE_MASK;
+
+   /*
+* If Xen hypercall intercept is enabled, fill the hypercall
+* page with VMCALL/VMMCALL instructions since that's what
+* we catch. Else the VMM has provided the hypercall pages
+* with instructions of its own choosing, so use those.
+*/
+   if (kvm_xen_hypercall_enabled(kvm)) {
+   u8 instructions[32];
+   int i;
+
+   if (page_num)
+   return 1;
+
+   /* mov imm32, %eax */
+   instructions[0] = 0xb8;
+
+   /* vmcall / vmmcall */
+   kvm_x86_ops.patch_hypercall(vcpu, instructions + 5);
+
+   /* ret */
+   instructions[8] = 0xc3;
+
+   /* int3 to pad */
+   memset(instructions + 9, 0xcc, sizeof(instructions) - 9);
+
+   for (i = 0; i < PAGE_SIZE / sizeof(instructions); i++) {
+   *(u32 *)&instructions[1] = i;
+   if (kvm_vcpu_write_guest(vcpu,
+page_addr + (i * 
sizeof(instructions)),
+instructions, 
sizeof(instructions)))
+   return 1;
+   }


HYPERVISOR_iret isn't supported on 64bit so should be ud2 instead.


Ankur


+   } else {
+   int lm = is_long_mode(vcpu);
+   u8 *blob_

Re: [PATCH RFC 03/39] KVM: x86/xen: register shared_info page

2020-12-01 Thread Ankur Arora

On 2020-12-01 5:26 p.m., David Woodhouse wrote

On Tue, 2020-12-01 at 16:40 -0800, Ankur Arora wrote:

On 2020-12-01 5:07 a.m., David Woodhouse wrote:

On Wed, 2019-02-20 at 20:15 +, Joao Martins wrote:

+static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
+{
+   struct shared_info *shared_info;
+   struct page *page;
+
+   page = gfn_to_page(kvm, gfn);
+   if (is_error_page(page))
+   return -EINVAL;
+
+   kvm->arch.xen.shinfo_addr = gfn;
+
+   shared_info = page_to_virt(page);
+   memset(shared_info, 0, sizeof(struct shared_info));
+   kvm->arch.xen.shinfo = shared_info;
+   return 0;
+}
+


Hm.

How come we get to pin the page and directly dereference it every time,
while kvm_setup_pvclock_page() has to use kvm_write_guest_cached()
instead?


So looking at my WIP trees from the time, this is something that
we went back and forth on as well with using just a pinned page or a
persistent kvm_vcpu_map().


OK, thanks.


I remember distinguishing shared_info/vcpu_info from kvm_setup_pvclock_page()
as shared_info is created early and is not expected to change during the
lifetime of the guest which didn't seem true for MSR_KVM_SYSTEM_TIME (or
MSR_KVM_STEAL_TIME) so that would either need to do a kvm_vcpu_map()
kvm_vcpu_unmap() dance or do some kind of synchronization.

That said, I don't think this code explicitly disallows any updates
to shared_info.


It needs to allow updates as well as disabling the shared_info pages.
We're going to need that to implement SHUTDOWN_soft_reset for kexec.

True.





If that was allowed, wouldn't it have been a much simpler fix for
CVE-2019-3016? What am I missing?


Agreed.

Perhaps, Paolo can chime in with why KVM never uses pinned page
and always prefers to do cached mappings instead?



Should I rework these to use kvm_write_guest_cached()?


kvm_vcpu_map() would be better. The event channel logic does RMW operations
on shared_info->vcpu_info.


I've ported the shared_info/vcpu_info parts and made a test case, and
was going back through to make it use kvm_write_guest_cached(). But I
should probably plug on through the evtchn bits before I do that.

I also don't see much locking on the cache, and can't convince myself
that accessing the shared_info page from multiple CPUs with
kvm_write_guest_cached() or kvm_map_gfn() is sane unless they each have
their own cache.


I think you could get a VCPU specific cache with kvm_vcpu_map().



What I have so far is at

https://git.infradead.org/users/dwmw2/linux.git/shortlog/refs/heads/xenpv


Thanks. Will take a look there.

Ankur



I'll do the event channel support tomorrow and hook it up to my actual
VMM to give it some more serious testing.



Re: [PATCH RFC 02/39] KVM: x86/xen: intercept xen hypercalls if enabled

2020-12-02 Thread Ankur Arora

On 2020-12-02 12:03 a.m., David Woodhouse wrote:

On Tue, 2020-12-01 at 21:19 -0800, Ankur Arora wrote:

+ for (i = 0; i < PAGE_SIZE / sizeof(instructions); i++) {
+ *(u32 *)&instructions[1] = i;
+ if (kvm_vcpu_write_guest(vcpu,
+  page_addr + (i * 
sizeof(instructions)),
+  instructions, 
sizeof(instructions)))
+ return 1;
+ }


HYPERVISOR_iret isn't supported on 64bit so should be ud2 instead.


Yeah, I got part way through typing that part but concluded it probably
wasn't a fast path that absolutely needed to be emulated in the kernel.

The VMM can inject the UD# when it receives the hypercall.


That would work as well but if it's a straight ud2 on the hypercall
page, wouldn't the guest just execute it when/if it does a
HYPERVISOR_iret?
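
What I was picturing is roughly the sketch below -- just your fill loop with
the iret slot special-cased. __HYPERVISOR_iret is 23 in the Xen public ABI,
but that's worth double-checking.

        for (i = 0; i < PAGE_SIZE / sizeof(instructions); i++) {
                if (i == 23 /* __HYPERVISOR_iret */) {
                        /* ud2: the guest takes #UD directly, no exit needed */
                        instructions[0] = 0x0f;
                        instructions[1] = 0x0b;
                        memset(instructions + 2, 0xcc, sizeof(instructions) - 2);
                } else {
                        /* mov imm32, %eax */
                        instructions[0] = 0xb8;
                        *(u32 *)&instructions[1] = i;
                        /* vmcall / vmmcall */
                        kvm_x86_ops.patch_hypercall(vcpu, instructions + 5);
                        /* ret */
                        instructions[8] = 0xc3;
                        /* int3 to pad */
                        memset(instructions + 9, 0xcc, sizeof(instructions) - 9);
                }
                if (kvm_vcpu_write_guest(vcpu,
                                         page_addr + (i * sizeof(instructions)),
                                         instructions, sizeof(instructions)))
                        return 1;
        }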

Ankur




I appreciate it *is* a guest-visible difference, if we're being really
pedantic, but I don't think we were even going to be able to 100% hide
the fact that it's not actually Xen.



Re: [PATCH RFC 03/39] KVM: x86/xen: register shared_info page

2020-12-02 Thread Ankur Arora




On 2020-12-02 4:20 a.m., David Woodhouse wrote:

On Wed, 2020-12-02 at 10:44 +, Joao Martins wrote:

[late response - was on holiday yesterday]

On 12/2/20 12:40 AM, Ankur Arora wrote:

On 2020-12-01 5:07 a.m., David Woodhouse wrote:

On Wed, 2019-02-20 at 20:15 +, Joao Martins wrote:

+static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
+{
+   struct shared_info *shared_info;
+   struct page *page;
+
+   page = gfn_to_page(kvm, gfn);
+   if (is_error_page(page))
+   return -EINVAL;
+
+   kvm->arch.xen.shinfo_addr = gfn;
+
+   shared_info = page_to_virt(page);
+   memset(shared_info, 0, sizeof(struct shared_info));
+   kvm->arch.xen.shinfo = shared_info;
+   return 0;
+}
+


Hm.

How come we get to pin the page and directly dereference it every time,
while kvm_setup_pvclock_page() has to use kvm_write_guest_cached()
instead?


So looking at my WIP trees from the time, this is something that
we went back and forth on as well with using just a pinned page or a
persistent kvm_vcpu_map().

I remember distinguishing shared_info/vcpu_info from kvm_setup_pvclock_page()
as shared_info is created early and is not expected to change during the
lifetime of the guest which didn't seem true for MSR_KVM_SYSTEM_TIME (or
MSR_KVM_STEAL_TIME) so that would either need to do a kvm_vcpu_map()
kvm_vcpu_unmap() dance or do some kind of synchronization.

That said, I don't think this code explicitly disallows any updates
to shared_info.



If that was allowed, wouldn't it have been a much simpler fix for
CVE-2019-3016? What am I missing?


Agreed.

Perhaps, Paolo can chime in with why KVM never uses pinned page
and always prefers to do cached mappings instead?



Part of the CVE fix was to not use cached versions.

It's not a long-term pin of the page, unlike what we try to do here (partly
due to the nature of the pages we are mapping), but we still map the gpa,
RMW the steal time struct, and then unmap the page.

See record_steal_time() -- but more specifically commit b043138246 ("x86/KVM:
Make sure KVM_VCPU_FLUSH_TLB flag is not missed").

But I am not sure it's a good idea to follow the same approach as
record_steal_time(), given that this is a fairly sensitive code path for
event channels.


Right. We definitely need to use atomic RMW operations (like the CVE
fix did) so the page needs to be *mapped*.

My question was about a permanent pinned mapping vs the map/unmap as we
need it that record_steal_time() does.

On IRC, Paolo told me that permanent pinning causes problems for memory
hotplug, and pointed me at the trick we do with an MMU notifier and
kvm_vcpu_reload_apic_access_page().


Okay that answers my question. Thanks for clearing that up.

Not sure of a good place to document this but it would be good to
have this written down somewhere. Maybe kvm_map_gfn()?
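
Maybe something along these lines next to kvm_map_gfn()/kvm_vcpu_map()?
Wording is mine, sketch only:

/*
 * Note on pinning: prefer transient mappings (kvm_vcpu_map() +
 * kvm_vcpu_unmap() around each access) over holding a long-term pin on
 * guest pages.  A permanently pinned page cannot be migrated or offlined,
 * which gets in the way of memory hotplug/unplug.  See how
 * kvm_vcpu_reload_apic_access_page() uses an MMU notifier to track the
 * APIC access page without a permanent pin.
 */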



I'm going to stick with the pinning we have for the moment, and just
fix up the fact that it leaks the pinned pages if the guest sets the
shared_info address more than once.

At some point the apic page MMU notifier thing can be made generic, and
we can use that for this and for KVM steal time too.



Yeah, that's something that'll definitely be good to have.

Ankur


Re: [PATCH RFC 03/39] KVM: x86/xen: register shared_info page

2020-12-02 Thread Ankur Arora

On 2020-12-02 2:44 a.m., Joao Martins wrote:

[late response - was on holiday yesterday]

On 12/2/20 12:40 AM, Ankur Arora wrote:

On 2020-12-01 5:07 a.m., David Woodhouse wrote:

On Wed, 2019-02-20 at 20:15 +, Joao Martins wrote:

+static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
+{
+   struct shared_info *shared_info;
+   struct page *page;
+
+   page = gfn_to_page(kvm, gfn);
+   if (is_error_page(page))
+   return -EINVAL;
+
+   kvm->arch.xen.shinfo_addr = gfn;
+
+   shared_info = page_to_virt(page);
+   memset(shared_info, 0, sizeof(struct shared_info));
+   kvm->arch.xen.shinfo = shared_info;
+   return 0;
+}
+


Hm.

How come we get to pin the page and directly dereference it every time,
while kvm_setup_pvclock_page() has to use kvm_write_guest_cached()
instead?


So looking at my WIP trees from the time, this is something that
we went back and forth on as well with using just a pinned page or a
persistent kvm_vcpu_map().

I remember distinguishing shared_info/vcpu_info from kvm_setup_pvclock_page()
as shared_info is created early and is not expected to change during the
lifetime of the guest which didn't seem true for MSR_KVM_SYSTEM_TIME (or
MSR_KVM_STEAL_TIME) so that would either need to do a kvm_vcpu_map()
kvm_vcpu_unmap() dance or do some kind of synchronization.

That said, I don't think this code explicitly disallows any updates
to shared_info.



If that was allowed, wouldn't it have been a much simpler fix for
CVE-2019-3016? What am I missing?


Agreed.

Perhaps, Paolo can chime in with why KVM never uses pinned page
and always prefers to do cached mappings instead?


Part of the CVE fix was to not use cached versions.

It's not a long-term pin of the page, unlike what we try to do here (partly
due to the nature of the pages we are mapping), but we still map the gpa,
RMW the steal time struct, and then unmap the page.

See record_steal_time() -- but more specifically commit b043138246 ("x86/KVM:
Make sure KVM_VCPU_FLUSH_TLB flag is not missed").

But I am not sure it's a good idea to follow the same approach as
record_steal_time(), given that this is a fairly sensitive code path for
event channels.



Should I rework these to use kvm_write_guest_cached()?


kvm_vcpu_map() would be better. The event channel logic does RMW operations
on shared_info->vcpu_info.


Indeed, yes.

Ankur, IIRC we saw missed event channel notifications when we were using the
{write,read}_cached() version of the patch.

But I can't remember which it was due to -- either the evtchn_pending or the
mask word -- which would make it not inject an upcall.


If memory serves, it was the mask. Though I don't think that we had
kvm_{write,read}_cached in use at that point -- given that they were
definitely not RMW safe.


Ankur



Joao



Re: [PATCH RFC 10/39] KVM: x86/xen: support upcall vector

2020-12-02 Thread Ankur Arora

On 2020-12-02 11:02 a.m., David Woodhouse wrote:

On Wed, 2020-12-02 at 18:34 +, Joao Martins wrote:

On 12/2/20 4:47 PM, David Woodhouse wrote:

On Wed, 2020-12-02 at 13:12 +, Joao Martins wrote:

On 12/2/20 11:17 AM, David Woodhouse wrote:

I might be more inclined to go for a model where the kernel handles the
evtchn_pending/evtchn_mask for us. What would go into the irq routing
table is { vcpu, port# } which get passed to kvm_xen_evtchn_send().


But passing port to the routing and handling the sending of events wouldn't it 
lead to
unnecessary handling of event channels which aren't handled by the kernel, 
compared to
just injecting caring about the upcall?


Well, I'm generally in favour of *not* doing things in the kernel that
don't need to be there.

But if the kernel is going to short-circuit the IPIs and VIRQs, then
it's already going to have to handle the evtchn_pending/evtchn_mask
bitmaps, and actually injecting interrupts.



Right. I was trying to point that out in the discussion we had
on the next patch. But truth be told, it was more about touting the idea
of the kernel knowing whether a given event channel is registered for
userspace handling, rather than fully handling the event channel.

I suppose we are able to provide both options to the VMM anyway,
i.e. 1) letting them handle it entirely in userspace by intercepting
EVTCHNOP_send, or 2) through the irq route if we want the kernel to offload it.


Right. The kernel takes what it knows about and anything else goes up
to userspace.

I do like the way you've handled the vcpu binding in userspace, and the
kernel just knows that a given port goes to a given target CPU.




For the VMM
API I think we should follow the Xen model, mixing the domain-wide and
per-vCPU configuration. It's the best way to faithfully model the
behaviour a true Xen guest would experience.

So KVM_XEN_ATTR_TYPE_CALLBACK_VIA can be used to set one of
  • HVMIRQ_callback_vector, taking a vector#
  • HVMIRQ_callback_gsi for the in-kernel irqchip, taking a GSI#

And *maybe* in a later patch it could also handle
  • HVMIRQ_callback_gsi for split-irqchip, taking an eventfd
  • HVMIRQ_callback_pci_intx, taking an eventfd (or a pair, for EOI?)



Most of the Xen versions we cared about had callback_vector and the
vcpu callback vector (despite Linux not using the latter). But if you're
going back to 3.2 and 4.1 (or certain Windows drivers), I suppose
gsi and pci-intx are must-haves.


Not sure about GSI but PCI-INTX is definitely something I've seen in
active use by customers recently. I think SLES10 will use that.


I feel we could just accommodate it as a subtype in
KVM_XEN_ATTR_TYPE_CALLBACK_VIA.
Don't see the advantage in having another xen attr type.


Yeah, fair enough.


But I kinda have mixed feelings about having the kernel handle the whole
event channel ABI, as opposed to only the ones userspace asked to offload.
It looks a tad unnecessary beyond the added gain for VMMs that then don't
need to care about the internals of event channels. Performance-wise it
wouldn't bring anything better. But maybe the former is reason enough to
consider it.


Yeah, we'll see. Especially when it comes to implementing FIFO event
channels, I'd rather just do it in one place — and if the kernel does
it anyway then it's hardly difficult to hook into that.


Sorry I'm late to this conversation. Not a whole lot to add to what Joao
said. I would only differ with him on how much to offload.

Given that we need the fast path in the kernel anyway, I think it's simpler
to do all of the event-channel bitmap handling in the kernel (rough sketch
below). This would also simplify using the kernel Xen drivers if someone
eventually decides to use them.
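
Rough sketch of what I mean -- the shinfo/vi pointers and the helper name
are placeholders, the bitmap layout is just the guest-visible Xen 2-level
shared_info/vcpu_info ABI:

/*
 * Sketch only: 2-level event channel delivery, as the in-kernel fast path
 * would do it.  Returns true if the caller should inject the upcall
 * (callback vector/GSI) into the target vCPU.
 */
static bool xen_evtchn_2l_deliver(struct shared_info *shinfo,
                                  struct vcpu_info *vi, unsigned int port)
{
        /* Mark the port pending in the domain-wide bitmap. */
        set_bit(port, (unsigned long *)shinfo->evtchn_pending);

        /* Masked ports never raise an upcall. */
        if (test_bit(port, (unsigned long *)shinfo->evtchn_mask))
                return false;

        /* Point the vCPU at the pending word... */
        set_bit(port / BITS_PER_LONG, (unsigned long *)&vi->evtchn_pending_sel);

        /* ...and flag the upcall, unless one is already pending. */
        if (xchg(&vi->evtchn_upcall_pending, 1))
                return false;

        return true;
}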


Ankur



But I've been about as coherent as I can be in email, and I think we're
generally aligned on the direction. I'll do some more experiments and
see what I can get working, and what it looks like.

I'm focusing on making the shinfo stuff all use kvm_map_gfn() first.



Re: vfio-pci: protect remap_pfn_range() from simultaneous calls

2021-01-20 Thread Ankur Arora
Hi Bharat,

So I don't have a final patch for you, but can you test the one below
the scissors mark? (The patch is correct, but I'm not happy that it
introduces a new lock.)

Ankur

-- >8 --

Date: Thu, 21 Jan 2021 00:04:36 +
Subject: vfio-pci: protect io_remap_pfn_range() from simultaneous calls

A fix for CVE-2020-12888 changed the mmap to be dynamic so it gets
zapped out and faulted back in based on the state of the PCI_COMMAND
register.

The original code flow was:

  vfio_iommu_type1::vfio_device_mmap()
vfio_pci::vfio_pci_mmap()
  remap_pfn_range(vma->vm_start .. vma->vm_end);

  vfio_iommu_type1::vfio_iommu_type1_ioctl(VFIO_PIN_MAP_DMA)
get_user_pages_remote()
iommu_map()

Which became:

  vfio_iommu_type1::vfio_device_mmap()
vfio_pci::vfio_pci_mmap()
 /* Setup vm->vm_ops */

  vfio_iommu_type1::vfio_iommu_type1_ioctl(VFIO_PIN_MAP_DMA)
get_user_pages_remote()
  follow_fault_pfn(vma, vaddr); /* For missing pages */
fixup_user_fault()
  handle_mm_fault()
vfio_pci::vfio_pci_mmap_fault()
  io_remap_pfn_range(vma->vm_start .. vma->vm_end);
iommu_map()

With this, simultaneous calls to VFIO_PIN_MAP_DMA for an overlapping
region, would mean potentially multiple calls to io_remap_pfn_range()
-- each of which try to remap the full extent.

This ends up hitting BUG_ON(!pte_none(*pte)) in remap_pte_range()
because we are mapping an already mapped pte.

The fix is to elide calls to io_remap_pfn_range() if the VMA is already
mapped.

Fixes: abafbc551fdd ("vfio-pci: Invalidate mmaps and block MMIO access
on disabled memory")

Signed-off-by: Ankur Arora 
---
 drivers/vfio/pci/vfio_pci.c | 48 ++---
 drivers/vfio/pci/vfio_pci_private.h |  2 ++
 2 files changed, 47 insertions(+), 3 deletions(-)

diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 706de3ef94bb..db7a2a716f51 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -64,6 +64,11 @@ static bool disable_denylist;
 module_param(disable_denylist, bool, 0444);
 MODULE_PARM_DESC(disable_denylist, "Disable use of device denylist. Disabling 
the denylist allows binding to devices with known errata that may lead to 
exploitable stability or security issues when accessed by untrusted users.");
 
+struct vdev_vma_priv {
+   struct vfio_pci_device *vdev;
+   bool vma_mapped;
+};
+
 static inline bool vfio_vga_disabled(void)
 {
 #ifdef CONFIG_VFIO_PCI_VGA
@@ -1527,15 +1532,20 @@ static int vfio_pci_zap_and_vma_lock(struct 
vfio_pci_device *vdev, bool try)
list_for_each_entry_safe(mmap_vma, tmp,
 &vdev->vma_list, vma_next) {
struct vm_area_struct *vma = mmap_vma->vma;
+   struct vdev_vma_priv *p;
 
if (vma->vm_mm != mm)
continue;
 
list_del(&mmap_vma->vma_next);
kfree(mmap_vma);
+   p = vma->vm_private_data;
 
+   mutex_lock(&vdev->map_lock);
zap_vma_ptes(vma, vma->vm_start,
 vma->vm_end - vma->vm_start);
+   p->vma_mapped = false;
+   mutex_unlock(&vdev->map_lock);
}
mutex_unlock(&vdev->vma_lock);
mmap_read_unlock(mm);
@@ -1591,12 +1601,19 @@ static int __vfio_pci_add_vma(struct vfio_pci_device 
*vdev,
  */
 static void vfio_pci_mmap_open(struct vm_area_struct *vma)
 {
+   struct vdev_vma_priv *p = vma->vm_private_data;
+   struct vfio_pci_device *vdev = p->vdev;
+
+   mutex_lock(&vdev->map_lock);
+   p->vma_mapped = false;
zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
+   mutex_unlock(&vdev->map_lock);
 }
 
 static void vfio_pci_mmap_close(struct vm_area_struct *vma)
 {
-   struct vfio_pci_device *vdev = vma->vm_private_data;
+   struct vdev_vma_priv *p = vma->vm_private_data;
+   struct vfio_pci_device *vdev = p->vdev;
struct vfio_pci_mmap_vma *mmap_vma;
 
mutex_lock(&vdev->vma_lock);
@@ -1604,6 +1621,7 @@ static void vfio_pci_mmap_close(struct vm_area_struct 
*vma)
if (mmap_vma->vma == vma) {
list_del(&mmap_vma->vma_next);
kfree(mmap_vma);
+   kfree(p);
break;
}
}
@@ -1613,7 +1631,8 @@ static void vfio_pci_mmap_close(struct vm_area_struct 
*vma)
 static vm_fault_t vfio_pci_mmap_fault(struct vm_fault *vmf)
 {
struct vm_area_struct *vma = vmf->vma;
-   struct vfio_pci_device *vdev = vma->vm_private_data;
+   struct vdev_vma_priv

Re: [EXT] Re: vfio-pci: protect remap_pfn_range() from simultaneous calls

2021-03-07 Thread Ankur Arora

On 2021-03-02 4:47 a.m., Bharat Bhushan wrote:

Hi Ankur,


-Original Message-
From: Ankur Arora 
Sent: Friday, February 26, 2021 6:24 AM
To: Bharat Bhushan 
Cc: alex.william...@redhat.com; ankur.a.ar...@oracle.com; linux-
ker...@vger.kernel.org; Sunil Kovvuri Goutham ;
termi...@gmail.com
Subject: [EXT] Re: vfio-pci: protect remap_pfn_range() from simultaneous calls

Hi Bharat,

Can you test the patch below to see if it works for you?


Sorry for the late reply, I actually missed this email.

Reproducibility of the issue was low in my test scenario, one out of ~15 runs.
I ran it multiple times, overnight, and observed no issues.


Awesome. Would you mind giving me your Tested-by for the
patch?


Also could you add some more detail to your earlier description of
the bug?


Our test case runs an ODP multi-threaded application, where the parent
process maps the region (yes, it uses MAP_DMA) and then the child processes
access the same region. As a workaround we tried having the parent process
access the region once before creating the other accessor threads, and it
worked as expected.


Thanks for the detail. So if the child processes start early, they might
fault while the VFIO_IOMMU_MAP_DMA is still going on. And, since they
only acquire mmap_lock in RO mode, both paths would end up calling
io_remap_pfn_range() via the fault handler. (A rough userspace sketch of
that access pattern follows below.)
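
For reference, the access pattern I'm picturing is roughly the sketch below.
Userspace sketch only: the vfio device fd, region offset and size are
placeholders (the real test would get them via the usual VFIO region ioctls).

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>

#define NTHREADS        8
#define PAGE_SZ         4096UL

static unsigned char *bar;
static size_t bar_size = 2 * 1024 * 1024;       /* placeholder size */

/* Each thread does a first touch of every page of the shared mapping. */
static void *touch(void *arg)
{
        (void)arg;
        for (size_t off = 0; off < bar_size; off += PAGE_SZ)
                (void)*(volatile unsigned char *)(bar + off);
        return NULL;
}

int main(int argc, char **argv)
{
        pthread_t tids[NTHREADS];
        int device_fd = atoi(argv[1]);  /* vfio device fd, placeholder */
        off_t region_off = 0;           /* BAR region offset, placeholder */

        (void)argc;
        bar = mmap(NULL, bar_size, PROT_READ | PROT_WRITE, MAP_SHARED,
                   device_fd, region_off);
        if (bar == MAP_FAILED) {
                perror("mmap");
                return 1;
        }

        /*
         * All threads fault the same pages concurrently; each fault tries
         * to map the whole VMA via vfio_pci_mmap_fault()/io_remap_pfn_range().
         */
        for (int i = 0; i < NTHREADS; i++)
                pthread_create(&tids[i], NULL, touch, NULL);
        for (int i = 0; i < NTHREADS; i++)
                pthread_join(tids[i], NULL);

        return 0;
}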

Thanks
Ankur



Thanks
-Bharat


In particular, AFAICS you are using ODP (-DPDK?) with multiple
threads touching this region. From your stack, it looks like the
fault was user-space generated, and I'm guessing you were not
using the VFIO_IOMMU_MAP_DMA.

Ankur

-- >8 --

Subject: [PATCH] vfio-pci: protect io_remap_pfn_range() from simultaneous calls

vfio_pci_mmap_fault() maps the complete VMA on fault. With concurrent
faults, this would result in multiple calls to io_remap_pfn_range(),
where it would hit a BUG_ON(!pte_none(*pte)) in remap_pte_range().
(It would also link the same VMA multiple times in vdev->vma_list
but given the BUG_ON that is less serious.)

Normally, however, this won't happen -- at least with vfio_iommu_type1 --
since the VFIO_IOMMU_MAP_DMA path is protected by iommu->lock.

If, however, we are using some kind of parallelization mechanism like
this one with ktask under discussion [1], we would hit this.
Even if we were doing this serially, given that vfio-pci remaps a larger
extent than strictly necessary it should internally enforce coherence of
its data structures.

Handle this by using the VMA's presence in the vdev->vma_list as
indicative of a fully mapped VMA and returning success early to
all but the first VMA fault. Note that this is clearly optimistic given
that the mapping is ongoing, and might mean that the caller sees
more faults until the remap is done.

[1] https://lore.kernel.org/linux-mm/20181105145141.6f9937f6@w520.home/
Signed-off-by: Ankur Arora 
---
  drivers/vfio/pci/vfio_pci.c | 25 -
  1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 65e7e6b44578..b9f509863db1 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -1573,6 +1573,11 @@ static int __vfio_pci_add_vma(struct vfio_pci_device
*vdev,
  {
struct vfio_pci_mmap_vma *mmap_vma;

+   list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) {
+   if (mmap_vma->vma == vma)
+   return 1;
+   }
+
mmap_vma = kmalloc(sizeof(*mmap_vma), GFP_KERNEL);
if (!mmap_vma)
return -ENOMEM;
@@ -1613,6 +1618,7 @@ static vm_fault_t vfio_pci_mmap_fault(struct vm_fault
*vmf)
struct vm_area_struct *vma = vmf->vma;
struct vfio_pci_device *vdev = vma->vm_private_data;
vm_fault_t ret = VM_FAULT_NOPAGE;
+   int vma_present;

mutex_lock(&vdev->vma_lock);
down_read(&vdev->memory_lock);
@@ -1623,7 +1629,21 @@ static vm_fault_t vfio_pci_mmap_fault(struct
vm_fault *vmf)
goto up_out;
}

-   if (__vfio_pci_add_vma(vdev, vma)) {
+   /*
+* __vfio_pci_add_vma() either adds the vma to the vdev->vma_list
+* (vma_present == 0), or indicates that the vma is already present
+* on the list (vma_present == 1).
+*
+* Overload the meaning of this flag to also imply that the vma is
+* fully mapped. This allows us to serialize the mapping -- ensuring
+* that simultaneous faults will not both try to call
+* io_remap_pfn_range().
+*
+* However, this might mean that callers to which we returne

Re: vfio-pci: protect remap_pfn_range() from simultaneous calls

2021-01-06 Thread Ankur Arora

On 2021-01-06 8:17 a.m., Bharat Bhushan wrote:

Hi Ankur,

We are observing below BUG_ON() with latest kernel

[10011.321645] [ cut here ]
[10011.322262] kernel BUG at mm/memory.c:1816!
[10011.323793] Internal error: Oops - BUG: 0 [#1] PREEMPT SMP
[10011.326108] CPU: 2 PID: 1147 Comm: odp_l2fwd Not tainted 
5.4.74-05938-gb9598e49fe61 #15
[10011.328272] Hardware name: Marvell CN106XX board (DT)
[10011.330328] pstate: 8049 (Nzcv daif +PAN -UAO)
[10011.332402] pc : remap_pfn_range+0x1a4/0x260
[10011.334383] lr : remap_pfn_range+0x14c/0x260
[10011.335911] sp : 8000156afc10
[10011.337360] x29: 8000156afc10 x28: ffdffa24
[10011.339671] x27: 00014a241000 x26: 00218200
[10011.341984] x25: 0001489fbe00 x24: 00218204
[10011.344279] x23: 00218204 x22: 00680fc3
[10011.346539] x21: 00218204 x20: 000149d70860
[10011.348846] x19: 0041 x18: 
[10011.351064] x17:  x16: 
[10011.353304] x15:  x14: 
[10011.355519] x13:  x12: 
[10011.357812] x11:  x10: ffdfffe0
[10011.360136] x9 :  x8 : 
[10011.362414] x7 :  x6 : 04218200
[10011.364773] x5 : 0001 x4 : 
[10011.367103] x3 : ffe000328928 x2 : 016800017c240fc3
[10011.369462] x1 :  x0 : ffe000328928
[10011.371694] Call trace:
[10011.373510]  remap_pfn_range+0x1a4/0x260
[10011.375386]  vfio_pci_mmap_fault+0x9c/0x114
[10011.377346]  __do_fault+0x38/0x100
[10011.379253]  __handle_mm_fault+0x81c/0xce4
[10011.381247]  handle_mm_fault+0xb4/0x17c
[10011.383220]  do_page_fault+0x110/0x430
[10011.385188]  do_translation_fault+0x80/0x90
[10011.387069]  do_mem_abort+0x3c/0xa0
[10011.388852]  el0_da+0x20/0x24
[10011.391239] Code: eb1a02ff 5480 f9400362 b4fffe42 (d421)
[10011.393306] ---[ end trace ae8b75b32426d53c ]---
[10011.395140] note: odp_l2fwd[1147] exited with preempt_count 2

This is observed after the patch "vfio-pci: Fault mmaps to enable vma
tracking", where the actual mapping is delayed until page fault.
When the same page is accessed by multiple threads at/around the same time,
on different cores, page faults are taken for the same page on multiple
cores at the same time. One of the fault handlers creates the mapping while
the second handler finds that the page-table mapping already exists, which
leads to the above kernel BUG_ON().


Yeah, that's what my fix addressed as well.



While the article https://lwn.net/Articles/828536/ suggests that you have
already faced and fixed this issue:
"- vfio-pci: protect remap_pfn_range() from simultaneous calls (Ankur
Arora) [Orabug: 31663628] {CVE-2020-12888} {CVE-2020-12888}"

I do not see any patch submitted or under review upstream; hopefully I did
not miss some discussion. Please let us know if you have already submitted
or are planning to submit a fix, or if someone else has fixed the same issue.


No you haven't missed a discussion on this. For upstream this was more of
a theoretical race so I dallied a bit before sending the patch upstream.

I'll submit a patch soon. Also, would you mind if I ask you to run this
failing test before submission?

Thanks
Ankur



Thanks
-Bharat



Re: [PATCH RFC 00/39] x86/KVM: Xen HVM guest support

2019-02-20 Thread Ankur Arora

On 2/20/19 1:09 PM, Paolo Bonzini wrote:

On 20/02/19 21:15, Joao Martins wrote:

  2. PV Driver support (patches 17 - 39)

  We start by redirecting hypercalls from the backend to routines
  which emulate the behaviour that PV backends expect i.e. grant
  table and interdomain events. Next, we add support for late
  initialization of xenbus, followed by implementing
  frontend/backend communication mechanisms (i.e. grant tables and
  interdomain event channels). Finally, introduce xen-shim.ko,
  which will setup a limited Xen environment. This uses the added
  functionality of Xen specific shared memory (grant tables) and
  notifications (event channels).


I am a bit worried by the last patches, they seem really brittle and
prone to breakage.  I don't know Xen well enough to understand if the
lack of support for GNTMAP_host_map is fixable, but if not, you have to
define a completely different hypercall.

I assume you are aware of most of this, but just in case, here's the
flow when a backend driver wants to map a grant-reference in the
host: it allocates an empty struct page (via ballooning) and does a
map_grant_ref(GNTMAP_host_map) hypercall. In response, Xen validates the
grant-reference and maps it onto the address associated with the struct
page.
After this, from the POV of the underlying network/block drivers, these
struct pages can be used as just regular pages.
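
(For reference, the usual backend-side pattern looks roughly like the sketch
below -- using the mainline balloon/grant-table helpers, not code from this
series, with error handling trimmed.)

static struct page *map_one_grant(domid_t otherend, grant_ref_t gref,
                                  grant_handle_t *handle)
{
        struct gnttab_map_grant_ref op;
        struct page *page;

        /* Balloon out a hollow page; its GFN currently maps nothing. */
        if (alloc_xenballooned_pages(1, &page))
                return NULL;

        gnttab_set_map_op(&op, (unsigned long)pfn_to_kaddr(page_to_pfn(page)),
                          GNTMAP_host_map, gref, otherend);

        /* Xen rewrites the mapping under that GFN to point at the grant. */
        if (gnttab_map_refs(&op, NULL, &page, 1) || op.status != GNTST_okay) {
                free_xenballooned_pages(1, &page);
                return NULL;
        }

        *handle = op.handle;
        return page;    /* usable as a regular struct page from here on */
}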

To support this in a KVM environment, where AFAICS no remapping of pages
is possible, the idea was to make minimal changes to the backend drivers
such that map_grant_ref() could just return the PFN from which the
backend could derive the struct page.

To ensure that backends -- when running in this environment -- have been
modified to deal with these new semantics, our map_grant_ref()
implementation explicitly disallows the GNTMAP_host_map flag.

Now if I'm reading you right, you would prefer something more
straightforward -- perhaps similar semantics but a new flag that
makes this behaviour explicit?



Of course, tests are missing.  You should use the
tools/testing/selftests/kvm/ framework, and ideally each patch should
come with coverage for the newly-added code.

Agreed.

Thanks
Ankur



Thanks,

Paolo



Re: [Xen-devel] [PATCH RFC 00/39] x86/KVM: Xen HVM guest support

2019-02-20 Thread Ankur Arora




On 2/20/19 3:39 PM, Marek Marczykowski-Górecki wrote:

On Wed, Feb 20, 2019 at 08:15:30PM +, Joao Martins wrote:

  2. PV Driver support (patches 17 - 39)

  We start by redirecting hypercalls from the backend to routines
  which emulate the behaviour that PV backends expect i.e. grant
  table and interdomain events. Next, we add support for late
  initialization of xenbus, followed by implementing
  frontend/backend communication mechanisms (i.e. grant tables and
  interdomain event channels). Finally, introduce xen-shim.ko,
  which will setup a limited Xen environment. This uses the added
  functionality of Xen specific shared memory (grant tables) and
  notifications (event channels).


Does it mean backends could be run in another guest, similarly as on
real Xen? AFAIK virtio doesn't allow that as virtio backends need

I'm afraid not. For now grant operations (map/unmap) can only be done
by backends to the local KVM instance.

Ankur


arbitrary write access to guest memory. But grant tables provide enough
abstraction to do that safely.






[RFC PATCH 10/16] xen/balloon: support ballooning in xenhost_t

2019-05-09 Thread Ankur Arora
Xen ballooning uses hollow struct pages (with the underlying GFNs being
populated/unpopulated via hypercalls) which are used by the grant logic
to map grants from other domains.

This patch allows the default xenhost to provide an alternate ballooning
allocation mechanism. This is expected to be useful for local xenhosts
(type xenhost_r0) because unlike Xen, where there is an external
hypervisor which can change the memory underneath a GFN, that is not
possible when the hypervisor is running in the same address space
as the entity doing the ballooning.

Co-developed-by: Ankur Arora 
Signed-off-by: Joao Martins 
Signed-off-by: Ankur Arora 
---
 arch/x86/xen/enlighten_hvm.c   |  7 +++
 arch/x86/xen/enlighten_pv.c|  8 
 drivers/xen/balloon.c  | 19 ---
 drivers/xen/grant-table.c  |  4 ++--
 drivers/xen/privcmd.c  |  4 ++--
 drivers/xen/xen-selfballoon.c  |  2 ++
 drivers/xen/xenbus/xenbus_client.c |  6 +++---
 drivers/xen/xlate_mmu.c|  4 ++--
 include/xen/balloon.h  |  4 ++--
 include/xen/xenhost.h  | 19 +++
 10 files changed, 63 insertions(+), 14 deletions(-)

diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c
index efe483ceeb9a..a371bb9ee478 100644
--- a/arch/x86/xen/enlighten_hvm.c
+++ b/arch/x86/xen/enlighten_hvm.c
@@ -130,9 +130,16 @@ xenhost_ops_t xh_hvm_ops = {
.setup_shared_info = xen_hvm_init_shared_info,
.reset_shared_info = xen_hvm_reset_shared_info,
.probe_vcpu_id = xen_hvm_probe_vcpu_id,
+
+   /* We just use the default method of ballooning. */
+   .alloc_ballooned_pages = NULL,
+   .free_ballooned_pages = NULL,
 };
 
 xenhost_ops_t xh_hvm_nested_ops = {
+   /* Nested xenhosts, are disallowed ballooning */
+   .alloc_ballooned_pages = NULL,
+   .free_ballooned_pages = NULL,
 };
 
 static void __init init_hvm_pv_info(void)
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index 77b1a0d4aef2..2e94e02cdbb4 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -1247,11 +1247,19 @@ xenhost_ops_t xh_pv_ops = {
.reset_shared_info = xen_pv_reset_shared_info,
 
.probe_vcpu_id = xen_pv_probe_vcpu_id,
+
+   /* We just use the default method of ballooning. */
+   .alloc_ballooned_pages = NULL,
+   .free_ballooned_pages = NULL,
 };
 
 xenhost_ops_t xh_pv_nested_ops = {
.cpuid_base = xen_pv_nested_cpuid_base,
.setup_hypercall_page = NULL,
+
+   /* Nested xenhosts, are disallowed ballooning */
+   .alloc_ballooned_pages = NULL,
+   .free_ballooned_pages = NULL,
 };
 
 /* First C function to be called on Xen boot */
diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
index 5ef4d6ad920d..08becf574743 100644
--- a/drivers/xen/balloon.c
+++ b/drivers/xen/balloon.c
@@ -63,6 +63,7 @@
 #include 
 
 #include 
+#include 
 #include 
 #include 
 
@@ -583,12 +584,21 @@ static int add_ballooned_pages(int nr_pages)
  * @pages: pages returned
  * @return 0 on success, error otherwise
  */
-int alloc_xenballooned_pages(int nr_pages, struct page **pages)
+int alloc_xenballooned_pages(xenhost_t *xh, int nr_pages, struct page **pages)
 {
int pgno = 0;
struct page *page;
int ret;
 
+   /*
+* xenmem transactions for remote xenhost are disallowed.
+*/
+   if (xh->type == xenhost_r2)
+   return -EINVAL;
+
+   if (xh->ops->alloc_ballooned_pages)
+   return xh->ops->alloc_ballooned_pages(xh, nr_pages, pages);
+
mutex_lock(&balloon_mutex);
 
balloon_stats.target_unpopulated += nr_pages;
@@ -620,7 +630,7 @@ int alloc_xenballooned_pages(int nr_pages, struct page 
**pages)
return 0;
  out_undo:
mutex_unlock(&balloon_mutex);
-   free_xenballooned_pages(pgno, pages);
+   free_xenballooned_pages(xh, pgno, pages);
return ret;
 }
 EXPORT_SYMBOL(alloc_xenballooned_pages);
@@ -630,10 +640,13 @@ EXPORT_SYMBOL(alloc_xenballooned_pages);
  * @nr_pages: Number of pages
  * @pages: pages to return
  */
-void free_xenballooned_pages(int nr_pages, struct page **pages)
+void free_xenballooned_pages(xenhost_t *xh, int nr_pages, struct page **pages)
 {
int i;
 
+   if (xh->ops->free_ballooned_pages)
+   return xh->ops->free_ballooned_pages(xh, nr_pages, pages);
+
mutex_lock(&balloon_mutex);
 
for (i = 0; i < nr_pages; i++) {
diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
index 98af259d0d4f..ec90769907a4 100644
--- a/drivers/xen/grant-table.c
+++ b/drivers/xen/grant-table.c
@@ -804,7 +804,7 @@ int gnttab_alloc_pages(int nr_pages, struct page **pages)
 {
int ret;
 
-   ret = alloc_xenballooned_pages(nr_pages, pages);
+   ret = alloc_xenballooned_pages(xh_default, nr_pages, pages);
if (ret < 0)
 

[RFC PATCH 13/16] drivers/xen: gnttab, evtchn, xenbus API changes

2019-05-09 Thread Ankur Arora
Mechanical changes: most of these calls now take xenhost_t *
as a parameter.

Co-developed-by: Joao Martins 
Signed-off-by: Ankur Arora 
---
 drivers/xen/cpu_hotplug.c | 14 ++---
 drivers/xen/gntalloc.c| 13 
 drivers/xen/gntdev.c  | 16 +++
 drivers/xen/manage.c  | 37 ++-
 drivers/xen/platform-pci.c| 12 +++-
 drivers/xen/sys-hypervisor.c  | 12 
 drivers/xen/xen-balloon.c | 10 +++---
 drivers/xen/xenfs/xenstored.c |  7 ---
 8 files changed, 73 insertions(+), 48 deletions(-)

diff --git a/drivers/xen/cpu_hotplug.c b/drivers/xen/cpu_hotplug.c
index afeb94446d34..4a05bc028956 100644
--- a/drivers/xen/cpu_hotplug.c
+++ b/drivers/xen/cpu_hotplug.c
@@ -31,13 +31,13 @@ static void disable_hotplug_cpu(int cpu)
unlock_device_hotplug();
 }
 
-static int vcpu_online(unsigned int cpu)
+static int vcpu_online(xenhost_t *xh, unsigned int cpu)
 {
int err;
char dir[16], state[16];
 
sprintf(dir, "cpu/%u", cpu);
-   err = xenbus_scanf(xh_default, XBT_NIL, dir, "availability", "%15s", 
state);
+   err = xenbus_scanf(xh, XBT_NIL, dir, "availability", "%15s", state);
if (err != 1) {
if (!xen_initial_domain())
pr_err("Unable to read cpu state\n");
@@ -52,12 +52,12 @@ static int vcpu_online(unsigned int cpu)
pr_err("unknown state(%s) on CPU%d\n", state, cpu);
return -EINVAL;
 }
-static void vcpu_hotplug(unsigned int cpu)
+static void vcpu_hotplug(xenhost_t *xh, unsigned int cpu)
 {
if (!cpu_possible(cpu))
return;
 
-   switch (vcpu_online(cpu)) {
+   switch (vcpu_online(xh, cpu)) {
case 1:
enable_hotplug_cpu(cpu);
break;
@@ -78,7 +78,7 @@ static void handle_vcpu_hotplug_event(struct xenbus_watch 
*watch,
cpustr = strstr(path, "cpu/");
if (cpustr != NULL) {
sscanf(cpustr, "cpu/%u", &cpu);
-   vcpu_hotplug(cpu);
+   vcpu_hotplug(watch->xh, cpu);
}
 }
 
@@ -93,7 +93,7 @@ static int setup_cpu_watcher(struct notifier_block *notifier,
(void)register_xenbus_watch(xh_default, &cpu_watch);
 
for_each_possible_cpu(cpu) {
-   if (vcpu_online(cpu) == 0) {
+   if (vcpu_online(cpu_watch.xh, cpu) == 0) {
(void)cpu_down(cpu);
set_cpu_present(cpu, false);
}
@@ -114,7 +114,7 @@ static int __init setup_vcpu_hotplug_event(void)
 #endif
return -ENODEV;
 
-   register_xenstore_notifier(&xsn_cpu);
+   register_xenstore_notifier(xh_default, &xsn_cpu);
 
return 0;
 }
diff --git a/drivers/xen/gntalloc.c b/drivers/xen/gntalloc.c
index e07823886fa8..a490e4e8c854 100644
--- a/drivers/xen/gntalloc.c
+++ b/drivers/xen/gntalloc.c
@@ -79,6 +79,8 @@ static LIST_HEAD(gref_list);
 static DEFINE_MUTEX(gref_mutex);
 static int gref_size;
 
+static xenhost_t *xh;
+
 struct notify_info {
uint16_t pgoff:12;/* Bits 0-11: Offset of the byte to clear */
uint16_t flags:2; /* Bits 12-13: Unmap notification flags */
@@ -144,7 +146,7 @@ static int add_grefs(struct ioctl_gntalloc_alloc_gref *op,
}
 
/* Grant foreign access to the page. */
-   rc = gnttab_grant_foreign_access(op->domid,
+   rc = gnttab_grant_foreign_access(xh, op->domid,
 xen_page_to_gfn(gref->page),
 readonly);
if (rc < 0)
@@ -196,13 +198,13 @@ static void __del_gref(struct gntalloc_gref *gref)
gref->notify.flags = 0;
 
if (gref->gref_id) {
-   if (gnttab_query_foreign_access(gref->gref_id))
+   if (gnttab_query_foreign_access(xh, gref->gref_id))
return;
 
-   if (!gnttab_end_foreign_access_ref(gref->gref_id, 0))
+   if (!gnttab_end_foreign_access_ref(xh, gref->gref_id, 0))
return;
 
-   gnttab_free_grant_reference(gref->gref_id);
+   gnttab_free_grant_reference(xh, gref->gref_id);
}
 
gref_size--;
@@ -586,6 +588,9 @@ static int __init gntalloc_init(void)
if (!xen_domain())
return -ENODEV;
 
+   /* Limit to default xenhost for now. */
+   xh = xh_default;
+
err = misc_register(&gntalloc_miscdev);
if (err != 0) {
pr_err("Could not register misc gntalloc device\n");
diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
index 0f0c951cd5b1..40a42abe2dd0 100644
--- a/drivers/xen/gntdev.c
+++ b/drivers/xen/gntdev.c
@@ -67,6 +67,8 @@ static atomic_t pa

[RFC PATCH 14/16] xen/blk: gnttab, evtchn, xenbus API changes

2019-05-09 Thread Ankur Arora
For the most part, we now pass xenhost_t * as a parameter.

Co-developed-by: Joao Martins 
Signed-off-by: Ankur Arora 
---
 drivers/block/xen-blkback/blkback.c |  34 +
 drivers/block/xen-blkback/common.h  |   2 +-
 drivers/block/xen-blkback/xenbus.c  |  63 -
 drivers/block/xen-blkfront.c| 103 +++-
 4 files changed, 107 insertions(+), 95 deletions(-)

diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
index 7ad4423c24b8..d366a17a4bd8 100644
--- a/drivers/block/xen-blkback/blkback.c
+++ b/drivers/block/xen-blkback/blkback.c
@@ -142,7 +142,7 @@ static inline bool persistent_gnt_timeout(struct 
persistent_gnt *persistent_gnt)
HZ * xen_blkif_pgrant_timeout);
 }
 
-static inline int get_free_page(struct xen_blkif_ring *ring, struct page 
**page)
+static inline int get_free_page(xenhost_t *xh, struct xen_blkif_ring *ring, 
struct page **page)
 {
unsigned long flags;
 
@@ -150,7 +150,7 @@ static inline int get_free_page(struct xen_blkif_ring 
*ring, struct page **page)
if (list_empty(&ring->free_pages)) {
BUG_ON(ring->free_pages_num != 0);
spin_unlock_irqrestore(&ring->free_pages_lock, flags);
-   return gnttab_alloc_pages(1, page);
+   return gnttab_alloc_pages(xh, 1, page);
}
BUG_ON(ring->free_pages_num == 0);
page[0] = list_first_entry(&ring->free_pages, struct page, lru);
@@ -174,7 +174,7 @@ static inline void put_free_pages(struct xen_blkif_ring 
*ring, struct page **pag
spin_unlock_irqrestore(&ring->free_pages_lock, flags);
 }
 
-static inline void shrink_free_pagepool(struct xen_blkif_ring *ring, int num)
+static inline void shrink_free_pagepool(xenhost_t *xh, struct xen_blkif_ring 
*ring, int num)
 {
/* Remove requested pages in batches of NUM_BATCH_FREE_PAGES */
struct page *page[NUM_BATCH_FREE_PAGES];
@@ -190,14 +190,14 @@ static inline void shrink_free_pagepool(struct 
xen_blkif_ring *ring, int num)
ring->free_pages_num--;
if (++num_pages == NUM_BATCH_FREE_PAGES) {
spin_unlock_irqrestore(&ring->free_pages_lock, flags);
-   gnttab_free_pages(num_pages, page);
+   gnttab_free_pages(xh, num_pages, page);
spin_lock_irqsave(&ring->free_pages_lock, flags);
num_pages = 0;
}
}
spin_unlock_irqrestore(&ring->free_pages_lock, flags);
if (num_pages != 0)
-   gnttab_free_pages(num_pages, page);
+   gnttab_free_pages(xh, num_pages, page);
 }
 
 #define vaddr(page) ((unsigned long)pfn_to_kaddr(page_to_pfn(page)))
@@ -301,8 +301,8 @@ static void put_persistent_gnt(struct xen_blkif_ring *ring,
atomic_dec(&ring->persistent_gnt_in_use);
 }
 
-static void free_persistent_gnts(struct xen_blkif_ring *ring, struct rb_root 
*root,
- unsigned int num)
+static void free_persistent_gnts(xenhost_t *xh, struct xen_blkif_ring *ring,
+   struct rb_root *root, unsigned int num)
 {
struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
@@ -314,6 +314,7 @@ static void free_persistent_gnts(struct xen_blkif_ring 
*ring, struct rb_root *ro
unmap_data.pages = pages;
unmap_data.unmap_ops = unmap;
unmap_data.kunmap_ops = NULL;
+   unmap_data.xh = xh;
 
foreach_grant_safe(persistent_gnt, n, root, node) {
BUG_ON(persistent_gnt->handle ==
@@ -351,10 +352,12 @@ void xen_blkbk_unmap_purged_grants(struct work_struct 
*work)
int segs_to_unmap = 0;
struct xen_blkif_ring *ring = container_of(work, typeof(*ring), 
persistent_purge_work);
struct gntab_unmap_queue_data unmap_data;
+   struct xenbus_device *dev = xen_blkbk_xenbus(ring->blkif->be);
 
unmap_data.pages = pages;
unmap_data.unmap_ops = unmap;
unmap_data.kunmap_ops = NULL;
+   unmap_data.xh = dev->xh;
 
while(!list_empty(&ring->persistent_purge_list)) {
persistent_gnt = list_first_entry(&ring->persistent_purge_list,
@@ -615,6 +618,7 @@ int xen_blkif_schedule(void *arg)
struct xen_vbd *vbd = &blkif->vbd;
unsigned long timeout;
int ret;
+   struct xenbus_device *dev = xen_blkbk_xenbus(blkif->be);
 
set_freezable();
while (!kthread_should_stop()) {
@@ -657,7 +661,7 @@ int xen_blkif_schedule(void *arg)
}
 
/* Shrink if we have more than xen_blkif_max_buffer_pages */
-   shrink_free_pagepool(ring, xen_blkif_max_buffer_pages);
+   shrink_free_pagepool(dev->xh, 

[RFC PATCH 03/16] x86/xen: make hypercall_page generic

2019-05-09 Thread Ankur Arora
Make hypercall_page a generic interface which can be implemented
by other hypervisors. With this change, hypercall_page now points to
the newly introduced xen_hypercall_page which is seeded by Xen, or
to one that is filled in by a different hypervisor.
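For illustration, the indirection this creates could be used by a non-Xen
hypervisor roughly as below. This is only a sketch; other_hypercall_page
and other_hv_init() are placeholders, not part of this patch:

  /* Sketch: a different hypervisor seeding its own page. */
  extern struct hypercall_entry other_hypercall_page[128];  /* placeholder */

  static void __init other_hv_init(void)
  {
          /* ... fill other_hypercall_page[] with hypervisor-specific stubs ... */
          hypercall_page = other_hypercall_page;
  }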

Signed-off-by: Ankur Arora 
---
 arch/x86/include/asm/xen/hypercall.h | 12 +++-
 arch/x86/xen/enlighten.c |  1 +
 arch/x86/xen/enlighten_hvm.c |  3 ++-
 arch/x86/xen/enlighten_pv.c  |  1 +
 arch/x86/xen/enlighten_pvh.c |  3 ++-
 arch/x86/xen/xen-asm_32.S|  2 +-
 arch/x86/xen/xen-asm_64.S|  2 +-
 arch/x86/xen/xen-head.S  |  8 
 8 files changed, 19 insertions(+), 13 deletions(-)

diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index ef05bea7010d..1a3cd6680e6f 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -86,11 +86,13 @@ struct xen_dm_op_buf;
  * there aren't more than 5 arguments...)
  */
 
-extern struct { char _entry[32]; } hypercall_page[];
+struct hypercall_entry { char _entry[32]; };
+extern struct hypercall_entry xen_hypercall_page[128];
+extern struct hypercall_entry *hypercall_page;
 
-#define __HYPERCALL"call hypercall_page+%c[offset]"
+#define __HYPERCALLCALL_NOSPEC
 #define __HYPERCALL_ENTRY(x)   \
-   [offset] "i" (__HYPERVISOR_##x * sizeof(hypercall_page[0]))
+   [thunk_target] "0" (hypercall_page + __HYPERVISOR_##x)
 
 #ifdef CONFIG_X86_32
 #define __HYPERCALL_RETREG "eax"
@@ -116,7 +118,7 @@ extern struct { char _entry[32]; } hypercall_page[];
register unsigned long __arg4 asm(__HYPERCALL_ARG4REG) = __arg4; \
register unsigned long __arg5 asm(__HYPERCALL_ARG5REG) = __arg5;
 
-#define __HYPERCALL_0PARAM "=r" (__res), ASM_CALL_CONSTRAINT
+#define __HYPERCALL_0PARAM "=&r" (__res), ASM_CALL_CONSTRAINT
 #define __HYPERCALL_1PARAM __HYPERCALL_0PARAM, "+r" (__arg1)
 #define __HYPERCALL_2PARAM __HYPERCALL_1PARAM, "+r" (__arg2)
 #define __HYPERCALL_3PARAM __HYPERCALL_2PARAM, "+r" (__arg3)
@@ -208,7 +210,7 @@ xen_single_call(unsigned int call,
 
asm volatile(CALL_NOSPEC
 : __HYPERCALL_5PARAM
-: [thunk_target] "a" (&hypercall_page[call])
+: [thunk_target] "0" (hypercall_page + call)
 : __HYPERCALL_CLOBBER5);
 
return (long)__res;
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 750f46ad018a..e9dc92e79afa 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -20,6 +20,7 @@
 #include "smp.h"
 #include "pmu.h"
 
+struct hypercall_entry *hypercall_page;
 EXPORT_SYMBOL_GPL(hypercall_page);
 
 /*
diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c
index ffc5791675b2..4d85cd2ff261 100644
--- a/arch/x86/xen/enlighten_hvm.c
+++ b/arch/x86/xen/enlighten_hvm.c
@@ -115,8 +115,9 @@ static void __init init_hvm_pv_info(void)
 
pv_info.name = "Xen HVM";
msr = cpuid_ebx(base + 2);
-   pfn = __pa(hypercall_page);
+   pfn = __pa(xen_hypercall_page);
wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
+   hypercall_page = xen_hypercall_page;
}
 
xen_setup_features();
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index a4e04b0cc596..3239e8452ede 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -1217,6 +1217,7 @@ asmlinkage __visible void __init xen_start_kernel(void)
 
if (!xen_start_info)
return;
+   hypercall_page = xen_hypercall_page;
 
xenhost_register(xenhost_r1, &xh_pv_ops);
 
diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c
index c07eba169572..e47866fcb7ea 100644
--- a/arch/x86/xen/enlighten_pvh.c
+++ b/arch/x86/xen/enlighten_pvh.c
@@ -46,8 +46,9 @@ void __init xen_pvh_init(void)
xen_start_flags = pvh_start_info.flags;
 
msr = cpuid_ebx(xen_cpuid_base() + 2);
-   pfn = __pa(hypercall_page);
+   pfn = __pa(xen_hypercall_page);
wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
+   hypercall_page = xen_hypercall_page;
 }
 
 void __init mem_map_via_hcall(struct boot_params *boot_params_p)
diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S
index c15db060a242..ee4998055ea9 100644
--- a/arch/x86/xen/xen-asm_32.S
+++ b/arch/x86/xen/xen-asm_32.S
@@ -121,7 +121,7 @@ xen_iret_end_crit:
 
 hyper_iret:
/* put this out of line since its very rarely used */
-   jmp hypercall_page + __HYPERVISOR_iret * 32
+   jmp xen_hypercall_page + __HYPERVISOR_iret * 32
 
.globl xen_iret_start_crit, xen_iret_end_crit
 

[RFC PATCH 16/16] xen/grant-table: host_addr fixup in mapping on xenhost_r0

2019-05-09 Thread Ankur Arora
Xenhost type xenhost_r0 does not support standard GNTTABOP_map_grant_ref
semantics (map a gref onto a specified host_addr). That's because
the hypervisor is local (same address space as the caller of
GNTTABOP_map_grant_ref), there is no external entity that could
map an arbitrary page underneath an arbitrary address.

To handle this, the GNTTABOP_map_grant_ref hypercall on xenhost_r0
treats host_addr as an OUT parameter instead of IN, and expects
gnttab_map_refs() and similar to fix up any state that caches the
pre-hypercall value of host_addr.

Accordingly, gnttab_map_refs() gains two parameters: a fixup function
and a pointer to the cached maps to fix up:
 int gnttab_map_refs(xenhost_t *xh, struct gnttab_map_grant_ref *map_ops,
struct gnttab_map_grant_ref *kmap_ops,
-   struct page **pages, unsigned int count)
+   struct page **pages, gnttab_map_fixup_t map_fixup_fn,
+   void **map_fixup[], unsigned int count)

The reason we use a fixup function and not an additional mapping op
in the xenhost_t is that, depending on the caller, what we are fixing
might be different: blkback and netback, for instance, cache host_addr
via a struct page *, while __xenbus_map_ring() caches a phys_addr.

This patch fixes up xen-blkback and xen-gntdev drivers.

TODO:
  - also rewrite gnttab_batch_map() and __xenbus_map_ring().
  - modify xen-netback, scsiback, pciback etc
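For the __xenbus_map_ring() rewrite mentioned in the TODO, a fixup could
look roughly like the sketch below; the function name is illustrative and
not part of this patch:

  /* Sketch: fixup for a caller that caches a phys_addr. */
  static void xenbus_map_fixup(uint64_t host_addr, void **fixup)
  {
          phys_addr_t *paddr = (phys_addr_t *)fixup;

          /* host_addr is the OUT value chosen by the local hypervisor. */
          *paddr = (phys_addr_t)host_addr;
  }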

Co-developed-by: Joao Martins 
Signed-off-by: Ankur Arora 
---
 drivers/block/xen-blkback/blkback.c | 14 +-
 drivers/xen/gntdev.c|  2 +-
 drivers/xen/grant-table.c   | 20 ++--
 include/xen/grant_table.h   | 11 ++-
 4 files changed, 38 insertions(+), 9 deletions(-)

diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
index d366a17a4bd8..50ce40ba35e5 100644
--- a/drivers/block/xen-blkback/blkback.c
+++ b/drivers/block/xen-blkback/blkback.c
@@ -806,11 +806,18 @@ static void xen_blkbk_unmap(struct xen_blkif_ring *ring,
}
 }
 
+static void blkbk_map_fixup(uint64_t host_addr, void **fixup)
+{
+   struct page **pg = (struct page **)fixup;
+   *pg = virt_to_page(host_addr);
+}
+
 static int xen_blkbk_map(struct xen_blkif_ring *ring,
 struct grant_page *pages[],
 int num, bool ro)
 {
struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+   struct page **map_fixup[BLKIF_MAX_SEGMENTS_PER_REQUEST];
struct page *pages_to_gnt[BLKIF_MAX_SEGMENTS_PER_REQUEST];
struct persistent_gnt *persistent_gnt = NULL;
phys_addr_t addr = 0;
@@ -858,6 +865,9 @@ static int xen_blkbk_map(struct xen_blkif_ring *ring,
gnttab_set_map_op(&map[segs_to_map++], addr,
  flags, pages[i]->gref,
  blkif->domid);
+
+   if (gnttab_map_fixup(dev->xh))
+ map_fixup[i] = &pages[i]->page;
}
map_until = i + 1;
if (segs_to_map == BLKIF_MAX_SEGMENTS_PER_REQUEST)
@@ -865,7 +875,9 @@ static int xen_blkbk_map(struct xen_blkif_ring *ring,
}
 
if (segs_to_map) {
-   ret = gnttab_map_refs(dev->xh, map, NULL, pages_to_gnt, 
segs_to_map);
+   ret = gnttab_map_refs(dev->xh, map, NULL, pages_to_gnt,
+   gnttab_map_fixup(dev->xh) ? blkbk_map_fixup : NULL,
+   (void ***) map_fixup, segs_to_map);
BUG_ON(ret);
}
 
diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
index 40a42abe2dd0..32c6471834ba 100644
--- a/drivers/xen/gntdev.c
+++ b/drivers/xen/gntdev.c
@@ -342,7 +342,7 @@ int gntdev_map_grant_pages(struct gntdev_grant_map *map)
 
pr_debug("map %d+%d\n", map->index, map->count);
err = gnttab_map_refs(xh, map->map_ops, use_ptemod ? map->kmap_ops : 
NULL,
-   map->pages, map->count);
+   map->pages, NULL, NULL, map->count);
if (err)
return err;
 
diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
index 959b81ade113..2f3a0a4a2660 100644
--- a/drivers/xen/grant-table.c
+++ b/drivers/xen/grant-table.c
@@ -1084,7 +1084,8 @@ void gnttab_foreach_grant(struct page **pages,
 
 int gnttab_map_refs(xenhost_t *xh, struct gnttab_map_grant_ref *map_ops,
struct gnttab_map_grant_ref *kmap_ops,
-   struct page **pages, unsigned int count)
+   struct page **pages, gnttab_map_fixup_t map_fixup_fn,
+   void **map_fixup[], unsigned int count)
 {
int i, ret;
 
@@ -1096,12 +1097,19 @@ int gnttab_map_refs(xenhost_t *xh, struct 
gnttab_map_grant_ref *map_ops,

[RFC PATCH 02/16] x86/xen: cpuid support in xenhost_t

2019-05-09 Thread Ankur Arora
xen_cpuid_base() is used to probe and set up features early in a
guest's lifetime.

We want this to behave differently depending on xenhost->type: for
instance, local xenhosts cannot intercept the cpuid instruction at all.

Add op (*cpuid_base)() in xenhost_ops_t.
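For instance, a local (r0) xenhost cannot rely on CPUID at all, so one
possibility is for its op to simply report that there is no leaf and let
the caller obtain the information elsewhere. A sketch only; the function
name is a placeholder and is not added by this patch:

  static uint32_t xen_local_cpuid_base(xenhost_t *xh)
  {
          /* Local xenhost cannot trap CPUID; nothing to report here. */
          return 0;
  }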

Signed-off-by: Ankur Arora 
---
 arch/x86/include/asm/xen/hypervisor.h |  2 +-
 arch/x86/pci/xen.c|  2 +-
 arch/x86/xen/enlighten_hvm.c  |  7 +--
 arch/x86/xen/enlighten_pv.c   | 16 +++-
 arch/x86/xen/enlighten_pvh.c  |  4 
 drivers/tty/hvc/hvc_xen.c |  2 +-
 drivers/xen/grant-table.c |  3 ++-
 drivers/xen/xenbus/xenbus_xs.c|  3 ++-
 include/xen/xenhost.h | 21 +
 9 files changed, 52 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h
index 39171b3646bb..6c4cdcdf997d 100644
--- a/arch/x86/include/asm/xen/hypervisor.h
+++ b/arch/x86/include/asm/xen/hypervisor.h
@@ -53,7 +53,7 @@ static inline bool xen_x2apic_para_available(void)
 #else
 static inline bool xen_x2apic_para_available(void)
 {
-   return (xen_cpuid_base() != 0);
+   return (xen_cpuid_base(NULL) != 0);
 }
 #endif
 
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index 9112d1cb397b..d1a3b9f08289 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -431,7 +431,7 @@ void __init xen_msi_init(void)
 * event channels for MSI handling and instead use regular
 * APIC processing
 */
-   uint32_t eax = cpuid_eax(xen_cpuid_base() + 4);
+   uint32_t eax = cpuid_eax(xenhost_cpuid_base(xh_default) + 4);
 
if (((eax & XEN_HVM_CPUID_X2APIC_VIRT) && x2apic_mode) ||
((eax & XEN_HVM_CPUID_APIC_ACCESS_VIRT) && 
boot_cpu_has(X86_FEATURE_APIC)))
diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c
index 100452f4f44c..ffc5791675b2 100644
--- a/arch/x86/xen/enlighten_hvm.c
+++ b/arch/x86/xen/enlighten_hvm.c
@@ -83,7 +83,10 @@ static void __init xen_hvm_init_mem_mapping(void)
xen_vcpu_info_reset(0);
 }
 
+extern uint32_t xen_pv_cpuid_base(xenhost_t *xh);
+
 xenhost_ops_t xh_hvm_ops = {
+   .cpuid_base = xen_pv_cpuid_base,
 };
 
 xenhost_ops_t xh_hvm_nested_ops = {
@@ -94,7 +97,7 @@ static void __init init_hvm_pv_info(void)
int major, minor;
uint32_t eax, ebx, ecx, edx, base;
 
-   base = xen_cpuid_base();
+   base = xenhost_cpuid_base(xh_default);
eax = cpuid_eax(base + 1);
 
major = eax >> 16;
@@ -250,7 +253,7 @@ static uint32_t __init xen_platform_hvm(void)
if (xen_pv_domain() || xen_nopv)
return 0;
 
-   return xen_cpuid_base();
+   return xenhost_cpuid_base(xh_default);
 }
 
 static __init void xen_hvm_guest_late_init(void)
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index bb6e811c1525..a4e04b0cc596 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -1189,10 +1189,23 @@ static void __init xen_dom0_set_legacy_features(void)
x86_platform.legacy.rtc = 1;
 }
 
+uint32_t xen_pv_cpuid_base(xenhost_t *xh)
+{
+   return hypervisor_cpuid_base("XenVMMXenVMM", 2);
+}
+
+uint32_t xen_pv_nested_cpuid_base(xenhost_t *xh)
+{
+   return hypervisor_cpuid_base("XenVMMXenVMM",
+   2 /* nested specific leaf? */);
+}
+
 xenhost_ops_t xh_pv_ops = {
+   .cpuid_base = xen_pv_cpuid_base,
 };
 
 xenhost_ops_t xh_pv_nested_ops = {
+   .cpuid_base = xen_pv_nested_cpuid_base,
 };
 
 /* First C function to be called on Xen boot */
@@ -1469,7 +1482,8 @@ static int xen_cpu_dead_pv(unsigned int cpu)
 static uint32_t __init xen_platform_pv(void)
 {
if (xen_pv_domain())
-   return xen_cpuid_base();
+   /* xenhost is setup in xen_start_kernel. */
+   return xenhost_cpuid_base(xh_default);
 
return 0;
 }
diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c
index 826c296d27a3..c07eba169572 100644
--- a/arch/x86/xen/enlighten_pvh.c
+++ b/arch/x86/xen/enlighten_pvh.c
@@ -29,6 +29,10 @@ void __init xen_pvh_init(void)
u32 msr;
u64 pfn;
 
+   /*
+* Note: we have already called xen_cpuid_base() in
+* hypervisor_specific_init()
+*/
xenhost_register(xenhost_r1, &xh_hvm_ops);
 
/*
diff --git a/drivers/tty/hvc/hvc_xen.c b/drivers/tty/hvc/hvc_xen.c
index dc43fa96c3de..5e5ca35d7187 100644
--- a/drivers/tty/hvc/hvc_xen.c
+++ b/drivers/tty/hvc/hvc_xen.c
@@ -595,7 +595,7 @@ console_initcall(xen_cons_init);
 #ifdef CONFIG_X86
 static void xen_hvm_early_write(uint32_t vtermno, const char *str, int len)
 {
-   if (xen_cpuid_base())
+   if (xen_cpuid_base(xh_default))
outsb(0xe9, str, len);
 }
 #else
diff --git

[RFC PATCH 00/16] xenhost support

2019-05-09 Thread Ankur Arora
Hi all,

This is an RFC for xenhost support, outlined by Juergen here:
https://lkml.org/lkml/2019/4/8/67.

The high level idea is to provide an abstraction of the Xen
communication interface, as a xenhost_t.

xenhost_t exposes ops for communication between the guest and Xen
(hypercall, cpuid, shared_info/vcpu_info, evtchn, grant-table and, on top
of those, xenbus and ballooning), and these can differ based on the kind
of underlying Xen: regular, local, and nested.

(Since this abstraction is largely about guest -- xenhost communication,
no ops are needed for timer, clock, sched, memory (MMU, P2M), VCPU mgmt.
etc.)
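
For a rough picture, the ops that end up hanging off a xenhost (condensed
from the patches that follow; exact names and signatures may differ) look
something like:

  /* Condensed, illustrative view only. */
  struct xenhost;
  typedef struct xenhost xenhost_t;

  typedef struct xenhost_ops {
          uint32_t (*cpuid_base)(xenhost_t *xh);
          void (*setup_hypercall_page)(xenhost_t *xh);
          void (*setup_shared_info)(xenhost_t *xh);
          void (*reset_shared_info)(xenhost_t *xh);
          void (*probe_vcpu_id)(xenhost_t *xh, int cpu);
          int  (*alloc_ballooned_pages)(xenhost_t *xh, int nr_pages,
                                        struct page **pages);
          void (*free_ballooned_pages)(xenhost_t *xh, int nr_pages,
                                       struct page **pages);
  } xenhost_ops_t;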

Xenhost use-cases:

Regular-Xen: the standard Xen interface presented to a guest,
specifically for communication between Lx-guest and Lx-Xen.

Local-Xen: a Xen-like interface which runs in the same address space as
the guest (dom0). This can act as the default xenhost.

The major ways it differs from a regular Xen interface are in presenting
a different hypercall interface (a call instead of a syscall/vmcall), and
in an inability to do grant-mappings: since local-Xen exists in the same
address space as the guest, there's no way for it to cheaply change the
physical page that a GFN maps to (assuming no P2M tables).

Nested-Xen: this channel is to Xen, one level removed: from L1-guest to
L0-Xen. The use case is that we want L0-dom0-backends to talk to
L1-dom0-frontend drivers, which can then present PV devices that can
in turn be used by the L1-dom0-backend drivers as raw underlying devices.
The interfaces themselves remain broadly similar.

Note: L0-Xen, L1-Xen represent Xen running at that nesting level
and L0-guest, L1-guest represent guests that are children of Xen
at that nesting level. Lx represents any level.

Patches 1-7,
  "x86/xen: add xenhost_t interface"
  "x86/xen: cpuid support in xenhost_t"
  "x86/xen: make hypercall_page generic"
  "x86/xen: hypercall support for xenhost_t"
  "x86/xen: add feature support in xenhost_t"
  "x86/xen: add shared_info support to xenhost_t"
  "x86/xen: make vcpu_info part of xenhost_t"
abstract out interfaces that setup hypercalls/cpuid/shared_info/vcpu_info etc.

Patch 8, "x86/xen: irq/upcall handling with multiple xenhosts"
sets up the upcall and pv_irq ops based on vcpu_info.

Patch 9, "xen/evtchn: support evtchn in xenhost_t" adds xenhost based
evtchn support for evtchn_2l.

Patches 10 and 16, "xen/balloon: support ballooning in xenhost_t" and
"xen/grant-table: host_addr fixup in mapping on xenhost_r0"
implement support from GNTTABOP_map_grant_ref for xenhosts of type
xenhost_r0 (xenhost local.)

Patch 12, "xen/xenbus: support xenbus frontend/backend with xenhost_t"
makes xenbus so that both its frontend and backend can be bootstrapped
separately via separate xenhosts.

Remaining patches, 11, 13, 14, 15:
  "xen/grant-table: make grant-table xenhost aware"
  "drivers/xen: gnttab, evtchn, xenbus API changes"
  "xen/blk: gnttab, evtchn, xenbus API changes"
  "xen/net: gnttab, evtchn, xenbus API changes"
are mostly mechanical changes for APIs that now take xenhost_t *
as parameter.

The code itself is RFC quality, and is mostly meant to get feedback before
proceeding further. Also note that the FIFO logic and some Xen drivers
(input, pciback, scsi etc) are mostly unchanged, so will not build.


Please take a look.

Thanks
Ankur


Ankur Arora (16):

  x86/xen: add xenhost_t interface
  x86/xen: cpuid support in xenhost_t
  x86/xen: make hypercall_page generic
  x86/xen: hypercall support for xenhost_t
  x86/xen: add feature support in xenhost_t
  x86/xen: add shared_info support to xenhost_t
  x86/xen: make vcpu_info part of xenhost_t
  x86/xen: irq/upcall handling with multiple xenhosts
  xen/evtchn: support evtchn in xenhost_t
  xen/balloon: support ballooning in xenhost_t
  xen/grant-table: make grant-table xenhost aware
  xen/xenbus: support xenbus frontend/backend with xenhost_t
  drivers/xen: gnttab, evtchn, xenbus API changes
  xen/blk: gnttab, evtchn, xenbus API changes
  xen/net: gnttab, evtchn, xenbus API changes
  xen/grant-table: host_addr fixup in mapping on xenhost_r0

 arch/x86/include/asm/xen/hypercall.h   | 239 +---
 arch/x86/include/asm/xen/hypervisor.h  |   3 +-
 arch/x86/pci/xen.c |  18 +-
 arch/x86/xen/Makefile  |   3 +-
 arch/x86/xen/enlighten.c   | 101 ++--
 arch/x86/xen/enlighten_hvm.c   | 185 --
 arch/x86/xen/enlighten_pv.c| 144 -
 arch/x86/xen/enlighten_pvh.c   |  25 +-
 arch/x86/xen/grant-table.c |  71 ++-
 arch/x86/xen/irq.c |  75 ++-
 arch/x86/xen/mmu_pv.c  |   6 +-
 arch/x86/xen/p2m.c |  24 +-
 arch/x86/xen/pci-swiotlb-xen.c |   1 +
 arch/x86/xen/s

[RFC PATCH 07/16] x86/xen: make vcpu_info part of xenhost_t

2019-05-09 Thread Ankur Arora
Abstract out xen_vcpu_id probing via (*probe_vcpu_id)(). Once that is
available, the vcpu_info registration happens via the VCPUOP hypercall.

Note that for the nested case, there are two vcpu_ids and two vcpu_info
areas, one each for the default xenhost and the remote xenhost.
The vcpu_info is used via pv_irq_ops and for evtchn signaling.

The other VCPUOP hypercalls are used for management (and scheduling)
which is expected to be done purely in the default hypervisor.
However, scheduling of L1-guest does imply L0-Xen-vcpu_info switching,
which might mean that the remote hypervisor needs some visibility
into related events/hypercalls in the default hypervisor.
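
A rough sketch of the intended per-CPU registration flow; for_each_xenhost()
and xen_vcpu_setup() are as used in the diffs below, while the exact
signature of the probe call is an assumption:

  xenhost_t **xh;
  int cpu = smp_processor_id();

  for_each_xenhost(xh) {
          /* discover this xenhost's vcpu_id for the CPU ... */
          (*xh)->ops->probe_vcpu_id(*xh, cpu);
          /* ... then register vcpu_info via VCPUOP_register_vcpu_info. */
          if (xen_vcpu_setup(*xh, cpu))
                  break;
  }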

TODO:
  - percpu data structures for xen_vcpu

Signed-off-by: Ankur Arora 
---
 arch/x86/xen/enlighten.c | 93 +---
 arch/x86/xen/enlighten_hvm.c | 87 ++
 arch/x86/xen/enlighten_pv.c  | 60 ++---
 arch/x86/xen/enlighten_pvh.c |  3 +-
 arch/x86/xen/irq.c   | 10 ++--
 arch/x86/xen/mmu_pv.c|  6 +--
 arch/x86/xen/pci-swiotlb-xen.c   |  1 +
 arch/x86/xen/setup.c |  1 +
 arch/x86/xen/smp.c   |  9 +++-
 arch/x86/xen/smp_hvm.c   | 17 +++---
 arch/x86/xen/smp_pv.c| 12 ++---
 arch/x86/xen/time.c  | 23 
 arch/x86/xen/xen-ops.h   |  5 +-
 drivers/xen/events/events_base.c | 14 ++---
 drivers/xen/events/events_fifo.c |  2 +-
 drivers/xen/evtchn.c |  2 +-
 drivers/xen/time.c   |  2 +-
 include/xen/xen-ops.h|  7 +--
 include/xen/xenhost.h| 47 
 19 files changed, 240 insertions(+), 161 deletions(-)

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 20e0de82..0dafbbc838ef 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -20,35 +20,6 @@
 #include "smp.h"
 #include "pmu.h"
 
-/*
- * Pointer to the xen_vcpu_info structure or
- * &HYPERVISOR_shared_info->vcpu_info[cpu]. See xen_hvm_init_shared_info
- * and xen_vcpu_setup for details. By default it points to 
share_info->vcpu_info
- * but if the hypervisor supports VCPUOP_register_vcpu_info then it can point
- * to xen_vcpu_info. The pointer is used in __xen_evtchn_do_upcall to
- * acknowledge pending events.
- * Also more subtly it is used by the patched version of irq enable/disable
- * e.g. xen_irq_enable_direct and xen_iret in PV mode.
- *
- * The desire to be able to do those mask/unmask operations as a single
- * instruction by using the per-cpu offset held in %gs is the real reason
- * vcpu info is in a per-cpu pointer and the original reason for this
- * hypercall.
- *
- */
-DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
-
-/*
- * Per CPU pages used if hypervisor supports VCPUOP_register_vcpu_info
- * hypercall. This can be used both in PV and PVHVM mode. The structure
- * overrides the default per_cpu(xen_vcpu, cpu) value.
- */
-DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
-
-/* Linux <-> Xen vCPU id mapping */
-DEFINE_PER_CPU(uint32_t, xen_vcpu_id);
-EXPORT_PER_CPU_SYMBOL(xen_vcpu_id);
-
 enum xen_domain_type xen_domain_type = XEN_NATIVE;
 EXPORT_SYMBOL_GPL(xen_domain_type);
 
@@ -112,12 +83,12 @@ int xen_cpuhp_setup(int (*cpu_up_prepare_cb)(unsigned int),
return rc >= 0 ? 0 : rc;
 }
 
-static int xen_vcpu_setup_restore(int cpu)
+static int xen_vcpu_setup_restore(xenhost_t *xh, int cpu)
 {
int rc = 0;
 
/* Any per_cpu(xen_vcpu) is stale, so reset it */
-   xen_vcpu_info_reset(cpu);
+   xen_vcpu_info_reset(xh, cpu);
 
/*
 * For PVH and PVHVM, setup online VCPUs only. The rest will
@@ -125,7 +96,7 @@ static int xen_vcpu_setup_restore(int cpu)
 */
if (xen_pv_domain() ||
(xen_hvm_domain() && cpu_online(cpu))) {
-   rc = xen_vcpu_setup(cpu);
+   rc = xen_vcpu_setup(xh, cpu);
}
 
return rc;
@@ -138,30 +109,42 @@ static int xen_vcpu_setup_restore(int cpu)
  */
 void xen_vcpu_restore(void)
 {
-   int cpu, rc;
+   int cpu, rc = 0;
 
+   /*
+* VCPU management is primarily the responsibility of xh_default and
+* xh_remote only needs VCPUOP_register_vcpu_info.
+* So, we do VCPUOP_down and VCPUOP_up only on xh_default.
+*
+* (Currently, however, VCPUOP_register_vcpu_info is allowed only
+* on VCPUs that are self or down, so we might need a new model
+* there.)
+*/
for_each_possible_cpu(cpu) {
bool other_cpu = (cpu != smp_processor_id());
bool is_up;
+   xenhost_t **xh;
 
-   if (xen_vcpu_nr(cpu) == XEN_VCPU_ID_INVALID)
+   if (xen_vcpu_nr(xh_default, cpu) == XEN_VCPU_ID_INVALID)
continue;
 
/* Only Xen 4.5 and higher support this. */

[RFC PATCH 08/16] x86/xen: irq/upcall handling with multiple xenhosts

2019-05-09 Thread Ankur Arora
For configurations with multiple xenhosts, we need to handle events
generated from multiple xenhosts.

Having more than one upcall handler might be quite hairy, and it would
be simpler if the callback from L0-Xen could be bounced via L1-Xen.
This will also mean simpler pv_irq_ops code because now the IF flag
maps onto the xh_default->vcpu_info->evtchn_upcall_mask.

However, we still update the xh_remote->vcpu_info->evtchn_upcall_mask
on a best effort basis to minimize unnecessary work in remote xenhost.
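
A minimal sketch of that best-effort masking on an irq-disable path,
following the same per-xenhost pattern xen_restore_fl() uses below:

  xenhost_t **xh;

  preempt_disable();
  for_each_xenhost(xh)
          /* authoritative only for xh_default; best effort for xh_remote */
          (*xh)->xen_vcpu[smp_processor_id()]->evtchn_upcall_mask = 1;
  preempt_enable();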

TODO:
  - direct pv_ops.irq are disabled.

Signed-off-by: Ankur Arora 
---
 arch/x86/xen/Makefile   |  2 +-
 arch/x86/xen/enlighten_pv.c |  4 ++-
 arch/x86/xen/irq.c  | 69 +
 arch/x86/xen/smp_pv.c   | 11 ++
 4 files changed, 70 insertions(+), 16 deletions(-)

diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 564b4dddbc15..3c7056ad3520 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -34,7 +34,7 @@ obj-$(CONFIG_XEN_PV)  += enlighten_pv.o
 obj-$(CONFIG_XEN_PV)   += mmu_pv.o
 obj-$(CONFIG_XEN_PV)   += irq.o
 obj-$(CONFIG_XEN_PV)   += multicalls.o
-obj-$(CONFIG_XEN_PV)   += xen-asm.o
+obj-n  += xen-asm.o
 obj-$(CONFIG_XEN_PV)   += xen-asm_$(BITS).o
 
 obj-$(CONFIG_XEN_PVH)  += enlighten_pvh.o
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index 5f6a1475ec0c..77b1a0d4aef2 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -996,8 +996,9 @@ void __init xen_setup_vcpu_info_placement(void)
 * xen_vcpu_setup managed to place the vcpu_info within the
 * percpu area for all cpus, so make use of it.
 */
+#if 0
+   /* Disable direct access for now. */
if (xen_have_vcpu_info_placement && false) {
-   /* Disable direct access until we have proper pcpu data 
structures. */
pv_ops.irq.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct);
pv_ops.irq.restore_fl =
__PV_IS_CALLEE_SAVE(xen_restore_fl_direct);
@@ -1007,6 +1008,7 @@ void __init xen_setup_vcpu_info_placement(void)
__PV_IS_CALLEE_SAVE(xen_irq_enable_direct);
pv_ops.mmu.read_cr2 = xen_read_cr2_direct;
}
+#endif
 }
 
 static const struct pv_info xen_info __initconst = {
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
index 38ad1a1c4763..f760a6abfb1e 100644
--- a/arch/x86/xen/irq.c
+++ b/arch/x86/xen/irq.c
@@ -19,9 +19,9 @@
  * callback mask. We do this in a very simple manner, by making a call
  * down into Xen. The pending flag will be checked by Xen on return.
  */
-void xen_force_evtchn_callback(void)
+void xen_force_evtchn_callback(xenhost_t *xh)
 {
-   (void)HYPERVISOR_xen_version(0, NULL);
+   (void)hypervisor_xen_version(xh, 0, NULL);
 }
 
 asmlinkage __visible unsigned long xen_save_fl(void)
@@ -29,6 +29,21 @@ asmlinkage __visible unsigned long xen_save_fl(void)
struct vcpu_info *vcpu;
unsigned long flags;
 
+   /*
+* In scenarios with more than one xenhost, the primary xenhost
+* is responsible for all the upcalls, with the remote xenhost
+* bouncing its upcalls through it (see comment in
+* cpu_initialize_context().)
+*
+* To minimize unnecessary upcalls, the remote xenhost still looks at
+* the value of vcpu_info->evtchn_upcall_mask, so we still set and reset
+* that.
+*
+* The fact that the upcall itself is gated by the default xenhost,
+* also helps in simplifying the logic here because we don't have to
+* worry about guaranteeing atomicity with updates to
+* xh_remote->vcpu_info->evtchn_upcall_mask.
+*/
vcpu = xh_default->xen_vcpu[smp_processor_id()];
 
/* flag has opposite sense of mask */
@@ -38,26 +53,34 @@ asmlinkage __visible unsigned long xen_save_fl(void)
   -0 -> 0x
   -1 -> 0x
*/
-   return (-flags) & X86_EFLAGS_IF;
+   return ((-flags) & X86_EFLAGS_IF);
 }
 PV_CALLEE_SAVE_REGS_THUNK(xen_save_fl);
 
 __visible void xen_restore_fl(unsigned long flags)
 {
struct vcpu_info *vcpu;
+   xenhost_t **xh;
 
/* convert from IF type flag */
flags = !(flags & X86_EFLAGS_IF);
 
/* See xen_irq_enable() for why preemption must be disabled. */
preempt_disable();
-   vcpu = xh_default->xen_vcpu[smp_processor_id()];
-   vcpu->evtchn_upcall_mask = flags;
+   for_each_xenhost(xh) {
+   vcpu = (*xh)->xen_vcpu[smp_processor_id()];
+   vcpu->evtchn_upcall_mask = flags;
+   }
 
if (flags == 0) {
barrier(); /* unmask then check (avoid races) */
-   if (unlikely(vcpu->evtchn_upcall_pending))
- 

[RFC PATCH 09/16] xen/evtchn: support evtchn in xenhost_t

2019-05-09 Thread Ankur Arora
Largely mechanical patch that adds a new parameter, xenhost_t *, to the
evtchn interfaces. The evtchn port, instead of being domain unique, is
now scoped to xenhost_t.

As part of upcall handling we now look at all the xenhosts and, for
evtchn_2l, the xenhost's shared_info and vcpu_info. Other than this,
event handling is largely unchanged.

Note that the IPI, timer, VIRQ, FUNCTION, PMU, etc. vectors remain
attached to xh_default. Only interdomain evtchns are allowed on
xh_remote.
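
Conceptually, the upcall path becomes a walk over the registered xenhosts,
roughly as below; the handler name is a placeholder for the evtchn_2l logic
that scans the per-xenhost shared_info/vcpu_info:

  static void xen_evtchn_do_upcall_all(int cpu)
  {
          xenhost_t **xh;

          for_each_xenhost(xh)
                  handle_pending_events(*xh, cpu);        /* placeholder */
  }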

TODO:
  - to minimize the changes, evtchn FIFO is disabled for now.

Signed-off-by: Ankur Arora 
---
 arch/x86/pci/xen.c |  16 +-
 arch/x86/xen/enlighten_hvm.c   |   2 +-
 arch/x86/xen/irq.c |   2 +-
 arch/x86/xen/smp.c |  16 +-
 arch/x86/xen/smp_pv.c  |   4 +-
 arch/x86/xen/time.c|   5 +-
 arch/x86/xen/xen-ops.h |   1 +
 arch/x86/xen/xenhost.c |  16 +
 drivers/block/xen-blkback/xenbus.c |   2 +-
 drivers/block/xen-blkfront.c   |   2 +-
 drivers/input/misc/xen-kbdfront.c  |   2 +-
 drivers/net/xen-netback/interface.c|   8 +-
 drivers/net/xen-netfront.c |   6 +-
 drivers/pci/xen-pcifront.c |   2 +-
 drivers/xen/acpi.c |   2 +
 drivers/xen/balloon.c  |   2 +-
 drivers/xen/events/Makefile|   1 -
 drivers/xen/events/events_2l.c | 188 +-
 drivers/xen/events/events_base.c   | 379 -
 drivers/xen/events/events_fifo.c   |   2 +-
 drivers/xen/events/events_internal.h   |  78 ++---
 drivers/xen/evtchn.c   |  22 +-
 drivers/xen/fallback.c |   1 +
 drivers/xen/gntalloc.c |   8 +-
 drivers/xen/gntdev.c   |   8 +-
 drivers/xen/mcelog.c   |   2 +-
 drivers/xen/pcpu.c |   2 +-
 drivers/xen/preempt.c  |   1 +
 drivers/xen/privcmd.c  |   1 +
 drivers/xen/sys-hypervisor.c   |   2 +-
 drivers/xen/time.c |   2 +-
 drivers/xen/xen-pciback/xenbus.c   |   2 +-
 drivers/xen/xen-scsiback.c |   5 +-
 drivers/xen/xenbus/xenbus_client.c |   2 +-
 drivers/xen/xenbus/xenbus_comms.c  |   6 +-
 drivers/xen/xenbus/xenbus_probe.c  |   1 +
 drivers/xen/xenbus/xenbus_probe_backend.c  |   1 +
 drivers/xen/xenbus/xenbus_probe_frontend.c |   1 +
 drivers/xen/xenbus/xenbus_xs.c |   1 +
 include/xen/events.h   |  45 +--
 include/xen/xenhost.h  |  17 +
 41 files changed, 483 insertions(+), 383 deletions(-)

diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index d1a3b9f08289..9aa591b5fa3b 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -19,6 +19,8 @@
 #include 
 
 #include 
+#include 
+#include 
 
 #include 
 #include 
@@ -46,7 +48,7 @@ static int xen_pcifront_enable_irq(struct pci_dev *dev)
if (gsi < nr_legacy_irqs())
share = 0;
 
-   rc = xen_bind_pirq_gsi_to_irq(gsi, pirq, share, "pcifront");
+   rc = xen_bind_pirq_gsi_to_irq(xh_default, gsi, pirq, share, "pcifront");
if (rc < 0) {
dev_warn(&dev->dev, "Xen PCI: failed to bind GSI%d (PIRQ%d) to 
IRQ: %d\n",
 gsi, pirq, rc);
@@ -96,7 +98,7 @@ static int xen_register_pirq(u32 gsi, int gsi_override, int 
triggering,
if (gsi_override >= 0)
gsi = gsi_override;
 
-   irq = xen_bind_pirq_gsi_to_irq(gsi, map_irq.pirq, shareable, name);
+   irq = xen_bind_pirq_gsi_to_irq(xh_default, gsi, map_irq.pirq, 
shareable, name);
if (irq < 0)
goto out;
 
@@ -180,7 +182,7 @@ static int xen_setup_msi_irqs(struct pci_dev *dev, int 
nvec, int type)
goto error;
i = 0;
for_each_pci_msi_entry(msidesc, dev) {
-   irq = xen_bind_pirq_msi_to_irq(dev, msidesc, v[i],
+   irq = xen_bind_pirq_msi_to_irq(xh_default, dev, msidesc, v[i],
   (type == PCI_CAP_ID_MSI) ? nvec 
: 1,
   (type == PCI_CAP_ID_MSIX) ?
   "pcifront-msi-x" :
@@ -234,7 +236,7 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int 
nvec, int type)
return 1;
 
for_each_pci_msi_entry(msidesc, dev) {
-   pirq = xen_allocate_pirq_msi(dev, msidesc);
+   pirq = xen_allocate_pirq_msi(xh_default, dev, msidesc);
if (pirq < 0) {
irq = -ENODEV;
goto error;
@@ -242,7 +244,7 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int 

[RFC PATCH 12/16] xen/xenbus: support xenbus frontend/backend with xenhost_t

2019-05-09 Thread Ankur Arora
As part of xenbus init, both the frontend and backend interfaces need to
talk on the correct xenbus. This might be a local xenstore (backend) or a
XS_PV/XS_HVM interface (frontend) which needs to talk over xenbus with the
remote xenstored. We bootstrap all of these with evtchn/gfn parameters
from (*setup_xs)().

Given this, we can do appropriate device discovery (in the case of the
frontend) and device connectivity for the backend.
Once done, we stash the xenhost_t * in xen_bus_type, xenbus_device or
xenbus_watch and then the frontend and backend devices implicitly use
the correct interface.

The rest of the patch just changes the interfaces where needed.
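
As a usage sketch, a driver never picks a xenhost explicitly; it just uses
the one stashed in its xenbus_device, in the spirit of the blkback change
below:

  err = xenbus_printf(dev->xh, xbt, dev->nodename, "state", "%d", dev->state);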

Signed-off-by: Ankur Arora 
---
 drivers/block/xen-blkback/blkback.c|  10 +-
 drivers/net/xen-netfront.c |  14 +-
 drivers/pci/xen-pcifront.c |   4 +-
 drivers/xen/cpu_hotplug.c  |   4 +-
 drivers/xen/manage.c   |  28 +--
 drivers/xen/xen-balloon.c  |   8 +-
 drivers/xen/xenbus/xenbus.h|  45 ++--
 drivers/xen/xenbus/xenbus_client.c |  32 +--
 drivers/xen/xenbus/xenbus_comms.c  | 121 +-
 drivers/xen/xenbus/xenbus_dev_backend.c|  30 ++-
 drivers/xen/xenbus/xenbus_dev_frontend.c   |  22 +-
 drivers/xen/xenbus/xenbus_probe.c  | 246 +
 drivers/xen/xenbus/xenbus_probe_backend.c  |  19 +-
 drivers/xen/xenbus/xenbus_probe_frontend.c |  65 +++---
 drivers/xen/xenbus/xenbus_xs.c | 188 +---
 include/xen/xen-ops.h  |   3 +
 include/xen/xenbus.h   |  54 +++--
 include/xen/xenhost.h  |  20 ++
 18 files changed, 536 insertions(+), 377 deletions(-)

diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
index fd1e19f1a49f..7ad4423c24b8 100644
--- a/drivers/block/xen-blkback/blkback.c
+++ b/drivers/block/xen-blkback/blkback.c
@@ -541,12 +541,12 @@ static void xen_vbd_resize(struct xen_blkif *blkif)
pr_info("VBD Resize: new size %llu\n", new_size);
vbd->size = new_size;
 again:
-   err = xenbus_transaction_start(&xbt);
+   err = xenbus_transaction_start(dev->xh, &xbt);
if (err) {
pr_warn("Error starting transaction\n");
return;
}
-   err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
+   err = xenbus_printf(dev->xh, xbt, dev->nodename, "sectors", "%llu",
(unsigned long long)vbd_sz(vbd));
if (err) {
pr_warn("Error writing new size\n");
@@ -557,20 +557,20 @@ static void xen_vbd_resize(struct xen_blkif *blkif)
 * the front-end. If the current state is "connected" the
 * front-end will get the new size information online.
 */
-   err = xenbus_printf(xbt, dev->nodename, "state", "%d", dev->state);
+   err = xenbus_printf(dev->xh, xbt, dev->nodename, "state", "%d", 
dev->state);
if (err) {
pr_warn("Error writing the state\n");
goto abort;
}
 
-   err = xenbus_transaction_end(xbt, 0);
+   err = xenbus_transaction_end(dev->xh, xbt, 0);
if (err == -EAGAIN)
goto again;
if (err)
pr_warn("Error ending transaction\n");
return;
 abort:
-   xenbus_transaction_end(xbt, 1);
+   xenbus_transaction_end(dev->xh, xbt, 1);
 }
 
 /*
diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
index 1cd0a2d2ba54..ee28e8b85406 100644
--- a/drivers/net/xen-netfront.c
+++ b/drivers/net/xen-netfront.c
@@ -1336,9 +1336,9 @@ static struct net_device *xennet_create_dev(struct 
xenbus_device *dev)
 
xenbus_switch_state(dev, XenbusStateInitialising);
wait_event(module_wq,
-  xenbus_read_driver_state(dev->otherend) !=
+  xenbus_read_driver_state(dev, dev->otherend) !=
   XenbusStateClosed &&
-  xenbus_read_driver_state(dev->otherend) !=
+  xenbus_read_driver_state(dev, dev->otherend) !=
   XenbusStateUnknown);
return netdev;
 
@@ -2145,19 +2145,19 @@ static int xennet_remove(struct xenbus_device *dev)
 
dev_dbg(&dev->dev, "%s\n", dev->nodename);
 
-   if (xenbus_read_driver_state(dev->otherend) != XenbusStateClosed) {
+   if (xenbus_read_driver_state(dev, dev->otherend) != XenbusStateClosed) {
xenbus_switch_state(dev, XenbusStateClosing);
wait_event(module_wq,
-  xenbus_read_driver_state(dev->otherend) ==
+  xenbus_read_driver_state(dev, dev->otherend) ==
   XenbusStateClosi

[RFC PATCH 06/16] x86/xen: add shared_info support to xenhost_t

2019-05-09 Thread Ankur Arora
HYPERVISOR_shared_info is used for irq/evtchn communication between the
guest and the host. Abstract out the setup/reset in xenhost_t such that
nested configurations can use both xenhosts simultaneously.

In addition to irq/evtchn communication, shared_info is also used for
pvclock and p2m related state. For both of those, remote xenhost is not
of interest so we only use the default xenhost.
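
The per-xenhost init/reset then reduces to a loop of the form below; the
xenhost_setup_shared_info() wrapper name is assumed by analogy with the
xenhost_reset_shared_info() call used in the diff:

  xenhost_t **xh;

  for_each_xenhost(xh)
          xenhost_setup_shared_info(*xh);   /* later: xenhost_reset_shared_info(*xh) */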

Signed-off-by: Ankur Arora 
---
 arch/x86/include/asm/xen/hypervisor.h |  1 -
 arch/x86/xen/enlighten.c  | 10 ++-
 arch/x86/xen/enlighten_hvm.c  | 38 +-
 arch/x86/xen/enlighten_pv.c   | 28 ---
 arch/x86/xen/p2m.c| 24 -
 arch/x86/xen/suspend_hvm.c|  6 -
 arch/x86/xen/suspend_pv.c | 14 +-
 arch/x86/xen/time.c   |  4 +--
 arch/x86/xen/xen-ops.h|  2 --
 arch/x86/xen/xenhost.c| 13 -
 drivers/xen/events/events_2l.c| 16 +--
 include/xen/xenhost.h | 39 +++
 12 files changed, 138 insertions(+), 57 deletions(-)

diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h
index 6c4cdcdf997d..3e6bd455fbd0 100644
--- a/arch/x86/include/asm/xen/hypervisor.h
+++ b/arch/x86/include/asm/xen/hypervisor.h
@@ -33,7 +33,6 @@
 #ifndef _ASM_X86_XEN_HYPERVISOR_H
 #define _ASM_X86_XEN_HYPERVISOR_H
 
-extern struct shared_info *HYPERVISOR_shared_info;
 extern struct start_info *xen_start_info;
 
 #include 
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index f88bb14da3f2..20e0de82 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -72,12 +72,6 @@ EXPORT_SYMBOL_GPL(xen_have_vector_callback);
 uint32_t xen_start_flags __attribute__((section(".data"))) = 0;
 EXPORT_SYMBOL(xen_start_flags);
 
-/*
- * Point at some empty memory to start with. We map the real shared_info
- * page as soon as fixmap is up and running.
- */
-struct shared_info *HYPERVISOR_shared_info = &xen_dummy_shared_info;
-
 /*
  * Flag to determine whether vcpu info placement is available on all
  * VCPUs.  We assume it is to start with, and then set it to zero on
@@ -187,7 +181,7 @@ void xen_vcpu_info_reset(int cpu)
 {
if (xen_vcpu_nr(cpu) < MAX_VIRT_CPUS) {
per_cpu(xen_vcpu, cpu) =
-   &HYPERVISOR_shared_info->vcpu_info[xen_vcpu_nr(cpu)];
+   
&xh_default->HYPERVISOR_shared_info->vcpu_info[xen_vcpu_nr(cpu)];
} else {
/* Set to NULL so that if somebody accesses it we get an OOPS */
per_cpu(xen_vcpu, cpu) = NULL;
@@ -200,7 +194,7 @@ int xen_vcpu_setup(int cpu)
int err;
struct vcpu_info *vcpup;
 
-   BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
+   BUG_ON(xh_default->HYPERVISOR_shared_info == &xen_dummy_shared_info);
 
/*
 * This path is called on PVHVM at bootup (xen_hvm_smp_prepare_boot_cpu)
diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c
index a118b61a1a8a..0e53363f9d1f 100644
--- a/arch/x86/xen/enlighten_hvm.c
+++ b/arch/x86/xen/enlighten_hvm.c
@@ -26,21 +26,25 @@
 #include "mmu.h"
 #include "smp.h"
 
-static unsigned long shared_info_pfn;
-
-void xen_hvm_init_shared_info(void)
+static void xen_hvm_init_shared_info(xenhost_t *xh)
 {
struct xen_add_to_physmap xatp;
 
xatp.domid = DOMID_SELF;
xatp.idx = 0;
xatp.space = XENMAPSPACE_shared_info;
-   xatp.gpfn = shared_info_pfn;
-   if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
+   xatp.gpfn = xh->shared_info_pfn;
+   if (hypervisor_memory_op(xh, XENMEM_add_to_physmap, &xatp))
BUG();
 }
 
-static void __init reserve_shared_info(void)
+static void xen_hvm_reset_shared_info(xenhost_t *xh)
+{
+   early_memunmap(xh->HYPERVISOR_shared_info, PAGE_SIZE);
+   xh->HYPERVISOR_shared_info = __va(PFN_PHYS(xh->shared_info_pfn));
+}
+
+static void __init reserve_shared_info(xenhost_t *xh)
 {
u64 pa;
 
@@ -58,16 +62,18 @@ static void __init reserve_shared_info(void)
 pa += PAGE_SIZE)
;
 
-   shared_info_pfn = PHYS_PFN(pa);
+   xh->shared_info_pfn = PHYS_PFN(pa);
 
memblock_reserve(pa, PAGE_SIZE);
-   HYPERVISOR_shared_info = early_memremap(pa, PAGE_SIZE);
+   xh->HYPERVISOR_shared_info = early_memremap(pa, PAGE_SIZE);
 }
 
 static void __init xen_hvm_init_mem_mapping(void)
 {
-   early_memunmap(HYPERVISOR_shared_info, PAGE_SIZE);
-   HYPERVISOR_shared_info = __va(PFN_PHYS(shared_info_pfn));
+   xenhost_t **xh;
+
+   for_each_xenhost(xh)
+   xenhost_reset_shared_info(*xh);
 
/*
 * The virtual address of the shared_info page has changed, so
@@ -79,6 +85,7 @@ static void __

[RFC PATCH 01/16] x86/xen: add xenhost_t interface

2019-05-09 Thread Ankur Arora
Add xenhost_t which will serve as an abstraction over Xen interfaces.
It co-exists with the PV/HVM/PVH abstractions (x86_init, hypervisor_x86,
pv_ops etc) and is meant to capture mechanisms for communication with
Xen so we could have different types of underlying Xen: regular, local,
and nested.

Also add xenhost_register() and stub registration in the various guest
types.
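
A minimal sketch of what registration amounts to, given the xenhosts[]
layout added below; the type/ops fields follow later patches, and a
zero-initialized entry standing for "unregistered" is an assumption:

  void xenhost_register(enum xenhost_type type, xenhost_ops_t *ops)
  {
          switch (type) {
          case xenhost_r0:
          case xenhost_r1:
                  BUG_ON(xh_default->ops);        /* register only once */
                  xh_default->type = type;
                  xh_default->ops = ops;
                  break;
          case xenhost_r2:
                  BUG_ON(xh_remote->ops);
                  xh_remote->type = type;
                  xh_remote->ops = ops;
                  break;
          }
  }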

Signed-off-by: Ankur Arora 
---
 arch/x86/xen/Makefile|  1 +
 arch/x86/xen/enlighten_hvm.c | 13 +
 arch/x86/xen/enlighten_pv.c  | 16 ++
 arch/x86/xen/enlighten_pvh.c | 12 +
 arch/x86/xen/xenhost.c   | 75 
 include/xen/xen.h|  3 ++
 include/xen/xenhost.h| 95 
 7 files changed, 215 insertions(+)
 create mode 100644 arch/x86/xen/xenhost.c
 create mode 100644 include/xen/xenhost.h

diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 084de77a109e..564b4dddbc15 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -18,6 +18,7 @@ obj-y += mmu.o
 obj-y  += time.o
 obj-y  += grant-table.o
 obj-y  += suspend.o
+obj-y  += xenhost.o
 
 obj-$(CONFIG_XEN_PVHVM)+= enlighten_hvm.o
 obj-$(CONFIG_XEN_PVHVM)+= mmu_hvm.o
diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c
index 0e75642d42a3..100452f4f44c 100644
--- a/arch/x86/xen/enlighten_hvm.c
+++ b/arch/x86/xen/enlighten_hvm.c
@@ -5,6 +5,7 @@
 #include 
 #include 
 
+#include 
 #include 
 #include 
 #include 
@@ -82,6 +83,12 @@ static void __init xen_hvm_init_mem_mapping(void)
xen_vcpu_info_reset(0);
 }
 
+xenhost_ops_t xh_hvm_ops = {
+};
+
+xenhost_ops_t xh_hvm_nested_ops = {
+};
+
 static void __init init_hvm_pv_info(void)
 {
int major, minor;
@@ -179,6 +186,12 @@ static void __init xen_hvm_guest_init(void)
 {
if (xen_pv_domain())
return;
+   /*
+* We need only xenhost_r1 for HVM guests since they cannot be
+* driver domain (?) or dom0.
+*/
+   if (!xen_pvh_domain())
+   xenhost_register(xenhost_r1, &xh_hvm_ops);
 
init_hvm_pv_info();
 
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index c54a493e139a..bb6e811c1525 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -36,6 +36,7 @@
 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -1188,6 +1189,12 @@ static void __init xen_dom0_set_legacy_features(void)
x86_platform.legacy.rtc = 1;
 }
 
+xenhost_ops_t xh_pv_ops = {
+};
+
+xenhost_ops_t xh_pv_nested_ops = {
+};
+
 /* First C function to be called on Xen boot */
 asmlinkage __visible void __init xen_start_kernel(void)
 {
@@ -1198,6 +1205,15 @@ asmlinkage __visible void __init xen_start_kernel(void)
if (!xen_start_info)
return;
 
+   xenhost_register(xenhost_r1, &xh_pv_ops);
+
+   /*
+* Detect in some implementation defined manner whether this is
+* nested or not.
+*/
+   if (xen_driver_domain() && xen_nested())
+   xenhost_register(xenhost_r2, &xh_pv_nested_ops);
+
xen_domain_type = XEN_PV_DOMAIN;
xen_start_flags = xen_start_info->flags;
 
diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c
index 35b7599d2d0b..826c296d27a3 100644
--- a/arch/x86/xen/enlighten_pvh.c
+++ b/arch/x86/xen/enlighten_pvh.c
@@ -8,6 +8,7 @@
 #include 
 
 #include 
+#include 
 #include 
 #include 
 
@@ -21,11 +22,22 @@
  */
 bool xen_pvh __attribute__((section(".data"))) = 0;
 
+extern xenhost_ops_t xh_hvm_ops, xh_hvm_nested_ops;
+
 void __init xen_pvh_init(void)
 {
u32 msr;
u64 pfn;
 
+   xenhost_register(xenhost_r1, &xh_hvm_ops);
+
+   /*
+* Detect in some implementation defined manner whether this is
+* nested or not.
+*/
+   if (xen_driver_domain() && xen_nested())
+   xenhost_register(xenhost_r2, &xh_hvm_nested_ops);
+
xen_pvh = 1;
xen_start_flags = pvh_start_info.flags;
 
diff --git a/arch/x86/xen/xenhost.c b/arch/x86/xen/xenhost.c
new file mode 100644
index ..ca90acd7687e
--- /dev/null
+++ b/arch/x86/xen/xenhost.c
@@ -0,0 +1,75 @@
+#include 
+#include 
+#include 
+#include 
+
+xenhost_t xenhosts[2];
+/*
+ * xh_default: interface to the regular hypervisor. xenhost_type is xenhost_r0
+ * or xenhost_r1.
+ *
+ * xh_remote: interface to remote hypervisor. Needed for PV driver support on
+ * L1-dom0/driver-domain for nested Xen. xenhost_type is xenhost_r2.
+ */
+xenhost_t *xh_default = (xenhost_t *) &xenhosts[0];
+xenhost_t *xh_remote = (xenhost_t *) &xenhosts[1];
+
+/*
+ * Exported for use of for_each_xenhost().
+ */
+EXPORT_SYMBOL_GPL(xenhosts);
+
+/*
+ * Some places refer directly to a specific ty

[RFC PATCH 11/16] xen/grant-table: make grant-table xenhost aware

2019-05-09 Thread Ankur Arora
Largely mechanical changes: the exported grant table symbols now take
xenhost_t * as a parameter. Also, move the grant table global state
inside xenhost_t.

If there's more than one xenhost, then initialize both.
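
With that, setup is just a loop over whatever is registered, along the
lines below; passing the xenhost into gnttab_init() is assumed here, the
actual entry point lives in drivers/xen/grant-table.c:

  xenhost_t **xh;

  for_each_xenhost(xh)
          if (gnttab_init(*xh))           /* illustrative signature */
                  break;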

Signed-off-by: Ankur Arora 
---
 arch/x86/xen/grant-table.c |  71 +++--
 drivers/xen/grant-table.c  | 611 +
 include/xen/grant_table.h  |  72 ++---
 include/xen/xenhost.h  |  11 +
 4 files changed, 443 insertions(+), 322 deletions(-)

diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c
index ecb0d5450334..8f4b071427f9 100644
--- a/arch/x86/xen/grant-table.c
+++ b/arch/x86/xen/grant-table.c
@@ -23,48 +23,54 @@
 
 #include 
 
-static struct gnttab_vm_area {
+struct gnttab_vm_area {
struct vm_struct *area;
pte_t **ptes;
-} gnttab_shared_vm_area, gnttab_status_vm_area;
+};
 
-int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes,
-  unsigned long max_nr_gframes,
-  void **__shared)
+int arch_gnttab_map_shared(xenhost_t *xh, unsigned long *frames,
+   unsigned long nr_gframes,
+   unsigned long max_nr_gframes,
+   void **__shared)
 {
void *shared = *__shared;
unsigned long addr;
unsigned long i;
 
if (shared == NULL)
-   *__shared = shared = gnttab_shared_vm_area.area->addr;
+   *__shared = shared = ((struct gnttab_vm_area *)
+   xh->gnttab_shared_vm_area)->area->addr;
 
addr = (unsigned long)shared;
 
for (i = 0; i < nr_gframes; i++) {
-   set_pte_at(&init_mm, addr, gnttab_shared_vm_area.ptes[i],
-  mfn_pte(frames[i], PAGE_KERNEL));
+   set_pte_at(&init_mm, addr,
+   ((struct gnttab_vm_area *) 
xh->gnttab_shared_vm_area)->ptes[i],
+   mfn_pte(frames[i], PAGE_KERNEL));
addr += PAGE_SIZE;
}
 
return 0;
 }
 
-int arch_gnttab_map_status(uint64_t *frames, unsigned long nr_gframes,
-  unsigned long max_nr_gframes,
-  grant_status_t **__shared)
+int arch_gnttab_map_status(xenhost_t *xh, uint64_t *frames,
+   unsigned long nr_gframes,
+   unsigned long max_nr_gframes,
+   grant_status_t **__shared)
 {
grant_status_t *shared = *__shared;
unsigned long addr;
unsigned long i;
 
if (shared == NULL)
-   *__shared = shared = gnttab_status_vm_area.area->addr;
+   *__shared = shared = ((struct gnttab_vm_area *)
+   xh->gnttab_status_vm_area)->area->addr;
 
addr = (unsigned long)shared;
 
for (i = 0; i < nr_gframes; i++) {
-   set_pte_at(&init_mm, addr, gnttab_status_vm_area.ptes[i],
+   set_pte_at(&init_mm, addr, ((struct gnttab_vm_area *)
+   xh->gnttab_status_vm_area)->ptes[i],
   mfn_pte(frames[i], PAGE_KERNEL));
addr += PAGE_SIZE;
}
@@ -72,16 +78,17 @@ int arch_gnttab_map_status(uint64_t *frames, unsigned long 
nr_gframes,
return 0;
 }
 
-void arch_gnttab_unmap(void *shared, unsigned long nr_gframes)
+void arch_gnttab_unmap(xenhost_t *xh, void *shared, unsigned long nr_gframes)
 {
pte_t **ptes;
unsigned long addr;
unsigned long i;
 
-   if (shared == gnttab_status_vm_area.area->addr)
-   ptes = gnttab_status_vm_area.ptes;
+   if (shared == ((struct gnttab_vm_area *)
+   xh->gnttab_status_vm_area)->area->addr)
+   ptes = ((struct gnttab_vm_area *) 
xh->gnttab_status_vm_area)->ptes;
else
-   ptes = gnttab_shared_vm_area.ptes;
+   ptes = ((struct gnttab_vm_area *) 
xh->gnttab_shared_vm_area)->ptes;
 
addr = (unsigned long)shared;
 
@@ -112,14 +119,15 @@ static void arch_gnttab_vfree(struct gnttab_vm_area *area)
kfree(area->ptes);
 }
 
-int arch_gnttab_init(unsigned long nr_shared, unsigned long nr_status)
+int arch_gnttab_init(xenhost_t *xh, unsigned long nr_shared, unsigned long 
nr_status)
 {
int ret;
 
if (!xen_pv_domain())
return 0;
 
-   ret = arch_gnttab_valloc(&gnttab_shared_vm_area, nr_shared);
+   ret = arch_gnttab_valloc((struct gnttab_vm_area *)
+   xh->gnttab_shared_vm_area, nr_shared);
if (ret < 0)
return ret;
 
@@ -127,13 +135,15 @@ int arch_gnttab_init(unsigned long nr_shared, unsigned 
long nr_status)
 * Always allocate the space for the status frames in case
 * we're migrated 

[RFC PATCH 04/16] x86/xen: hypercall support for xenhost_t

2019-05-09 Thread Ankur Arora
Allow for different hypercall implementations for different xenhost types.
Nested xenhost, which has two underlying xenhosts, can use both
simultaneously.

The hypercall macros (HYPERVISOR_*) implicitly use the default xenhost.
A new set of macros (hypervisor_*) takes xenhost_t * as a parameter and
does the right thing.
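
In other words, the existing wrappers are expected to reduce to the
xenhost-taking variants on xh_default, roughly:

  /* Sketch of the intended relationship. */
  static inline int
  HYPERVISOR_xen_version(int cmd, void *arg)
  {
          return hypervisor_xen_version(xh_default, cmd, arg);
  }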

TODO:
  - Multicalls for now assume the default xenhost
  - xen_hypercall_* symbols are only generated for the default xenhost.

Signed-off-by: Ankur Arora 
---
 arch/x86/include/asm/xen/hypercall.h | 233 ++-
 arch/x86/xen/enlighten.c |   3 -
 arch/x86/xen/enlighten_hvm.c |  23 ++-
 arch/x86/xen/enlighten_pv.c  |  13 +-
 arch/x86/xen/enlighten_pvh.c |   9 +-
 arch/x86/xen/xen-head.S  |   3 +
 drivers/xen/fallback.c   |   8 +-
 include/xen/xenhost.h|  23 +++
 8 files changed, 218 insertions(+), 97 deletions(-)

diff --git a/arch/x86/include/asm/xen/hypercall.h 
b/arch/x86/include/asm/xen/hypercall.h
index 1a3cd6680e6f..e138f9c36a5a 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -51,6 +51,7 @@
 #include 
 #include 
 #include 
+#include 
 
 struct xen_dm_op_buf;
 
@@ -88,11 +89,11 @@ struct xen_dm_op_buf;
 
 struct hypercall_entry { char _entry[32]; };
 extern struct hypercall_entry xen_hypercall_page[128];
-extern struct hypercall_entry *hypercall_page;
+extern struct hypercall_entry xen_hypercall_page2[128];
 
 #define __HYPERCALLCALL_NOSPEC
-#define __HYPERCALL_ENTRY(x)   \
-   [thunk_target] "0" (hypercall_page + __HYPERVISOR_##x)
+#define __HYPERCALL_ENTRY(xh, x)   \
+   [thunk_target] "0" (xh->hypercall_page + __HYPERVISOR_##x)
 
 #ifdef CONFIG_X86_32
 #define __HYPERCALL_RETREG "eax"
@@ -144,57 +145,57 @@ extern struct hypercall_entry *hypercall_page;
 #define __HYPERCALL_CLOBBER1   __HYPERCALL_CLOBBER2, __HYPERCALL_ARG2REG
 #define __HYPERCALL_CLOBBER0   __HYPERCALL_CLOBBER1, __HYPERCALL_ARG1REG
 
-#define _hypercall0(type, name)\
+#define _hypercall0(xh, type, name)\
 ({ \
__HYPERCALL_DECLS;  \
__HYPERCALL_0ARG(); \
asm volatile (__HYPERCALL   \
  : __HYPERCALL_0PARAM  \
- : __HYPERCALL_ENTRY(name) \
+ : __HYPERCALL_ENTRY(xh, name) \
  : __HYPERCALL_CLOBBER0);  \
(type)__res;\
 })
 
-#define _hypercall1(type, name, a1)\
+#define _hypercall1(xh, type, name, a1)\
 ({ \
__HYPERCALL_DECLS;  \
__HYPERCALL_1ARG(a1);   \
asm volatile (__HYPERCALL   \
  : __HYPERCALL_1PARAM  \
- : __HYPERCALL_ENTRY(name) \
+ : __HYPERCALL_ENTRY(xh, name) \
  : __HYPERCALL_CLOBBER1);  \
(type)__res;\
 })
 
-#define _hypercall2(type, name, a1, a2)\
+#define _hypercall2(xh, type, name, a1, a2)\
 ({ \
__HYPERCALL_DECLS;  \
__HYPERCALL_2ARG(a1, a2);   \
asm volatile (__HYPERCALL   \
  : __HYPERCALL_2PARAM  \
- : __HYPERCALL_ENTRY(name) \
+ : __HYPERCALL_ENTRY(xh, name) \
  : __HYPERCALL_CLOBBER2);  \
(type)__res;\
 })
 
-#define _hypercall3(type, name, a1, a2, a3)\
+#define _hypercall3(xh, type, name, a1, a2, a3)\
 ({ \
__HYPERCALL_DECLS;  \
__HYPERCALL_3ARG(a1, a2, a3);   \

[RFC PATCH 05/16] x86/xen: add feature support in xenhost_t

2019-05-09 Thread Ankur Arora
With nested xenhosts, both the xenhosts could have different supported
xen_features. Add support for probing both.

In addition, validate that features are compatible across xenhosts.

For runtime feature checking, the code uses xen_feature() with the
default xenhost. This should be good enough because we do feature
validation early which guarantees that the features of interest are
compatible. Features not of interest are related to MMU, clock, pirq, etc.,
where the interface to L0-Xen should not matter.
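
A sketch of the intended split, using names that appear later in this
series (the exact definitions are not fully visible in this message):
per-xenhost lookup on the validation path, with the old interface pinned
to the default xenhost for runtime checks.

    static inline int __xen_feature(xenhost_t *xh, int flag)
    {
            return xh->features[flag];
    }

    #define xen_feature(flag)   __xen_feature(xh_default, (flag))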

Signed-off-by: Ankur Arora 
---
 arch/x86/xen/enlighten_hvm.c | 15 +++
 arch/x86/xen/enlighten_pv.c  | 14 ++
 drivers/xen/features.c   | 33 +++--
 include/xen/features.h   | 17 ++---
 include/xen/xenhost.h| 10 ++
 5 files changed, 72 insertions(+), 17 deletions(-)

diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c
index f84941d6944e..a118b61a1a8a 100644
--- a/arch/x86/xen/enlighten_hvm.c
+++ b/arch/x86/xen/enlighten_hvm.c
@@ -119,17 +119,24 @@ static void __init init_hvm_pv_info(void)
 
xen_domain_type = XEN_HVM_DOMAIN;
 
-   /* PVH set up hypercall page in xen_prepare_pvh(). */
if (xen_pvh_domain())
pv_info.name = "Xen PVH";
-   else {
+   else
pv_info.name = "Xen HVM";
 
-   for_each_xenhost(xh)
+   for_each_xenhost(xh) {
+   /* PVH set up hypercall page in xen_prepare_pvh(). */
+   if (!xen_pvh_domain())
xenhost_setup_hypercall_page(*xh);
+   xen_setup_features(*xh);
}
 
-   xen_setup_features();
+   /*
+* Check if features are compatible across L1-Xen and L0-Xen;
+* If not, get rid of xenhost_r2.
+*/
+   if (xen_validate_features() == false)
+   __xenhost_unregister(xenhost_r2);
 
cpuid(base + 4, &eax, &ebx, &ecx, &edx);
if (eax & XEN_HVM_CPUID_VCPU_ID_PRESENT)
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index a2c07cc71498..484968ff16a4 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -1236,13 +1236,19 @@ asmlinkage __visible void __init xen_start_kernel(void)
if (xen_driver_domain() && xen_nested())
xenhost_register(xenhost_r2, &xh_pv_nested_ops);
 
-   for_each_xenhost(xh)
-   xenhost_setup_hypercall_page(*xh);
-
xen_domain_type = XEN_PV_DOMAIN;
xen_start_flags = xen_start_info->flags;
 
-   xen_setup_features();
+   for_each_xenhost(xh) {
+   xenhost_setup_hypercall_page(*xh);
+   xen_setup_features(*xh);
+   }
+   /*
+* Check if features are compatible across L1-Xen and L0-Xen;
+* If not, get rid of xenhost_r2.
+*/
+   if (xen_validate_features() == false)
+   __xenhost_unregister(xenhost_r2);
 
/* Install Xen paravirt ops */
pv_info = xen_info;
diff --git a/drivers/xen/features.c b/drivers/xen/features.c
index d7d34fdfc993..b4fba808ebae 100644
--- a/drivers/xen/features.c
+++ b/drivers/xen/features.c
@@ -15,19 +15,40 @@
 #include 
 #include 
 
-u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly;
-EXPORT_SYMBOL_GPL(xen_features);
-
-void xen_setup_features(void)
+void xen_setup_features(xenhost_t *xh)
 {
struct xen_feature_info fi;
int i, j;
 
for (i = 0; i < XENFEAT_NR_SUBMAPS; i++) {
fi.submap_idx = i;
-   if (HYPERVISOR_xen_version(XENVER_get_features, &fi) < 0)
+   if (hypervisor_xen_version(xh, XENVER_get_features, &fi) < 0)
break;
for (j = 0; j < 32; j++)
-   xen_features[i * 32 + j] = !!(fi.submap & 1<<j);
+   xh->features[i * 32 + j] = !!(fi.submap & 1<<j);
	}
 }
+
+bool xen_validate_features(void)
+{
+   int fail = 0;
+
+   if (xh_default && xh_remote) {
+   /*
+* Check xh_default->features and xh_remote->features for
+* compatibility. Relevant features should be compatible
+* or we are asking for trouble.
+*/
+   fail += __xen_feature(xh_default, XENFEAT_auto_translated_physmap) !=
+   __xen_feature(xh_remote, XENFEAT_auto_translated_physmap);
+
+   /* We would like callbacks via hvm_callback_vector. */
+   fail += __xen_feature(xh_default, XENFEAT_hvm_callback_vector) == 0;
+   fail += __xen_feature(xh_remote, XENFEAT_hvm_callback_vector) == 0;
+
+   if (fail)
+   return false;
+   }
+
+   return fail ? false : true;
+}
diff --git a/include/xen/features.h b/include/xen/features.h
index e4cb464386a9..63e6735ed6a3 100644
--- a/include/xen/features.h
+++ b/include/xen/features.h
@@ -11,14 +11,25 @@
 #define __XEN_FEATURES_H__
 
 #include 
+#include 
 
-void xen_setup_features(void);
+void xen_setup_features(xenhost_t *xh);
 
-extern u8 xe

Re: [RFC PATCH 10/16] xen/balloon: support ballooning in xenhost_t

2019-06-18 Thread Ankur Arora




On 6/17/19 2:28 AM, Juergen Gross wrote:

On 09.05.19 19:25, Ankur Arora wrote:

Xen ballooning uses hollow struct pages (with the underlying GFNs being
populated/unpopulated via hypercalls) which are used by the grant logic
to map grants from other domains.

This patch allows the default xenhost to provide an alternate ballooning
allocation mechanism. This is expected to be useful for local xenhosts
(type xenhost_r0) because unlike Xen, where there is an external
hypervisor which can change the memory underneath a GFN, that is not
possible when the hypervisor is running in the same address space
as the entity doing the ballooning.

Co-developed-by: Ankur Arora 
Signed-off-by: Joao Martins 
Signed-off-by: Ankur Arora 
---
  arch/x86/xen/enlighten_hvm.c   |  7 +++
  arch/x86/xen/enlighten_pv.c    |  8 
  drivers/xen/balloon.c  | 19 ---
  drivers/xen/grant-table.c  |  4 ++--
  drivers/xen/privcmd.c  |  4 ++--
  drivers/xen/xen-selfballoon.c  |  2 ++
  drivers/xen/xenbus/xenbus_client.c |  6 +++---
  drivers/xen/xlate_mmu.c    |  4 ++--
  include/xen/balloon.h  |  4 ++--
  include/xen/xenhost.h  | 19 +++
  10 files changed, 63 insertions(+), 14 deletions(-)

diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
index 5ef4d6ad920d..08becf574743 100644
--- a/drivers/xen/balloon.c
+++ b/drivers/xen/balloon.c
@@ -63,6 +63,7 @@
  #include 
  #include 
+#include 
  #include 
  #include 
@@ -583,12 +584,21 @@ static int add_ballooned_pages(int nr_pages)
   * @pages: pages returned
   * @return 0 on success, error otherwise
   */
-int alloc_xenballooned_pages(int nr_pages, struct page **pages)
+int alloc_xenballooned_pages(xenhost_t *xh, int nr_pages, struct page **pages)

  {
  int pgno = 0;
  struct page *page;
  int ret;
+    /*
+ * xenmem transactions for remote xenhost are disallowed.
+ */
+    if (xh->type == xenhost_r2)
+    return -EINVAL;


Why don't you set a dummy function returning -EINVAL into the xenhost_r2
structure instead?

Will do. And, same for the two comments below.

Ankur




+
+    if (xh->ops->alloc_ballooned_pages)
+    return xh->ops->alloc_ballooned_pages(xh, nr_pages, pages);
+


Please make alloc_xenballooned_pages() an inline wrapper and use the
current implementation as the default. This avoids another if ().


  mutex_lock(&balloon_mutex);
  balloon_stats.target_unpopulated += nr_pages;
@@ -620,7 +630,7 @@ int alloc_xenballooned_pages(int nr_pages, struct page **pages)

  return 0;
   out_undo:
  mutex_unlock(&balloon_mutex);
-    free_xenballooned_pages(pgno, pages);
+    free_xenballooned_pages(xh, pgno, pages);
  return ret;
  }
  EXPORT_SYMBOL(alloc_xenballooned_pages);
@@ -630,10 +640,13 @@ EXPORT_SYMBOL(alloc_xenballooned_pages);
   * @nr_pages: Number of pages
   * @pages: pages to return
   */
-void free_xenballooned_pages(int nr_pages, struct page **pages)
+void free_xenballooned_pages(xenhost_t *xh, int nr_pages, struct page **pages)

  {
  int i;
+    if (xh->ops->free_ballooned_pages)
+    return xh->ops->free_ballooned_pages(xh, nr_pages, pages);
+


Same again: please use an inline wrapper.


Juergen
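
For reference, a sketch of the inline-wrapper direction suggested above,
folding in the earlier suggestion of a dummy xenhost_r2 op (illustrative
only, names follow the quoted patch):

    /*
     * Every xenhost's ops provide alloc_ballooned_pages, so the wrapper
     * needs no if (): xh_default's op points at the current implementation,
     * xenhost_r2's at a stub returning -EINVAL.
     */
    static inline int alloc_xenballooned_pages(xenhost_t *xh, int nr_pages,
                                               struct page **pages)
    {
            return xh->ops->alloc_ballooned_pages(xh, nr_pages, pages);
    }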


Re: [RFC PATCH 11/16] xen/grant-table: make grant-table xenhost aware

2019-06-18 Thread Ankur Arora

On 6/17/19 2:36 AM, Juergen Gross wrote:

On 09.05.19 19:25, Ankur Arora wrote:

Largely mechanical changes: the exported grant table symbols now take
xenhost_t * as a parameter. Also, move the grant table global state
inside xenhost_t.

If there's more than one xenhost, then initialize both.

Signed-off-by: Ankur Arora 
---
  arch/x86/xen/grant-table.c |  71 +++--
  drivers/xen/grant-table.c  | 611 +
  include/xen/grant_table.h  |  72 ++---
  include/xen/xenhost.h  |  11 +
  4 files changed, 443 insertions(+), 322 deletions(-)

diff --git a/include/xen/xenhost.h b/include/xen/xenhost.h
index 9e08627a9e3e..acee0c7872b6 100644
--- a/include/xen/xenhost.h
+++ b/include/xen/xenhost.h
@@ -129,6 +129,17 @@ typedef struct {
  const struct evtchn_ops *evtchn_ops;
  int **evtchn_to_irq;
  };
+
+    /* grant table private state */
+    struct {
+    /* private to drivers/xen/grant-table.c */
+    void *gnttab_private;
+
+    /* x86/xen/grant-table.c */
+    void *gnttab_shared_vm_area;
+    void *gnttab_status_vm_area;
+    void *auto_xlat_grant_frames;


Please use proper types here instead of void *. This avoids lots of
casts. It is okay to just add anonymous struct definitions and keep the
real struct layout local to grant table code.

Will fix.
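
A sketch of that direction (struct names illustrative; the layouts stay
private to the grant-table code):

    struct gnttab_vm_area;      /* defined in arch/x86/xen/grant-table.c */
    struct gnttab_private;      /* hypothetical name for drivers/xen/ state */

    typedef struct xenhost {
            /* other members elided */

            /* grant table private state */
            struct {
                    struct gnttab_private *gnttab_private;
                    struct gnttab_vm_area *gnttab_shared_vm_area;
                    struct gnttab_vm_area *gnttab_status_vm_area;
            };
    } xenhost_t;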

Ankur




Juergen


Re: [RFC PATCH 12/16] xen/xenbus: support xenbus frontend/backend with xenhost_t

2019-06-18 Thread Ankur Arora




On 6/17/19 2:50 AM, Juergen Gross wrote:

On 09.05.19 19:25, Ankur Arora wrote:

As part of xenbus init, both frontend, backend interfaces need to talk
on the correct xenbus. This might be a local xenstore (backend) or might
be a XS_PV/XS_HVM interface (frontend) which needs to talk over xenbus
with the remote xenstored. We bootstrap all of these with evtchn/gfn
parameters from (*setup_xs)().

Given this we can do appropriate device discovery (in case of frontend)
and device connectivity for the backend.
Once done, we stash the xenhost_t * in xen_bus_type, xenbus_device or
xenbus_watch and then the frontend and backend devices implicitly use
the correct interface.

The rest of patch is just changing the interfaces where needed.

Signed-off-by: Ankur Arora 
---
  drivers/block/xen-blkback/blkback.c    |  10 +-
  drivers/net/xen-netfront.c |  14 +-
  drivers/pci/xen-pcifront.c |   4 +-
  drivers/xen/cpu_hotplug.c  |   4 +-
  drivers/xen/manage.c   |  28 +--
  drivers/xen/xen-balloon.c  |   8 +-
  drivers/xen/xenbus/xenbus.h    |  45 ++--
  drivers/xen/xenbus/xenbus_client.c |  32 +--
  drivers/xen/xenbus/xenbus_comms.c  | 121 +-
  drivers/xen/xenbus/xenbus_dev_backend.c    |  30 ++-
  drivers/xen/xenbus/xenbus_dev_frontend.c   |  22 +-
  drivers/xen/xenbus/xenbus_probe.c  | 246 +
  drivers/xen/xenbus/xenbus_probe_backend.c  |  19 +-
  drivers/xen/xenbus/xenbus_probe_frontend.c |  65 +++---
  drivers/xen/xenbus/xenbus_xs.c | 188 +---
  include/xen/xen-ops.h  |   3 +
  include/xen/xenbus.h   |  54 +++--
  include/xen/xenhost.h  |  20 ++
  18 files changed, 536 insertions(+), 377 deletions(-)

diff --git a/drivers/xen/xenbus/xenbus_dev_frontend.c b/drivers/xen/xenbus/xenbus_dev_frontend.c
index c3e201025ef0..d6e0c397c6a0 100644
--- a/drivers/xen/xenbus/xenbus_dev_frontend.c
+++ b/drivers/xen/xenbus/xenbus_dev_frontend.c
@@ -58,10 +58,14 @@
  #include 
  #include 
+#include 
+#include 
  #include 
  #include "xenbus.h"
+static xenhost_t *xh;
+
  /*
   * An element of a list of outstanding transactions, for which we're
   * still waiting a reply.
@@ -312,13 +316,13 @@ static void xenbus_file_free(struct kref *kref)
   */
  list_for_each_entry_safe(trans, tmp, &u->transactions, list) {
-    xenbus_transaction_end(trans->handle, 1);
+    xenbus_transaction_end(xh, trans->handle, 1);
  list_del(&trans->list);
  kfree(trans);
  }
  list_for_each_entry_safe(watch, tmp_watch, &u->watches, list) {
-    unregister_xenbus_watch(&watch->watch);
+    unregister_xenbus_watch(xh, &watch->watch);
  list_del(&watch->list);
  free_watch_adapter(watch);
  }
@@ -450,7 +454,7 @@ static int xenbus_write_transaction(unsigned msg_type,

 (!strcmp(msg->body, "T") || !strcmp(msg->body, "F"
  return xenbus_command_reply(u, XS_ERROR, "EINVAL");
-    rc = xenbus_dev_request_and_reply(&msg->hdr, u);
+    rc = xenbus_dev_request_and_reply(xh, &msg->hdr, u);
  if (rc && trans) {
  list_del(&trans->list);
  kfree(trans);
@@ -489,7 +493,7 @@ static int xenbus_write_watch(unsigned msg_type, struct xenbus_file_priv *u)

  watch->watch.callback = watch_fired;
  watch->dev_data = u;
-    err = register_xenbus_watch(&watch->watch);
+    err = register_xenbus_watch(xh, &watch->watch);
  if (err) {
  free_watch_adapter(watch);
  rc = err;
@@ -500,7 +504,7 @@ static int xenbus_write_watch(unsigned msg_type, struct xenbus_file_priv *u)

  list_for_each_entry(watch, &u->watches, list) {
  if (!strcmp(watch->token, token) &&
  !strcmp(watch->watch.node, path)) {
-    unregister_xenbus_watch(&watch->watch);
+    unregister_xenbus_watch(xh, &watch->watch);
  list_del(&watch->list);
  free_watch_adapter(watch);
  break;
@@ -618,8 +622,9 @@ static ssize_t xenbus_file_write(struct file *filp,
  static int xenbus_file_open(struct inode *inode, struct file *filp)
  {
  struct xenbus_file_priv *u;
+    struct xenstore_private *xs = xs_priv(xh);
-    if (xen_store_evtchn == 0)
+    if (xs->store_evtchn == 0)
  return -ENOENT;
  nonseekable_open(inode, filp);
@@ -687,6 +692,11 @@ static int __init xenbus_init(void)
  if (!xen_domain())
  return -ENODEV;
+    if (xen_driver_domain() && xen_nested())
+    xh = xh_remote;
+    else
+    xh = xh_default;


This precludes any mixed use of L0 and L1 frontends. With this 

Re: [RFC PATCH 13/16] drivers/xen: gnttab, evtchn, xenbus API changes

2019-06-18 Thread Ankur Arora

On 6/17/19 3:07 AM, Juergen Gross wrote:

On 09.05.19 19:25, Ankur Arora wrote:

Mechanical changes, now most of these calls take xenhost_t *
as parameter.

Co-developed-by: Joao Martins 
Signed-off-by: Ankur Arora 
---
  drivers/xen/cpu_hotplug.c | 14 ++---
  drivers/xen/gntalloc.c    | 13 
  drivers/xen/gntdev.c  | 16 +++
  drivers/xen/manage.c  | 37 ++-
  drivers/xen/platform-pci.c    | 12 +++-
  drivers/xen/sys-hypervisor.c  | 12 
  drivers/xen/xen-balloon.c | 10 +++---
  drivers/xen/xenfs/xenstored.c |  7 ---
  8 files changed, 73 insertions(+), 48 deletions(-)

diff --git a/drivers/xen/cpu_hotplug.c b/drivers/xen/cpu_hotplug.c
index afeb94446d34..4a05bc028956 100644
--- a/drivers/xen/cpu_hotplug.c
+++ b/drivers/xen/cpu_hotplug.c
@@ -31,13 +31,13 @@ static void disable_hotplug_cpu(int cpu)
  unlock_device_hotplug();
  }
-static int vcpu_online(unsigned int cpu)
+static int vcpu_online(xenhost_t *xh, unsigned int cpu)


Do we really need xenhost for cpu on/offlinig?

I was in two minds about this. We only need it for the xenbus
interfaces which could very well have been just xh_default.

However, the xenhost is part of the xenbus_watch state, so
I thought it is easier to percolate that down instead of
adding xh_default all over the place.




diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
index 9a69d955dd5c..1655d0a039fd 100644
--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -227,14 +227,14 @@ static void shutdown_handler(struct xenbus_watch *watch,

  return;
   again:
-    err = xenbus_transaction_start(xh_default, &xbt);
+    err = xenbus_transaction_start(watch->xh, &xbt);
  if (err)
  return;
-    str = (char *)xenbus_read(xh_default, xbt, "control", "shutdown", NULL);
+    str = (char *)xenbus_read(watch->xh, xbt, "control", "shutdown", NULL);

  /* Ignore read errors and empty reads. */
  if (XENBUS_IS_ERR_READ(str)) {
-    xenbus_transaction_end(xh_default, xbt, 1);
+    xenbus_transaction_end(watch->xh, xbt, 1);
  return;
  }
@@ -245,9 +245,9 @@ static void shutdown_handler(struct xenbus_watch *watch,

  /* Only acknowledge commands which we are prepared to handle. */
  if (idx < ARRAY_SIZE(shutdown_handlers))
-    xenbus_write(xh_default, xbt, "control", "shutdown", "");
+    xenbus_write(watch->xh, xbt, "control", "shutdown", "");
-    err = xenbus_transaction_end(xh_default, xbt, 0);
+    err = xenbus_transaction_end(watch->xh, xbt, 0);
  if (err == -EAGAIN) {
  kfree(str);
  goto again;
@@ -272,10 +272,10 @@ static void sysrq_handler(struct xenbus_watch *watch, const char *path,

  int err;
   again:
-    err = xenbus_transaction_start(xh_default, &xbt);
+    err = xenbus_transaction_start(watch->xh, &xbt);
  if (err)
  return;
-    err = xenbus_scanf(xh_default, xbt, "control", "sysrq", "%c", &sysrq_key);
+    err = xenbus_scanf(watch->xh, xbt, "control", "sysrq", "%c", &sysrq_key);

  if (err < 0) {
  /*
   * The Xenstore watch fires directly after registering it and
@@ -287,21 +287,21 @@ static void sysrq_handler(struct xenbus_watch *watch, const char *path,

  if (err != -ENOENT && err != -ERANGE)
  pr_err("Error %d reading sysrq code in control/sysrq\n",
 err);
-    xenbus_transaction_end(xh_default, xbt, 1);
+    xenbus_transaction_end(watch->xh, xbt, 1);
  return;
  }
  if (sysrq_key != '\0') {
-    err = xenbus_printf(xh_default, xbt, "control", "sysrq", "%c", '\0');
+    err = xenbus_printf(watch->xh, xbt, "control", "sysrq", "%c", '\0');

  if (err) {
  pr_err("%s: Error %d writing sysrq in control/sysrq\n",
 __func__, err);
-    xenbus_transaction_end(xh_default, xbt, 1);
+    xenbus_transaction_end(watch->xh, xbt, 1);
  return;
  }
  }
-    err = xenbus_transaction_end(xh_default, xbt, 0);
+    err = xenbus_transaction_end(watch->xh, xbt, 0);
  if (err == -EAGAIN)
  goto again;
@@ -324,14 +324,14 @@ static struct notifier_block xen_reboot_nb = {
  .notifier_call = poweroff_nb,
  };
-static int setup_shutdown_watcher(void)
+static int setup_shutdown_watcher(xenhost_t *xh)


I think shutdown is purely local, too.

Yes, I introduced xenhost for the same reason as above.

I agree that either of these cases (and similar others) have no use
for the concept of xenhost. Do you think it makes sense for these
to pass NULL instead and the underlying interface would just assume
xh_default.
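
Something like the below is what that would reduce to (helper name is just
illustrative); setup_shutdown_watcher() and vcpu_online() could then simply
pass NULL:

    /* NULL at the API boundary means "use the default xenhost". */
    static inline xenhost_t *xh_or_default(xenhost_t *xh)
    {
            return xh ? xh : xh_default;
    }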

Ankur




Juergen


Re: [RFC PATCH 16/16] xen/grant-table: host_addr fixup in mapping on xenhost_r0

2019-06-18 Thread Ankur Arora

On 6/17/19 3:55 AM, Juergen Gross wrote:

On 09.05.19 19:25, Ankur Arora wrote:

Xenhost type xenhost_r0 does not support standard GNTTABOP_map_grant_ref
semantics (map a gref onto a specified host_addr). That's because
since the hypervisor is local (same address space as the caller of
GNTTABOP_map_grant_ref), there is no external entity that could
map an arbitrary page underneath an arbitrary address.

To handle this, the GNTTABOP_map_grant_ref hypercall on xenhost_r0
treats the host_addr as an OUT parameter instead of IN and expects the
gnttab_map_refs() and similar to fixup any state that caches the
value of host_addr from before the hypercall.

Accordingly gnttab_map_refs() now adds two parameters, a fixup function
and a pointer to cached maps to fixup:
  int gnttab_map_refs(xenhost_t *xh, struct gnttab_map_grant_ref *map_ops,
  struct gnttab_map_grant_ref *kmap_ops,
-    struct page **pages, unsigned int count)
+    struct page **pages, gnttab_map_fixup_t map_fixup_fn,
+    void **map_fixup[], unsigned int count)

The reason we use a fixup function and not an additional mapping op
in the xenhost_t is because, depending on the caller, what we are fixing
might be different: blkback and netback, for instance, cache host_addr
via a struct page *, while __xenbus_map_ring() caches a phys_addr.

This patch fixes up xen-blkback and xen-gntdev drivers.

TODO:
   - also rewrite gnttab_batch_map() and __xenbus_map_ring().
   - modify xen-netback, scsiback, pciback etc

Co-developed-by: Joao Martins 
Signed-off-by: Ankur Arora 


Without seeing the __xenbus_map_ring() modification it is impossible to
do a proper review of this patch.

Will do in v2.
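
To make the fixup idea a bit more concrete in the meantime, this is one
possible shape of a blkback-style fixup. The actual gnttab_map_fixup_t
prototype is not visible in this thread, so both the signature and the
helper name here are assumptions:

    /*
     * Replace the cached struct page pointers with the pages backing the
     * host_addr values chosen by the xenhost_r0 hypercall.
     */
    static void blkback_map_fixup(struct gnttab_map_grant_ref *map_ops,
                                  void **fixup[], unsigned int count)
    {
            unsigned int i;

            for (i = 0; i < count; i++) {
                    struct page **cached = (struct page **)fixup[i];

                    *cached = virt_to_page((void *)(unsigned long)
                                           map_ops[i].host_addr);
            }
    }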

Ankur




Juergen


Re: [RFC PATCH 14/16] xen/blk: gnttab, evtchn, xenbus API changes

2019-06-18 Thread Ankur Arora

On 6/17/19 3:14 AM, Juergen Gross wrote:

On 09.05.19 19:25, Ankur Arora wrote:

For the most part, we now pass xenhost_t * as a parameter.

Co-developed-by: Joao Martins 
Signed-off-by: Ankur Arora 


I don't see how this can be a patch on its own.

Yes, the reason this was separate was that, given this was an
RFC, I didn't want to pollute the logic patches with lots of
mechanical changes.



The only way to be able to use a patch for each driver would be to
keep the original grant-, event- and xenbus-interfaces and add the
new ones taking xenhost * with a new name. The original interfaces
could then use xenhost_default and you can switch them to the new
interfaces one by one. The last patch could then remove the old
interfaces when there is no user left.

Yes, this makes sense.
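
A sketch of that migration path, using the grant-table API as an example
(the _xh suffix is only illustrative):

    /* New, xenhost-aware interface. */
    int gnttab_grant_foreign_access_xh(xenhost_t *xh, domid_t domid,
                                       unsigned long frame, int readonly);

    /*
     * The old interface keeps working against the default xenhost until
     * every caller is converted; a final patch can then drop it.
     */
    static inline int gnttab_grant_foreign_access(domid_t domid,
                                                  unsigned long frame,
                                                  int readonly)
    {
            return gnttab_grant_foreign_access_xh(xh_default, domid,
                                                  frame, readonly);
    }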

Ankur




Juergen


Re: [RFC PATCH 01/16] x86/xen: add xenhost_t interface

2019-06-11 Thread Ankur Arora

On 2019-06-07 8:04 a.m., Juergen Gross wrote:

On 09.05.19 19:25, Ankur Arora wrote:

Add xenhost_t which will serve as an abstraction over Xen interfaces.
It co-exists with the PV/HVM/PVH abstractions (x86_init, hypervisor_x86,
pv_ops etc) and is meant to capture mechanisms for communication with
Xen so we could have different types of underlying Xen: regular, local,
and nested.

Also add xenhost_register() and stub registration in the various guest
types.

Signed-off-by: Ankur Arora 
---
  arch/x86/xen/Makefile    |  1 +
  arch/x86/xen/enlighten_hvm.c | 13 +
  arch/x86/xen/enlighten_pv.c  | 16 ++
  arch/x86/xen/enlighten_pvh.c | 12 +
  arch/x86/xen/xenhost.c   | 75 
  include/xen/xen.h    |  3 ++
  include/xen/xenhost.h    | 95 
  7 files changed, 215 insertions(+)
  create mode 100644 arch/x86/xen/xenhost.c
  create mode 100644 include/xen/xenhost.h

diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 084de77a109e..564b4dddbc15 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -18,6 +18,7 @@ obj-y    += mmu.o
  obj-y    += time.o
  obj-y    += grant-table.o
  obj-y    += suspend.o
+obj-y    += xenhost.o
  obj-$(CONFIG_XEN_PVHVM)    += enlighten_hvm.o
  obj-$(CONFIG_XEN_PVHVM)    += mmu_hvm.o
diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c
index 0e75642d42a3..100452f4f44c 100644
--- a/arch/x86/xen/enlighten_hvm.c
+++ b/arch/x86/xen/enlighten_hvm.c
@@ -5,6 +5,7 @@
  #include 
  #include 
+#include 
  #include 
  #include 
  #include 
@@ -82,6 +83,12 @@ static void __init xen_hvm_init_mem_mapping(void)
  xen_vcpu_info_reset(0);
  }
+xenhost_ops_t xh_hvm_ops = {
+};
+
+xenhost_ops_t xh_hvm_nested_ops = {
+};
+
  static void __init init_hvm_pv_info(void)
  {
  int major, minor;
@@ -179,6 +186,12 @@ static void __init xen_hvm_guest_init(void)
  {
  if (xen_pv_domain())
  return;
+    /*
+ * We need only xenhost_r1 for HVM guests since they cannot be
+ * driver domain (?) or dom0.


I think even HVM guests could (in theory) be driver domains.


+ */
+    if (!xen_pvh_domain())
+    xenhost_register(xenhost_r1, &xh_hvm_ops);
  init_hvm_pv_info();
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index c54a493e139a..bb6e811c1525 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -36,6 +36,7 @@
  #include 
  #include 
+#include 
  #include 
  #include 
  #include 
@@ -1188,6 +1189,12 @@ static void __init 
xen_dom0_set_legacy_features(void)

  x86_platform.legacy.rtc = 1;
  }
+xenhost_ops_t xh_pv_ops = {
+};
+
+xenhost_ops_t xh_pv_nested_ops = {
+};
+
  /* First C function to be called on Xen boot */
  asmlinkage __visible void __init xen_start_kernel(void)
  {
@@ -1198,6 +1205,15 @@ asmlinkage __visible void __init 
xen_start_kernel(void)

  if (!xen_start_info)
  return;
+    xenhost_register(xenhost_r1, &xh_pv_ops);
+
+    /*
+ * Detect in some implementation defined manner whether this is
+ * nested or not.
+ */
+    if (xen_driver_domain() && xen_nested())
+    xenhost_register(xenhost_r2, &xh_pv_nested_ops);


I don't think a driver domain other than dom0 "knows" this in the
beginning. It will need to register xenhost_r2

Right. No point in needlessly registering as xenhost_r2 without
needing to handle any xenhost_r2 devices.


 in case it learns about a pv device from L0 hypervisor.

What's the mechanism you are thinking of, for this?
I'm guessing this PV device notification could arrive at an
arbitrary point in time after the system has booted.

The earlier reason for my assumption that the driver-domain
would "know" this at boot was that it seemed to me
we would need to set up hypercall/shared_info/vcpu_info.

Given that we don't need cpuid/hypercall/shared_info, the remaining
few look like they could be made dynamically callable with a bit
of refactoring:
- vcpu_info: the registration logic (xen_vcpu_setup() and friends)
  seems straight-forwardly adaptable to be called dynamically for
  xenhost_r2. Places where we touch the vcpu_info bits (xen_irq_ops)
  also seem fine.
- evtchn: xenhost_r2 should only need interdomain evtchns, so
  should be easy to defer to until we get a xenhost_r2 device.
- grant-table/xenbus: the xenhost_r2 logic (in the current patchset)
  expects to be inited at core_initcall and postcore_initcall
  respectively. Again, doesn't




+
  xen_domain_type = XEN_PV_DOMAIN;
  xen_start_flags = xen_start_info->flags;
diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c
index 35b7599d2d0b..826c296d27a3 100644
--- a/arch/x86/xen/enlighten_pvh.c
+++ b/arch/x86/xen/enlighten_pvh.c
@@ -8,6 +8,7 @@
  #include 
  #include 
+#include 
  #include 
  #incl

Re: [RFC PATCH 06/16] x86/xen: add shared_info support to xenhost_t

2019-06-07 Thread Ankur Arora

On 2019-06-07 8:08 a.m., Juergen Gross wrote:

On 09.05.19 19:25, Ankur Arora wrote:

HYPERVISOR_shared_info is used for irq/evtchn communication between the
guest and the host. Abstract out the setup/reset in xenhost_t such that
nested configurations can use both xenhosts simultaneously.


I have mixed feelings about this patch. Most of the shared_info stuff we
don't need for the nested case. In the end only the event channels might
be interesting, but we obviously want them not for all vcpus of the L1
hypervisor, but for those of the current guest.

Agreed about the mixed feelings part. shared_info does feel far too
heavy to drag along just for the event-channel state.
Infact, on thinking a bit more, a better abstraction for nested
event-channels would have been as an extension to the primary
xenhost's event-channel bits.
(The nested upcalls also go via the primary xenhost in patch-8.)

Ankur



So I think just drop that patch for now. We can dig it out later in case
nesting wants it again.


Juergen




Re: [RFC PATCH 00/16] xenhost support

2019-06-07 Thread Ankur Arora

On 2019-06-07 7:51 a.m., Juergen Gross wrote:

On 09.05.19 19:25, Ankur Arora wrote:

Hi all,

This is an RFC for xenhost support, outlined here by Juergen here:
https://lkml.org/lkml/2019/4/8/67.


First: thanks for all the effort you've put into this series!


The high level idea is to provide an abstraction of the Xen
communication interface, as a xenhost_t.

xenhost_t expose ops for communication between the guest and Xen
(hypercall, cpuid, shared_info/vcpu_info, evtchn, grant-table and on top
of those, xenbus, ballooning), and these can differ based on the kind
of underlying Xen: regular, local, and nested.


I'm not sure we need to abstract away hypercalls and cpuid. I believe in
case of nested Xen all contacts to the L0 hypervisor should be done via
the L1 hypervisor. So we might need to issue some kind of passthrough

Yes, that does make sense. This also allows the L1 hypervisor to
control which hypercalls can be nested.
As for cpuid, what about nested feature discovery such as in
gnttab_need_v2()?
(Though for this particular case, the hypercall should be fine.)


hypercall when e.g. granting a page to L0 dom0, but this should be
handled via the grant abstraction (events should be similar).

So IMO we should drop patches 2-5.

For 3-5, I'd like to prune them to provide a limited hypercall
registration ability -- this is meant to be used for the
xenhost_r0/xenhost_local case.

Ankur




(Since this abstraction is largely about guest -- xenhost communication,
no ops are needed for timer, clock, sched, memory (MMU, P2M), VCPU mgmt.
etc.)

Xenhost use-cases:

Regular-Xen: the standard Xen interface presented to a guest,
specifically for comunication between Lx-guest and Lx-Xen.

Local-Xen: a Xen like interface which runs in the same address space as
the guest (dom0). This, can act as the default xenhost.

The major ways it differs from a regular Xen interface is in presenting
a different hypercall interface (call instead of a syscall/vmcall), and
in an inability to do grant-mappings: since local-Xen exists in the same
address space as Xen, there's no way for it to cheaply change the
physical page that a GFN maps to (assuming no P2M tables.)

Nested-Xen: this channel is to Xen, one level removed: from L1-guest to
L0-Xen. The use case is that we want L0-dom0-backends to talk to
L1-dom0-frontend drivers which can then present PV devices which can
in-turn be used by the L1-dom0-backend drivers as raw underlying devices.
The interfaces themselves, broadly remain similar.

Note: L0-Xen, L1-Xen represent Xen running at that nesting level
and L0-guest, L1-guest represent guests that are children of Xen
at that nesting level. Lx, represents any level.

Patches 1-7,
   "x86/xen: add xenhost_t interface"
   "x86/xen: cpuid support in xenhost_t"
   "x86/xen: make hypercall_page generic"
   "x86/xen: hypercall support for xenhost_t"
   "x86/xen: add feature support in xenhost_t"
   "x86/xen: add shared_info support to xenhost_t"
   "x86/xen: make vcpu_info part of xenhost_t"
abstract out interfaces that setup 
hypercalls/cpuid/shared_info/vcpu_info etc.


Patch 8, "x86/xen: irq/upcall handling with multiple xenhosts"
sets up the upcall and pv_irq ops based on vcpu_info.

Patch 9, "xen/evtchn: support evtchn in xenhost_t" adds xenhost based
evtchn support for evtchn_2l.

Patches 10 and 16, "xen/balloon: support ballooning in xenhost_t" and
"xen/grant-table: host_addr fixup in mapping on xenhost_r0"
implement support from GNTTABOP_map_grant_ref for xenhosts of type
xenhost_r0 (xenhost local.)

Patch 12, "xen/xenbus: support xenbus frontend/backend with xenhost_t"
makes xenbus so that both its frontend and backend can be bootstrapped
separately via separate xenhosts.

Remaining patches, 11, 13, 14, 15:
   "xen/grant-table: make grant-table xenhost aware"
   "drivers/xen: gnttab, evtchn, xenbus API changes"
   "xen/blk: gnttab, evtchn, xenbus API changes"
   "xen/net: gnttab, evtchn, xenbus API changes"
are mostly mechanical changes for APIs that now take xenhost_t *
as parameter.

The code itself is RFC quality, and is mostly meant to get feedback before
proceeding further. Also note that the FIFO logic and some Xen drivers
(input, pciback, scsi etc) are mostly unchanged, so will not build.


Please take a look.



Juergen




Re: [Xen-devel] [RFC PATCH 00/16] xenhost support

2019-06-07 Thread Ankur Arora

On 2019-06-07 9:21 a.m., Juergen Gross wrote:

On 07.06.19 17:22, Joao Martins wrote:

On 6/7/19 3:51 PM, Juergen Gross wrote:

On 09.05.19 19:25, Ankur Arora wrote:

Hi all,

This is an RFC for xenhost support, outlined here by Juergen here:
https://lkml.org/lkml/2019/4/8/67.


First: thanks for all the effort you've put into this series!


The high level idea is to provide an abstraction of the Xen
communication interface, as a xenhost_t.

xenhost_t expose ops for communication between the guest and Xen
(hypercall, cpuid, shared_info/vcpu_info, evtchn, grant-table and on top
of those, xenbus, ballooning), and these can differ based on the kind
of underlying Xen: regular, local, and nested.


I'm not sure we need to abstract away hypercalls and cpuid. I believe in
case of nested Xen all contacts to the L0 hypervisor should be done via
the L1 hypervisor. So we might need to issue some kind of passthrough
hypercall when e.g. granting a page to L0 dom0, but this should be
handled via the grant abstraction (events should be similar).

Just to be clear: By "kind of passthrough hypercall" you mean (e.g. 
for every
access/modify of grant table frames) you would proxy hypercall to L0 
Xen via L1 Xen?


It might be possible to spare some hypercalls by directly writing to
grant frames mapped into L1 dom0, but in general you are right.

Wouldn't we still need map/unmap_grant_ref?
AFAICS, both the xenhost_direct and the xenhost_indirect cases should be
very similar (apart from the need to proxy in the indirect case.)

Ankur




Juergen

___
Xen-devel mailing list
xen-de...@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel




Re: cputime takes cstate into consideration

2019-07-08 Thread Ankur Arora

On 2019-06-26 12:23 p.m., Thomas Gleixner wrote:

On Wed, 26 Jun 2019, Raslan, KarimAllah wrote:

On Wed, 2019-06-26 at 10:54 -0400, Konrad Rzeszutek Wilk wrote:

There were some ideas that Ankur (CC-ed) mentioned to me of using the perf
counters (in the host) to sample the guest and construct a better
accounting idea of what the guest does. That way the dashboard
from the host would not show 100% CPU utilization.


You can either use the UNHALTED cycles perf-counter or you can use MPERF/APERF
MSRs for that. (sorry I got distracted and forgot to send the patch)


Sure, but then you conflict with the other people who fight tooth and nail
over every single performance counter.

How about using Intel PT PwrEvt extensions? This should allow us to
precisely track idle residency via just MWAIT and TSC packets. Should
be pretty cheap too. It's post Cascade Lake though.

Ankur



Thanks,

tglx





Re: cputime takes cstate into consideration

2019-07-09 Thread Ankur Arora

On 7/9/19 5:38 AM, Peter Zijlstra wrote:

On Mon, Jul 08, 2019 at 07:00:08PM -0700, Ankur Arora wrote:

On 2019-06-26 12:23 p.m., Thomas Gleixner wrote:

On Wed, 26 Jun 2019, Raslan, KarimAllah wrote:

On Wed, 2019-06-26 at 10:54 -0400, Konrad Rzeszutek Wilk wrote:

There were some ideas that Ankur (CC-ed) mentioned to me of using the perf
counters (in the host) to sample the guest and construct a better
accounting idea of what the guest does. That way the dashboard
from the host would not show 100% CPU utilization.


You can either use the UNHALTED cycles perf-counter or you can use MPERF/APERF
MSRs for that. (sorry I got distracted and forgot to send the patch)


Sure, but then you conflict with the other people who fight tooth and nail
over every single performance counter.

How about using Intel PT PwrEvt extensions? This should allow us to
precisely track idle residency via just MWAIT and TSC packets. Should
be pretty cheap too. It's post Cascade Lake though.


That would fully claim PT just for this stupid accounting thing and be
completely Intel specific.

Just stop this madness already.

I see the point about just accruing guest time (in mwait or not) as
guest CPU time.
But, to take this madness a little further, I'm not sure I see why it
fully claims PT. AFAICS, we should be able to enable PwrEvt and whatever
else simultaneously.

Ankur





[PATCH 1/2 v2] xen/acpi: Replace hard coded "ACPI0007"

2017-03-21 Thread Ankur Arora
Replace hard coded "ACPI0007" with ACPI_PROCESSOR_DEVICE_HID

Reviewed-by: Konrad Rzeszutek Wilk 
Signed-off-by: Ankur Arora 
---
 drivers/xen/xen-acpi-processor.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/xen/xen-acpi-processor.c b/drivers/xen/xen-acpi-processor.c
index 4ce10bc..fac0d7b 100644
--- a/drivers/xen/xen-acpi-processor.c
+++ b/drivers/xen/xen-acpi-processor.c
@@ -408,7 +408,7 @@ static int check_acpi_ids(struct acpi_processor *pr_backup)
acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT,
ACPI_UINT32_MAX,
read_acpi_id, NULL, NULL, NULL);
-   acpi_get_devices("ACPI0007", read_acpi_id, NULL, NULL);
+   acpi_get_devices(ACPI_PROCESSOR_DEVICE_HID, read_acpi_id, NULL, NULL);
 
 upload:
if (!bitmap_equal(acpi_id_present, acpi_ids_done, nr_acpi_bits)) {
-- 
2.7.4



[PATCH 0/2 v2] xen/acpi: upload PM state from init-domain to Xen

2017-03-21 Thread Ankur Arora
This patch series re-enables the upload of PM data from initial-domain
to Xen. This was broken in commit cd979883b9ede90643e019f33cb317933eb867b4.

The upload now happens post-resume in workqueue context. From the
POV of Xen, the PM upload might be delayed a little but should be
fine -- Xen falls-back on more limited P and C states.

Tested C-state upload via mwait_idle=0.

Changes in v2:
 - rebased to 4.11.0-rc2
 - addressed comments from Boris Ostrovsky

Ankur Arora (2):
  xen/acpi: Replace hard coded "ACPI0007"
  xen/acpi: upload PM state from init-domain to Xen

 drivers/xen/xen-acpi-processor.c | 36 +++-
 1 file changed, 27 insertions(+), 9 deletions(-)

-- 
2.7.4



[PATCH 2/2 v2] xen/acpi: upload PM state from init-domain to Xen

2017-03-21 Thread Ankur Arora
This was broken in commit cd979883b9ede90643e019f33cb317933eb867b4.
do_suspend (from xen/manage.c) and thus xen_resume_notifier never get
called on the initial-domain at resume (it is if running as guest.)

The rationale for the breaking change was that upload_pm_data()
potentially does blocking work in syscore_resume(). This patch
addresses the original issue by scheduling upload_pm_data() to
execute in workqueue context.

Changes in v2:
 - rebased to 4.11.0-rc2
 - addressed comments from Boris Ostrovsky

Cc: Stanislaw Gruszka 
Cc: sta...@vger.kernel.org
Based-on-patch-by: Konrad Wilk 
Reviewed-by: Konrad Rzeszutek Wilk 
Signed-off-by: Ankur Arora 
---
 drivers/xen/xen-acpi-processor.c | 34 ++
 1 file changed, 26 insertions(+), 8 deletions(-)

diff --git a/drivers/xen/xen-acpi-processor.c b/drivers/xen/xen-acpi-processor.c
index fac0d7b..23e391d 100644
--- a/drivers/xen/xen-acpi-processor.c
+++ b/drivers/xen/xen-acpi-processor.c
@@ -27,10 +27,10 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 
@@ -466,15 +466,33 @@ static int xen_upload_processor_pm_data(void)
return rc;
 }
 
-static int xen_acpi_processor_resume(struct notifier_block *nb,
-unsigned long action, void *data)
+static void xen_acpi_processor_resume_worker(struct work_struct *dummy)
 {
+   int rc;
+
bitmap_zero(acpi_ids_done, nr_acpi_bits);
-   return xen_upload_processor_pm_data();
+
+   rc = xen_upload_processor_pm_data();
+   if (rc != 0)
+   pr_info("ACPI data upload failed, error = %d\n", rc);
+}
+
+static void xen_acpi_processor_resume(void)
+{
+   static DECLARE_WORK(wq, xen_acpi_processor_resume_worker);
+
+   /*
+* xen_upload_processor_pm_data() calls non-atomic code.
+* However, the context for xen_acpi_processor_resume is syscore
+* with only the boot CPU online and in an atomic context.
+*
+* So defer the upload for some point safer.
+*/
+   schedule_work(&wq);
 }
 
-struct notifier_block xen_acpi_processor_resume_nb = {
-   .notifier_call = xen_acpi_processor_resume,
+static struct syscore_ops xap_syscore_ops = {
+   .resume = xen_acpi_processor_resume,
 };
 
 static int __init xen_acpi_processor_init(void)
@@ -527,7 +545,7 @@ static int __init xen_acpi_processor_init(void)
if (rc)
goto err_unregister;
 
-   xen_resume_notifier_register(&xen_acpi_processor_resume_nb);
+   register_syscore_ops(&xap_syscore_ops);
 
return 0;
 err_unregister:
@@ -544,7 +562,7 @@ static void __exit xen_acpi_processor_exit(void)
 {
int i;
 
-   xen_resume_notifier_unregister(&xen_acpi_processor_resume_nb);
+   unregister_syscore_ops(&xap_syscore_ops);
kfree(acpi_ids_done);
kfree(acpi_id_present);
kfree(acpi_id_cst_present);
-- 
2.7.4



Re: [PATCH 2/2 v2] xen/acpi: upload PM state from init-domain to Xen

2017-03-22 Thread Ankur Arora

On 2017-03-22 02:05 AM, Stanislaw Gruszka wrote:

On Tue, Mar 21, 2017 at 03:43:38PM -0700, Ankur Arora wrote:

This was broken in commit cd979883b9ede90643e019f33cb317933eb867b4.
do_suspend (from xen/manage.c) and thus xen_resume_notifier never get
called on the initial-domain at resume (it is if running as guest.)

The rationale for the breaking change was that upload_pm_data()
potentially does blocking work in syscore_resume(). This patch
addresses the original issue by scheduling upload_pm_data() to
execute in workqueue context.


Is it ok to do upload_pm_data() with a delay, i.e. after some other
resume actions are done and xen-acpi-processor is possibly in a
running state?

The state uploaded is ACPI P and C state from struct acpi_processor
which AFAICS is stable once inited so a delay would not lead to
invalid state.
The only concern would be the ACPI pCPU hotplug logic in
acpi_processor_add() which could add a new entry in
per_cpu(processors) but that also looks okay because either we
get a NULL or we get a pointer to an inited structure.

As for the hypervisor -- that falls back to more limited state after
resume (because some of this state is thrown away at suspend) and so
uses that until it gets the uploaded PM state from the initial-domain.

Ankur



Stanislaw



[PATCH 0/8] Use uncached writes while clearing gigantic pages

2020-10-14 Thread Ankur Arora
This series adds clear_page_nt(), a non-temporal MOV (MOVNTI) based
clear_page().

The immediate use case is to speedup creation of large (~2TB) guests
VMs. Memory for these guests is allocated via huge/gigantic pages which
are faulted in early.

The intent behind using non-temporal writes is to minimize allocation of
unnecessary cachelines. This helps in minimizing cache pollution, and
potentially also speeds up zeroing of large extents.

That said, uncached writes are not always great, as can be seen
in these 'perf bench mem memset' numbers comparing clear_page_erms() and
clear_page_nt():

Intel Broadwellx:
             x86-64-stosb (5 runs)       x86-64-movnt (5 runs)       speedup
             ---------------------       ---------------------       -------
   size      BW         (   pstdev)      BW         (   pstdev)
   16MB      17.35 GB/s ( +- 9.27%)      11.83 GB/s ( +- 0.19%)      -31.81%
  128MB       5.31 GB/s ( +- 0.13%)      11.72 GB/s ( +- 0.44%)     +121.84%

AMD Rome:
             x86-64-stosq (5 runs)       x86-64-movnt (5 runs)       speedup
             ---------------------       ---------------------       -------
   size      BW         (   pstdev)      BW         (   pstdev)
   16MB      15.39 GB/s ( +- 9.14%)      14.56 GB/s ( +-19.43%)       -5.39%
  128MB      11.04 GB/s ( +- 4.87%)      14.49 GB/s ( +-13.22%)      +31.25%

Intel Skylakex:
             x86-64-stosb (5 runs)       x86-64-movnt (5 runs)       speedup
             ---------------------       ---------------------       -------
   size      BW         (   pstdev)      BW         (   pstdev)
   16MB      20.38 GB/s ( +- 2.58%)       6.25 GB/s ( +- 0.41%)      -69.28%
  128MB       6.52 GB/s ( +- 0.14%)       6.31 GB/s ( +- 0.47%)       -3.22%

(All of the machines in these tests had a minimum of 25MB L3 cache per
socket.)

There are two performance issues:
 - uncached writes typically perform better only for region sizes
   around or larger than ~LLC-size.
 - MOVNTI does not always perform well on all microarchitectures.

We handle the first issue by only using clear_page_nt() for GB pages.

That leaves out page zeroing for 2MB pages, which is a size that's large
enough that avoiding cache pollution might be worthwhile, but at the same
time small enough (relative to the LLC) that uncached writes themselves
would end up being slower.

We can handle a subset of the 2MB case -- mmaps with MAP_POPULATE -- by
means of an uncached-or-cached hint decided based on a threshold size. This
would apply to maps backed by any page-size.
This case is not handled in this series -- I wanted to sanity check the
high level approach before attempting that.
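
Not part of this series, but to make the hint concrete, something along
these lines could be computed once per MAP_POPULATE mmap() and passed down
to the clearing code (the helper name and threshold are made up here):

    /*
     * Prefer uncached clearing only when the populated extent is at least
     * LLC-sized; below that the cached variant tends to win.
     */
    static bool prefer_uncached_clear(unsigned long nr_pages)
    {
            unsigned long llc_bytes = boot_cpu_data.x86_cache_size * 1024UL;

            return (nr_pages << PAGE_SHIFT) >= llc_bytes;
    }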

Handle the second issue by adding a synthetic cpu-feature,
X86_FEATURE_NT_GOOD which is only enabled for architectures where MOVNTI
performs well.
(Relatedly, I thought I had independently decided to use ALTERNATIVES
to deal with this, but more likely I had just internalized it from this 
discussion:
https://lore.kernel.org/linux-mm/20200316101856.gh11...@dhcp22.suse.cz/#t)

Accordingly this series enables X86_FEATURE_NT_GOOD for Intel Broadwellx
and AMD Rome. (In my testing, the performance was also good for some
pre-production models but this series leaves them out.)
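
The enabling hunks themselves are small; a sketch of the Broadwellx side
(the real change is in patch 7/8, which this archive truncates further
down; the function wrapper here is only illustrative, the real hunk sits
in init_intel()):

    static void intel_set_nt_good(struct cpuinfo_x86 *c)
    {
            if (c->x86 == 6 && c->x86_model == INTEL_FAM6_BROADWELL_X)
                    set_cpu_cap(c, X86_FEATURE_NT_GOOD);
    }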

Please review.

Thanks
Ankur

Ankur Arora (8):
  x86/cpuid: add X86_FEATURE_NT_GOOD
  x86/asm: add memset_movnti()
  perf bench: add memset_movnti()
  x86/asm: add clear_page_nt()
  x86/clear_page: add clear_page_uncached()
  mm, clear_huge_page: use clear_page_uncached() for gigantic pages
  x86/cpu/intel: enable X86_FEATURE_NT_GOOD on Intel Broadwellx
  x86/cpu/amd: enable X86_FEATURE_NT_GOOD on AMD Zen

 arch/x86/include/asm/cpufeatures.h   |  1 +
 arch/x86/include/asm/page.h  |  6 +++
 arch/x86/include/asm/page_32.h   |  9 
 arch/x86/include/asm/page_64.h   | 15 ++
 arch/x86/kernel/cpu/amd.c|  3 ++
 arch/x86/kernel/cpu/intel.c  |  2 +
 arch/x86/lib/clear_page_64.S | 26 +++
 arch/x86/lib/memset_64.S | 68 
 include/asm-generic/page.h   |  3 ++
 include/linux/highmem.h  | 10 
 mm/memory.c  |  3 +-
 tools/arch/x86/lib/memset_64.S   | 68 
 tools/perf/bench/mem-memset-x86-64-asm-def.h |  6 ++-
 13 files changed, 158 insertions(+), 62 deletions(-)

-- 
2.9.3



[PATCH 2/8] x86/asm: add memset_movnti()

2020-10-14 Thread Ankur Arora
Add a MOVNTI based implementation of memset().

memset_orig() and memset_movnti() only differ in the opcode used in the
inner loop, so move the memset_orig() logic into a macro, which gets
expanded into memset_movq() and memset_movnti().

Signed-off-by: Ankur Arora 
---
 arch/x86/lib/memset_64.S | 68 +++-
 1 file changed, 38 insertions(+), 30 deletions(-)

diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index 9ff15ee404a4..79703cc04b6a 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -27,7 +27,7 @@ SYM_FUNC_START(__memset)
 *
 * Otherwise, use original memset function.
 */
-   ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \
+   ALTERNATIVE_2 "jmp memset_movq", "", X86_FEATURE_REP_GOOD, \
  "jmp memset_erms", X86_FEATURE_ERMS
 
movq %rdi,%r9
@@ -68,7 +68,8 @@ SYM_FUNC_START_LOCAL(memset_erms)
ret
 SYM_FUNC_END(memset_erms)
 
-SYM_FUNC_START_LOCAL(memset_orig)
+.macro MEMSET_MOV OP fence
+SYM_FUNC_START_LOCAL(memset_\OP)
movq %rdi,%r10
 
/* expand byte value  */
@@ -79,64 +80,71 @@ SYM_FUNC_START_LOCAL(memset_orig)
/* align dst */
movl  %edi,%r9d
andl  $7,%r9d
-   jnz  .Lbad_alignment
-.Lafter_bad_alignment:
+   jnz  .Lbad_alignment_\@
+.Lafter_bad_alignment_\@:
 
movq  %rdx,%rcx
shrq  $6,%rcx
-   jz   .Lhandle_tail
+   jz   .Lhandle_tail_\@
 
.p2align 4
-.Lloop_64:
+.Lloop_64_\@:
decq  %rcx
-   movq  %rax,(%rdi)
-   movq  %rax,8(%rdi)
-   movq  %rax,16(%rdi)
-   movq  %rax,24(%rdi)
-   movq  %rax,32(%rdi)
-   movq  %rax,40(%rdi)
-   movq  %rax,48(%rdi)
-   movq  %rax,56(%rdi)
+   \OP  %rax,(%rdi)
+   \OP  %rax,8(%rdi)
+   \OP  %rax,16(%rdi)
+   \OP  %rax,24(%rdi)
+   \OP  %rax,32(%rdi)
+   \OP  %rax,40(%rdi)
+   \OP  %rax,48(%rdi)
+   \OP  %rax,56(%rdi)
leaq  64(%rdi),%rdi
-   jnz.Lloop_64
+   jnz.Lloop_64_\@
 
/* Handle tail in loops. The loops should be faster than hard
   to predict jump tables. */
.p2align 4
-.Lhandle_tail:
+.Lhandle_tail_\@:
movl%edx,%ecx
andl$63&(~7),%ecx
-   jz  .Lhandle_7
+   jz  .Lhandle_7_\@
shrl$3,%ecx
.p2align 4
-.Lloop_8:
+.Lloop_8_\@:
decl   %ecx
-   movq  %rax,(%rdi)
+   \OP  %rax,(%rdi)
leaq  8(%rdi),%rdi
-   jnz.Lloop_8
+   jnz.Lloop_8_\@
 
-.Lhandle_7:
+.Lhandle_7_\@:
andl$7,%edx
-   jz  .Lende
+   jz  .Lende_\@
.p2align 4
-.Lloop_1:
+.Lloop_1_\@:
decl%edx
movb%al,(%rdi)
leaq1(%rdi),%rdi
-   jnz .Lloop_1
+   jnz .Lloop_1_\@
 
-.Lende:
+.Lende_\@:
+   .if \fence
+   sfence
+   .endif
movq%r10,%rax
ret
 
-.Lbad_alignment:
+.Lbad_alignment_\@:
cmpq $7,%rdx
-   jbe .Lhandle_7
+   jbe .Lhandle_7_\@
movq %rax,(%rdi)/* unaligned store */
movq $8,%r8
subq %r9,%r8
addq %r8,%rdi
subq %r8,%rdx
-   jmp .Lafter_bad_alignment
-.Lfinal:
-SYM_FUNC_END(memset_orig)
+   jmp .Lafter_bad_alignment_\@
+.Lfinal_\@:
+SYM_FUNC_END(memset_\OP)
+.endm
+
+MEMSET_MOV OP=movq fence=0
+MEMSET_MOV OP=movnti fence=1
-- 
2.9.3



[PATCH 5/8] x86/clear_page: add clear_page_uncached()

2020-10-14 Thread Ankur Arora
Define clear_page_uncached() as an alternative_call() to clear_page_nt()
if the CPU sets X86_FEATURE_NT_GOOD and fallback to clear_page() if it
doesn't.

Similarly define clear_page_uncached_flush() which provides an SFENCE
if the CPU sets X86_FEATURE_NT_GOOD.

Also, add the glue interface clear_user_highpage_uncached().
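
For context, a sketch of the intended caller (patch 6/8 wires this into the
gigantic-page clearing path; the helper name here is illustrative and the
real code walks the mem_map rather than doing simple pointer arithmetic):

    static void clear_gigantic_page_uncached(struct page *page,
                                             unsigned long addr,
                                             unsigned int pages_per_huge_page)
    {
            unsigned int i;

            for (i = 0; i < pages_per_huge_page; i++) {
                    cond_resched();
                    clear_user_highpage_uncached(page + i,
                                                 addr + i * PAGE_SIZE);
            }

            /* Order the uncached stores before the page becomes visible. */
            clear_page_uncached_flush();
    }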

Signed-off-by: Ankur Arora 
---
 arch/x86/include/asm/page.h|  6 ++
 arch/x86/include/asm/page_32.h |  9 +
 arch/x86/include/asm/page_64.h | 14 ++
 include/asm-generic/page.h |  3 +++
 include/linux/highmem.h| 10 ++
 5 files changed, 42 insertions(+)

diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
index 7555b48803a8..ca0aa379ac7f 100644
--- a/arch/x86/include/asm/page.h
+++ b/arch/x86/include/asm/page.h
@@ -28,6 +28,12 @@ static inline void clear_user_page(void *page, unsigned long vaddr,
clear_page(page);
 }
 
+static inline void clear_user_page_uncached(void *page, unsigned long vaddr,
+   struct page *pg)
+{
+   clear_page_uncached(page);
+}
+
 static inline void copy_user_page(void *to, void *from, unsigned long vaddr,
  struct page *topage)
 {
diff --git a/arch/x86/include/asm/page_32.h b/arch/x86/include/asm/page_32.h
index 94dbd51df58f..7a03a274a9a4 100644
--- a/arch/x86/include/asm/page_32.h
+++ b/arch/x86/include/asm/page_32.h
@@ -39,6 +39,15 @@ static inline void clear_page(void *page)
memset(page, 0, PAGE_SIZE);
 }
 
+static inline void clear_page_uncached(void *page)
+{
+   clear_page(page);
+}
+
+static inline void clear_page_uncached_flush(void)
+{
+}
+
 static inline void copy_page(void *to, void *from)
 {
memcpy(to, from, PAGE_SIZE);
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index bde3c2785ec4..5897075e77dd 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -55,6 +55,20 @@ static inline void clear_page(void *page)
   : "cc", "memory", "rax", "rcx");
 }
 
+static inline void clear_page_uncached(void *page)
+{
+   alternative_call(clear_page,
+clear_page_nt, X86_FEATURE_NT_GOOD,
+"=D" (page),
+"0" (page)
+: "cc", "memory", "rax", "rcx");
+}
+
+static inline void clear_page_uncached_flush(void)
+{
+   alternative("", "sfence", X86_FEATURE_NT_GOOD);
+}
+
 void copy_page(void *to, void *from);
 
 #endif /* !__ASSEMBLY__ */
diff --git a/include/asm-generic/page.h b/include/asm-generic/page.h
index fe801f01625e..60235a0cf24a 100644
--- a/include/asm-generic/page.h
+++ b/include/asm-generic/page.h
@@ -26,6 +26,9 @@
 #ifndef __ASSEMBLY__
 
 #define clear_page(page)   memset((page), 0, PAGE_SIZE)
+#define clear_page_uncached(page)  clear_page(page)
+#define clear_page_uncached_flush()do { } while (0)
+
 #define copy_page(to,from) memcpy((to), (from), PAGE_SIZE)
 
 #define clear_user_page(page, vaddr, pg)   clear_page(page)
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 14e6202ce47f..f842593e2474 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -232,6 +232,16 @@ static inline void clear_user_highpage(struct page *page, unsigned long vaddr)
 }
 #endif
 
+#ifndef clear_user_highpage_uncached
+static inline void clear_user_highpage_uncached(struct page *page, unsigned long vaddr)
+{
+   void *addr = kmap_atomic(page);
+
+   clear_user_page_uncached(addr, vaddr, page);
+   kunmap_atomic(addr);
+}
+#endif
+
 #ifndef __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
 /**
 * __alloc_zeroed_user_highpage - Allocate a zeroed HIGHMEM page for a VMA with caller-specified movable GFP flags
-- 
2.9.3



[PATCH 3/8] perf bench: add memset_movnti()

2020-10-14 Thread Ankur Arora
Clone memset_movnti() from arch/x86/lib/memset_64.S.

perf bench mem memset on -f x86-64-movnt on Intel Broadwellx, Skylakex
and AMD Rome:

Intel Broadwellx:
$ for i in 2 8 32 128 512; do
perf bench mem memset -f x86-64-movnt -s ${i}MB
  done

  # Output pruned.
  # Running 'mem/memset' benchmark:
  # function 'x86-64-movnt' (movnt-based memset() in arch/x86/lib/memset_64.S)
  # Copying 2MB bytes ...
11.837121 GB/sec
  # Copying 8MB bytes ...
11.783560 GB/sec
  # Copying 32MB bytes ...
11.868591 GB/sec
  # Copying 128MB bytes ...
11.865211 GB/sec
  # Copying 512MB bytes ...
11.864085 GB/sec

Intel Skylakex:
$ for i in 2 8 32 128 512; do
perf bench mem memset -f x86-64-movnt -s ${i}MB
  done
  # Running 'mem/memset' benchmark:
  # function 'x86-64-movnt' (movnt-based memset() in arch/x86/lib/memset_64.S)
  # Copying 2MB bytes ...
 6.361971 GB/sec
  # Copying 8MB bytes ...
 6.300403 GB/sec
  # Copying 32MB bytes ...
 6.288992 GB/sec
  # Copying 128MB bytes ...
 6.328793 GB/sec
  # Copying 512MB bytes ...
 6.324471 GB/sec

AMD Rome:
$ for i in 2 8 32 128 512; do
perf bench mem memset -f x86-64-movnt -s ${i}MB
  done
  # Running 'mem/memset' benchmark:
  # function 'x86-64-movnt' (movnt-based memset() in arch/x86/lib/memset_64.S)
  # Copying 2MB bytes ...
10.993199 GB/sec
  # Copying 8MB bytes ...
14.221784 GB/sec
  # Copying 32MB bytes ...
14.293337 GB/sec
  # Copying 128MB bytes ...
15.238947 GB/sec
  # Copying 512MB bytes ...
16.476093 GB/sec

Signed-off-by: Ankur Arora 
---
 tools/arch/x86/lib/memset_64.S   | 68 
 tools/perf/bench/mem-memset-x86-64-asm-def.h |  6 ++-
 2 files changed, 43 insertions(+), 31 deletions(-)

diff --git a/tools/arch/x86/lib/memset_64.S b/tools/arch/x86/lib/memset_64.S
index fd5d25a474b7..bfbf6d06f81e 100644
--- a/tools/arch/x86/lib/memset_64.S
+++ b/tools/arch/x86/lib/memset_64.S
@@ -26,7 +26,7 @@ SYM_FUNC_START(__memset)
 *
 * Otherwise, use original memset function.
 */
-   ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \
+   ALTERNATIVE_2 "jmp memset_movq", "", X86_FEATURE_REP_GOOD, \
  "jmp memset_erms", X86_FEATURE_ERMS
 
movq %rdi,%r9
@@ -65,7 +65,8 @@ SYM_FUNC_START(memset_erms)
ret
 SYM_FUNC_END(memset_erms)
 
-SYM_FUNC_START(memset_orig)
+.macro MEMSET_MOV OP fence
+SYM_FUNC_START(memset_\OP)
movq %rdi,%r10
 
/* expand byte value  */
@@ -76,64 +77,71 @@ SYM_FUNC_START(memset_orig)
/* align dst */
movl  %edi,%r9d
andl  $7,%r9d
-   jnz  .Lbad_alignment
-.Lafter_bad_alignment:
+   jnz  .Lbad_alignment_\@
+.Lafter_bad_alignment_\@:
 
movq  %rdx,%rcx
shrq  $6,%rcx
-   jz   .Lhandle_tail
+   jz   .Lhandle_tail_\@
 
.p2align 4
-.Lloop_64:
+.Lloop_64_\@:
decq  %rcx
-   movq  %rax,(%rdi)
-   movq  %rax,8(%rdi)
-   movq  %rax,16(%rdi)
-   movq  %rax,24(%rdi)
-   movq  %rax,32(%rdi)
-   movq  %rax,40(%rdi)
-   movq  %rax,48(%rdi)
-   movq  %rax,56(%rdi)
+   \OP  %rax,(%rdi)
+   \OP  %rax,8(%rdi)
+   \OP  %rax,16(%rdi)
+   \OP  %rax,24(%rdi)
+   \OP  %rax,32(%rdi)
+   \OP  %rax,40(%rdi)
+   \OP  %rax,48(%rdi)
+   \OP  %rax,56(%rdi)
leaq  64(%rdi),%rdi
-   jnz.Lloop_64
+   jnz.Lloop_64_\@
 
/* Handle tail in loops. The loops should be faster than hard
   to predict jump tables. */
.p2align 4
-.Lhandle_tail:
+.Lhandle_tail_\@:
	movl    %edx,%ecx
	andl    $63&(~7),%ecx
-   jz  .Lhandle_7
+   jz  .Lhandle_7_\@
	shrl    $3,%ecx
.p2align 4
-.Lloop_8:
+.Lloop_8_\@:
decl   %ecx
-   movq  %rax,(%rdi)
+   \OP  %rax,(%rdi)
leaq  8(%rdi),%rdi
-   jnz.Lloop_8
+   jnz.Lloop_8_\@
 
-.Lhandle_7:
+.Lhandle_7_\@:
	andl    $7,%edx
-   jz  .Lende
+   jz  .Lende_\@
	.p2align 4
-.Lloop_1:
+.Lloop_1_\@:
	decl    %edx
	movb    %al,(%rdi)
	leaq    1(%rdi),%rdi
-   jnz .Lloop_1
+   jnz .Lloop_1_\@
 
-.Lende:
+.Lende_\@:
+   .if \fence
+   sfence
+   .endif
	movq    %r10,%rax
ret
 
-.Lbad_alignment:
+.Lbad_alignment_\@:
cmpq $7,%rdx
-   jbe .Lhandle_7
+   jbe .Lhandle_7_\@
movq %rax,(%rdi)/* unaligned store */
movq $8,%r8
subq %r9,%r8
addq %r8,%rdi
subq %r8,%rdx
-   jmp .Lafter_bad_alignment
-.Lfinal:
-SYM_FUNC_END(memset_orig)
+   jmp .Lafter_bad_alignment_\@
+.Lfinal_\@:
+SYM_FUNC_END(memset_\OP)
+.endm
+
+MEMSET_MOV OP=movq fence=0
+MEMSET_MOV OP=movnti fence=1

[PATCH 4/8] x86/asm: add clear_page_nt()

2020-10-14 Thread Ankur Arora
  ( +-  1.06% )  (23.52%)
         1,966  LLC-load-misses   #    1.97% of all LL-cache accesses  ( +-  6.17% )  (23.52%)
           129  LLC-stores        #    0.038 K/sec    ( +- 27.85% )  (11.75%)
             7  LLC-store-misses  #    0.002 K/sec    ( +- 47.82% )  (11.75%)

   3.37465 +- 0.00474 seconds time elapsed  ( +-  0.14% )

The L1-dcache-load-misses (L1D.REPLACEMENT) count is much lower just
like the previous two cases. No performance improvement for Skylakex
though.

Signed-off-by: Ankur Arora 
---
 arch/x86/include/asm/page_64.h |  1 +
 arch/x86/lib/clear_page_64.S   | 26 ++
 2 files changed, 27 insertions(+)

diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index 939b1cff4a7b..bde3c2785ec4 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -43,6 +43,7 @@ extern unsigned long __phys_addr_symbol(unsigned long);
 void clear_page_orig(void *page);
 void clear_page_rep(void *page);
 void clear_page_erms(void *page);
+void clear_page_nt(void *page);
 
 static inline void clear_page(void *page)
 {
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index c4c7dd115953..f16bb753b236 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -50,3 +50,29 @@ SYM_FUNC_START(clear_page_erms)
ret
 SYM_FUNC_END(clear_page_erms)
 EXPORT_SYMBOL_GPL(clear_page_erms)
+
+/*
+ * Zero a page.
+ * %rdi - page
+ *
+ * Caller needs to issue a fence at the end.
+ */
+SYM_FUNC_START(clear_page_nt)
+	xorl    %eax,%eax
+	movl    $4096,%ecx
+
+	.p2align 4
+.Lstart:
+	movnti  %rax, 0x00(%rdi)
+	movnti  %rax, 0x08(%rdi)
+	movnti  %rax, 0x10(%rdi)
+	movnti  %rax, 0x18(%rdi)
+	movnti  %rax, 0x20(%rdi)
+	movnti  %rax, 0x28(%rdi)
+	movnti  %rax, 0x30(%rdi)
+	movnti  %rax, 0x38(%rdi)
+	addq    $0x40, %rdi
+	subl    $0x40, %ecx
+	ja      .Lstart
+	ret
+SYM_FUNC_END(clear_page_nt)
-- 
2.9.3



[PATCH 7/8] x86/cpu/intel: enable X86_FEATURE_NT_GOOD on Intel Broadwellx

2020-10-14 Thread Ankur Arora
        468,643  bus-cycles                #   99.276 M/sec    ( +-  0.01% )  (23.53%)
        717,228  L1-dcache-load-misses     #    0.20% of all L1-dcache accesses  ( +-  3.77% )  (23.53%)
    351,999,535  L1-dcache-loads           #   32.403 M/sec    ( +-  0.04% )  (23.53%)
         75,988  LLC-loads                 #    0.007 M/sec    ( +-  4.20% )  (23.53%)
         24,503  LLC-load-misses           #   32.25% of all LL-cache accesses  ( +- 53.30% )  (23.53%)
         57,283  LLC-stores                #    0.005 M/sec    ( +-  2.15% )  (11.76%)
         19,738  LLC-store-misses          #    0.002 M/sec    ( +- 46.55% )  (11.76%)
    351,836,498  dTLB-loads                #   32.388 M/sec    ( +-  0.04% )  (17.65%)
          1,171  dTLB-load-misses          #    0.00% of all dTLB cache accesses  ( +- 42.68% )  (23.53%)
 17,385,579,725  dTLB-stores               # 1600.392 M/sec    ( +-  0.00% )  (23.53%)
            200  dTLB-store-misses         #    0.018 K/sec    ( +- 10.63% )  (23.53%)

 10.863678 +- 0.000804 seconds time elapsed  ( +-  0.01% )

L1-dcache-load-misses (L1D.REPLACEMENT) is substantially lower which
suggests that, as expected, we aren't doing write-allocate or RFO.

Note that the IPC and instruction counts etc are quite different, but
that's just an artifact of switching from a single 'REP; STOSB' per
PAGE_SIZE region to a MOVNTI loop.

The page-clearing BW is substantially higher (~100% or more), so enable
X86_FEATURE_NT_GOOD for Intel Broadwellx.

Signed-off-by: Ankur Arora 
---
 arch/x86/kernel/cpu/intel.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 59a1e3ce3f14..161028c1dee0 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -662,6 +662,8 @@ static void init_intel(struct cpuinfo_x86 *c)
c->x86_cache_alignment = c->x86_clflush_size * 2;
if (c->x86 == 6)
set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+   if (c->x86 == 6 && c->x86_model == INTEL_FAM6_BROADWELL_X)
+   set_cpu_cap(c, X86_FEATURE_NT_GOOD);
 #else
/*
 * Names for the Pentium II/Celeron processors
-- 
2.9.3



[PATCH 1/8] x86/cpuid: add X86_FEATURE_NT_GOOD

2020-10-14 Thread Ankur Arora
Enabled on microarchitectures with performant non-temporal MOV (MOVNTI)
instruction.

Signed-off-by: Ankur Arora 
---
 arch/x86/include/asm/cpufeatures.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 7b0afd5e6c57..8bae38240346 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -289,6 +289,7 @@
 #define X86_FEATURE_FENCE_SWAPGS_KERNEL	(11*32+ 5) /* "" LFENCE in kernel entry SWAPGS path */
 #define X86_FEATURE_SPLIT_LOCK_DETECT	(11*32+ 6) /* #AC for split lock */
 #define X86_FEATURE_PER_THREAD_MBA	(11*32+ 7) /* "" Per-thread Memory Bandwidth Allocation */
+#define X86_FEATURE_NT_GOOD		(11*32+ 8) /* Non-temporal instructions perform well */
 
 /* Intel-defined CPU features, CPUID level 0x0007:1 (EAX), word 12 */
 #define X86_FEATURE_AVX512_BF16		(12*32+ 5) /* AVX512 BFLOAT16 instructions */
-- 
2.9.3



[PATCH 8/8] x86/cpu/amd: enable X86_FEATURE_NT_GOOD on AMD Zen

2020-10-14 Thread Ankur Arora
substantially lower which suggests we aren't doing write-allocate or
RFO. The L1-dcache-prefetches are also substantially lower.

Note that the IPC and instruction counts etc are quite different, but
that's just an artifact of switching from a single 'REP; STOSQ' per
PAGE_SIZE region to a MOVNTI loop.

The page-clearing BW shows a ~40% improvement. Additionally, a quick
'perf bench memset' comparison on AMD Naples (AMD EPYC 7551) shows
similar performance gains. So, enable X86_FEATURE_NT_GOOD for
AMD Zen.

Signed-off-by: Ankur Arora 
---
 arch/x86/kernel/cpu/amd.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index dcc3d943c68f..c57eb6c28aa1 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -918,6 +918,9 @@ static void init_amd_zn(struct cpuinfo_x86 *c)
 {
set_cpu_cap(c, X86_FEATURE_ZEN);
 
+   if (c->x86 == 0x17)
+   set_cpu_cap(c, X86_FEATURE_NT_GOOD);
+
 #ifdef CONFIG_NUMA
node_reclaim_distance = 32;
 #endif
-- 
2.9.3



[PATCH 6/8] mm, clear_huge_page: use clear_page_uncached() for gigantic pages

2020-10-14 Thread Ankur Arora
Uncached writes are suitable for circumstances where the region written to
is not expected to be read again soon, or the region written to is large
enough that there's no expectation that we will find the writes in the
cache.

Accordingly switch to using clear_page_uncached() for gigantic pages.

Signed-off-by: Ankur Arora 
---
 mm/memory.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mm/memory.c b/mm/memory.c
index eeae590e526a..4d2c58f83ab1 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -5092,7 +5092,7 @@ static void clear_gigantic_page(struct page *page,
for (i = 0; i < pages_per_huge_page;
 i++, p = mem_map_next(p, page, i)) {
cond_resched();
-   clear_user_highpage(p, addr + i * PAGE_SIZE);
+   clear_user_highpage_uncached(p, addr + i * PAGE_SIZE);
}
 }
 
@@ -5111,6 +5111,7 @@ void clear_huge_page(struct page *page,
 
if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
clear_gigantic_page(page, addr, pages_per_huge_page);
+   clear_page_uncached_flush();
return;
}
 
-- 
2.9.3



Re: [PATCH 6/8] mm, clear_huge_page: use clear_page_uncached() for gigantic pages

2020-10-14 Thread Ankur Arora

On 2020-10-14 8:28 a.m., Ingo Molnar wrote:


* Ankur Arora  wrote:


Uncached writes are suitable for circumstances where the region written to
is not expected to be read again soon, or the region written to is large
enough that there's no expectation that we will find the writes in the
cache.

Accordingly switch to using clear_page_uncached() for gigantic pages.

Signed-off-by: Ankur Arora 
---
  mm/memory.c | 3 ++-
  1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mm/memory.c b/mm/memory.c
index eeae590e526a..4d2c58f83ab1 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -5092,7 +5092,7 @@ static void clear_gigantic_page(struct page *page,
for (i = 0; i < pages_per_huge_page;
 i++, p = mem_map_next(p, page, i)) {
cond_resched();
-   clear_user_highpage(p, addr + i * PAGE_SIZE);
+   clear_user_highpage_uncached(p, addr + i * PAGE_SIZE);
}
  }


So this does the clearing in 4K chunks, and your measurements suggest that
short memory clearing is not as efficient, right?

I did not measure that separately (though I should), but the performance numbers
around that were somewhat puzzling.

For MOVNTI, the performance via perf bench (single call to memset_movnti())
is pretty close (within margin of error) to what we see with the page-fault
workload (4K chunks in clear_page_nt().)

With 'REP;STOS' though, there's a degradation (~30% Broadwell, ~5% Rome) going
from perf bench (a single call to memset_erms()) to the page-fault workload
(4K chunks in clear_page_erms()).

In the second case we execute many more 'REP;STOS' invocations, even though
the total instruction count is pretty much the same as in the first, so maybe
that's what accounts for it. But I checked and we are not frontend bound.

Maybe there are high setup costs for 'REP;STOS' on Broadwell? It does advertise
X86_FEATURE_ERMS though...



I'm wondering whether it would make sense to do 2MB chunked clearing on
64-bit CPUs, instead of 512x 4k clearing? Both 2MB and GB pages are
continuous in memory, so accessible to these instructions in a single
narrow loop.

Yeah, I think that makes sense to do and should be quite straightforward
as well. I'll try it out. I suspect it might help the X86_FEATURE_NT_BAD
models more, but there's no reason for it to hurt anywhere.
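
Roughly what I have in mind -- a minimal sketch, assuming a
clear_pages_uncached() helper that takes a page count (no such helper exists
yet) and contiguous struct pages:

	/*
	 * Sketch only: clear a gigantic page in 2MB chunks instead of 512
	 * individual 4K clears. clear_pages_uncached() is a hypothetical
	 * helper that clears @npages contiguous pages with non-temporal
	 * stores; assumes contiguous struct pages (e.g. SPARSEMEM_VMEMMAP).
	 */
	#define CLEAR_CHUNK_PAGES	(PMD_SIZE >> PAGE_SHIFT)	/* 512 */

	void clear_pages_uncached(struct page *page, unsigned int npages);

	static void clear_gigantic_page_chunked(struct page *page,
						unsigned int pages_per_huge_page)
	{
		unsigned int i;

		for (i = 0; i < pages_per_huge_page; i += CLEAR_CHUNK_PAGES) {
			cond_resched();
			clear_pages_uncached(page + i, CLEAR_CHUNK_PAGES);
		}
		/* still a single fence for the whole extent, as before */
		clear_page_uncached_flush();
	}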


Ankur



Thanks,




Ingo



Re: [PATCH 7/8] x86/cpu/intel: enable X86_FEATURE_NT_GOOD on Intel Broadwellx

2020-10-14 Thread Ankur Arora

On 2020-10-14 8:31 a.m., Ingo Molnar wrote:


* Ankur Arora  wrote:


System:   Oracle X6-2
CPU:  2 nodes * 10 cores/node * 2 threads/core
  Intel Xeon E5-2630 v4 (Broadwellx, 6:79:1)
Memory:   256 GB evenly split between nodes
Microcode:0xb2e
scaling_governor: performance
L3 size:  25MB
intel_pstate/no_turbo: 1

Performance comparison of 'perf bench mem memset -l 1' for x86-64-stosb
(X86_FEATURE_ERMS) and x86-64-movnt (X86_FEATURE_NT_GOOD):

             x86-64-stosb (5 runs)      x86-64-movnt (5 runs)      speedup
             ---------------------      ---------------------      -------
   size      BW         (   pstdev)     BW         (   pstdev)

   16MB      17.35 GB/s ( +- 9.27%)     11.83 GB/s ( +- 0.19%)     -31.81%
  128MB       5.31 GB/s ( +- 0.13%)     11.72 GB/s ( +- 0.44%)    +121.84%
 1024MB       5.42 GB/s ( +- 0.13%)     11.78 GB/s ( +- 0.03%)    +117.34%
 4096MB       5.41 GB/s ( +- 0.41%)     11.76 GB/s ( +- 0.07%)    +117.37%



+   if (c->x86 == 6 && c->x86_model == INTEL_FAM6_BROADWELL_X)
+   set_cpu_cap(c, X86_FEATURE_NT_GOOD);


So while I agree with how you've done careful measurements to isolate bad
microarchitectures where non-temporal stores are slow, I do think this
approach of opt-in doesn't scale and is hard to maintain.

Instead I'd suggest enabling this by default everywhere, and creating a
X86_FEATURE_NT_BAD quirk table for the bad microarchitectures.

Okay, some kind of quirk table is a great idea. Also means that there's a
single place for keeping this rather than it being scattered all over in
the code.
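
Something like the below is what I'm imagining -- just a sketch, and the
table contents are purely illustrative:

	/* Sketch: models where MOVNTI is known to underperform (illustrative). */
	static const struct x86_cpu_id nt_bad_cpus[] = {
		X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, NULL),
		/* ... */
		{}
	};

	static void init_nt_features(struct cpuinfo_x86 *c)
	{
		/* default-enable, quirk off the known-bad microarchitectures */
		if (!x86_match_cpu(nt_bad_cpus))
			set_cpu_cap(c, X86_FEATURE_NT_GOOD);
	}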

That also simplifies my handling of features like X86_FEATURE_CLZERO.
I was concerned that if you squint a bit, it seems to be an alias to
X86_FEATURE_NT_GOOD and that seemed ugly.



This means that with new microarchitectures we'd get automatic enablement,
and hopefully chip testing would identify cases where performance isn't as
good.

Makes sense to me. A first class citizen, as it were...

Thanks for reviewing btw.

Ankur



I.e. the 'trust but verify' method.





Thanks,

Ingo



Re: [PATCH 5/8] x86/clear_page: add clear_page_uncached()

2020-10-14 Thread Ankur Arora

On 2020-10-14 8:45 a.m., Andy Lutomirski wrote:

On Wed, Oct 14, 2020 at 1:33 AM Ankur Arora  wrote:


Define clear_page_uncached() as an alternative_call() to clear_page_nt()
if the CPU sets X86_FEATURE_NT_GOOD and fallback to clear_page() if it
doesn't.

Similarly define clear_page_uncached_flush() which provides an SFENCE
if the CPU sets X86_FEATURE_NT_GOOD.


As long as you keep "NT" or "MOVNTI" in the names and keep functions
in arch/x86, I think it's reasonable to expect that callers understand
that MOVNTI has bizarre memory ordering rules.  But once you give
something a generic name like "clear_page_uncached" and stick it in
generic code, I think the semantics should be more obvious.

How about:

clear_page_uncached_unordered() or clear_page_uncached_incoherent()

and

flush_after_clear_page_uncached()

After all, a naive reader might expect "uncached" to imply "caches are
off and this is coherent with everything".  And the results of getting
this wrong will be subtle and possibly hard-to-reproduce corruption.

Yeah, these are a lot more obvious. Thanks. Will fix.
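
For reference, the caller pattern I'm intending with the proposed names
(sketch only; the highpage wrapper name below is made up):

	/*
	 * Sketch: the uncached clear is weakly ordered, so a single fence is
	 * issued after the whole extent rather than one sfence per page.
	 */
	static void clear_extent_uncached(struct page *page, unsigned long addr,
					  unsigned int npages)
	{
		unsigned int i;

		for (i = 0; i < npages; i++) {
			cond_resched();
			clear_user_highpage_uncached_incoherent(page + i,
								addr + i * PAGE_SIZE);
		}
		flush_after_clear_page_uncached();	/* sfence on x86 if MOVNTI was used */
	}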

Ankur



--Andy



Re: [PATCH 4/8] x86/asm: add clear_page_nt()

2020-10-14 Thread Ankur Arora

On 2020-10-14 12:56 p.m., Borislav Petkov wrote:

On Wed, Oct 14, 2020 at 01:32:55AM -0700, Ankur Arora wrote:

This can potentially improve page-clearing bandwidth (see below for
performance numbers for two microarchitectures where it helps and one
where it doesn't) and can help indirectly by consuming less cache
resources.

Any performance benefits are expected for extents larger than LLC-sized
or more -- when we are DRAM-BW constrained rather than cache-BW
constrained.


"potentially", "expected", I don't like those formulations.

That's fair. The reason for those weasel words is mostly that this is
microarchitecture specific.
For example, on Intel, where I did compare across generations, I see good
performance on Broadwellx, not good on Skylakex, and then good again on
some pre-production CPUs.


Do you have
some actual benchmark data where this shows any improvement and not
microbenchmarks only, to warrant the additional complexity?

Yes, guest creation under QEMU (pinned guests) shows similar improvements.
I've posted performance numbers in patches 7, 8 with a simple page-fault
test derived from that.

I can add numbers from QEMU as well.

Thanks,
Ankur





Re: [PATCH 5/8] x86/clear_page: add clear_page_uncached()

2020-10-14 Thread Ankur Arora

On 2020-10-14 2:07 p.m., Andy Lutomirski wrote:





On Oct 14, 2020, at 12:58 PM, Borislav Petkov  wrote:

On Wed, Oct 14, 2020 at 08:45:37AM -0700, Andy Lutomirski wrote:

On Wed, Oct 14, 2020 at 1:33 AM Ankur Arora  wrote:

Define clear_page_uncached() as an alternative_call() to clear_page_nt()
if the CPU sets X86_FEATURE_NT_GOOD and fallback to clear_page() if it
doesn't.

Similarly define clear_page_uncached_flush() which provides an SFENCE
if the CPU sets X86_FEATURE_NT_GOOD.


As long as you keep "NT" or "MOVNTI" in the names and keep functions
in arch/x86, I think it's reasonable to expect that callers understand
that MOVNTI has bizarre memory ordering rules.  But once you give
something a generic name like "clear_page_uncached" and stick it in
generic code, I think the semantics should be more obvious.


Why does it have to be a separate call? Why isn't it behind the
clear_page() alternative machinery so that the proper function is
selected at boot? IOW, why does a user of clear_page functionality need
to know at all about an "uncached" variant?


I assume it’s for a little optimization of clearing more than one page
per SFENCE.

In any event, based on the benchmark data upthread, we only want to do
NT clears when they’re rather large, so this shouldn’t be just an
alternative. I assume this is because a page or two will fit in cache
and, for most uses that allocate zeroed pages, we prefer cache-hot
pages. When clearing 1G, on the other hand, cache-hot is impossible
and we prefer the improved bandwidth and less cache trashing of NT
clears.


Also, if we did extend clear_page() to take the page size as a parameter,
we still might not have enough information (e.g. a 4K or a 2MB page that
clear_page() sees could be part of a GUP of a much larger extent) to
decide whether to go uncached or not.


Perhaps SFENCE is so fast that this is a silly optimization, though,
and we don’t lose anything measurable by SFENCEing once per page.

Alas, no. I tried that and dropped about 15% performance on Rome.

Thanks
Ankur


Re: [PATCH 5/8] x86/clear_page: add clear_page_uncached()

2020-10-14 Thread Ankur Arora

On 2020-10-14 2:12 p.m., Borislav Petkov wrote:

On Wed, Oct 14, 2020 at 02:07:30PM -0700, Andy Lutomirski wrote:

I assume it’s for a little optimization of clearing more than one
page per SFENCE.

In any event, based on the benchmark data upthread, we only want to do
NT clears when they’re rather large, so this shouldn’t be just an
alternative. I assume this is because a page or two will fit in cache
and, for most uses that allocate zeroed pages, we prefer cache-hot
pages. When clearing 1G, on the other hand, cache-hot is impossible
and we prefer the improved bandwidth and less cache trashing of NT
clears.


Yeah, use case makes sense but people won't know what to use. At the
time I was experimenting with this crap, I remember Linus saying that
that selection should be made based on the size of the area cleared, so
users should not have to know the difference.

I don't disagree, but I think the cached/uncached selection should be made
where we have enough context available to make that choice.

This could, for example, be done in mm_populate() or gup: if, say, the
extent is larger than the LLC size, take the uncached path.
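
i.e. something along these lines (sketch only; the helper name and the
LLC-size threshold are illustrative):

	/*
	 * Sketch: the decision is made where the full extent is known
	 * (mm_populate()/gup), not per 4K page. x86-specific as written.
	 */
	static bool want_uncached_clear(unsigned long nr_pages)
	{
		/* boot_cpu_data.x86_cache_size is in KB */
		unsigned long llc_bytes = boot_cpu_data.x86_cache_size * 1024UL;

		return boot_cpu_has(X86_FEATURE_NT_GOOD) &&
		       (nr_pages << PAGE_SHIFT) > llc_bytes;
	}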



Which perhaps is the only sane use case I see for this.


Perhaps SFENCE is so fast that this is a silly optimization, though,
and we don’t lose anything measurable by SFENCEing once per page.


Yes, I'd like to see real use cases showing improvement from this, not
just microbenchmarks.

Sure will add.

Thanks
Ankur





Re: [PATCH 5/8] x86/clear_page: add clear_page_uncached()

2020-10-15 Thread Ankur Arora

On 2020-10-15 3:35 a.m., Borislav Petkov wrote:

On Wed, Oct 14, 2020 at 08:37:44PM -0700, Ankur Arora wrote:

I don't disagree but I think the selection of cached/uncached route should
be made where we have enough context available to be able to choose to do
this.

This could be for example, done in mm_populate() or gup where if say the
extent is larger than LLC-size, it takes the uncached path.


Are there examples where we don't know the size?


The case I was thinking of was that clear_huge_page() or faultin_page()
would only know the size at page granularity, while the higher-level
function would know the whole extent and could optimize differently based
on that.

Thanks
Ankur





Re: [PATCH 5/8] x86/clear_page: add clear_page_uncached()

2020-10-15 Thread Ankur Arora

On 2020-10-15 3:40 a.m., Borislav Petkov wrote:

On Wed, Oct 14, 2020 at 08:21:57PM -0700, Ankur Arora wrote:

Also, if we did extend clear_page() to take the page-size as parameter
we still might not have enough information (ex. a 4K or a 2MB page that
clear_page() sees could be part of a GUP of a much larger extent) to
decide whether to go uncached or not.


clear_page* assumes 4K. All of the lowlevel asm variants do. So adding
the size there won't bring you a whole lot.

So you'd need to devise this whole thing differently. Perhaps have a
clear_pages() helper which decides based on size what to do: uncached
clearing or the clear_page() as is now in a loop.


I think that'll work well for GB pages, where the clear_pages() helper
has enough information to make a decision.

But, unless I'm missing something, I'm not sure how that would work for
say, a 1GB mm_populate() using 4K pages. The clear_page() (or clear_pages())
in that case would only see the 4K size.

But let me think about this more (and look at the callsites as you suggest.)



Looking at the callsites would give you a better idea I'd say.

Thanks, yeah that's a good idea. Let me go do that.

Ankur



Thx.



Re: [PATCH] sched: introduce configurable delay before entering idle

2019-05-15 Thread Ankur Arora

On 5/14/19 6:50 AM, Marcelo Tosatti wrote:

On Mon, May 13, 2019 at 05:20:37PM +0800, Wanpeng Li wrote:

On Wed, 8 May 2019 at 02:57, Marcelo Tosatti  wrote:



Certain workloads perform poorly on KVM compared to baremetal
due to baremetal's ability to perform mwait on NEED_RESCHED
bit of task flags (therefore skipping the IPI).


KVM supports expose mwait to the guest, if it can solve this?

Regards,
Wanpeng Li


Unfortunately mwait in guest is not feasible (incompatible with multiple
guests). Checking whether a paravirt solution is possible.


Hi Marcelo,

I was also looking at making MWAIT available to guests in a safe manner:
whether through emulation or a PV-MWAIT. My (unsolicited) thoughts
follow.

We basically want to handle this sequence:

monitor(monitor_address);
if (*monitor_address == base_value)
 mwaitx(max_delay);

Emulation seems problematic because, AFAICS this would happen:

guest                                   hypervisor
=====                                   ==========

monitor(monitor_address);
        vmexit  ===>                    monitor(monitor_address)
if (*monitor_address == base_value)
        mwait();
        vmexit  ===>                    mwait()

There's a context switch back to the guest in this sequence which seems
problematic. Both the AMD and Intel specs list system calls and
far calls as events which would lead to the MWAIT being woken up: 
"Voluntary transitions due to fast system call and far calls (occurring 
prior to issuing MWAIT but after setting the monitor)".



We could do this instead:

guest                                   hypervisor
=====                                   ==========

monitor(monitor_address);
        vmexit  ===>                    cache monitor_address
if (*monitor_address == base_value)
        mwait();
        vmexit  ===>                    monitor(monitor_address)
                                        mwait()

But, this would miss the "if (*monitor_address == base_value)" check in
the host which is problematic if *monitor_address changed simultaneously
when monitor was executed.
(Similar problem if we cache both the monitor_address and
*monitor_address.)


So, AFAICS, the only thing that would work is the guest offloading the
whole PV-MWAIT operation.

AFAICS, that could be a paravirt operation which needs three parameters:
(monitor_address, base_value, max_delay.)

This would allow the guest to offload this whole operation to
the host:
monitor(monitor_address);
if (*monitor_address == base_value)
 mwaitx(max_delay);

I'm guessing you are thinking on similar lines?


High level semantics: If the CPU doesn't have any runnable threads, then
we actually do this version of PV-MWAIT -- arming a timer if necessary
so we only sleep until the time-slice expires or the MWAIT max_delay does.

If the CPU has any runnable threads then this could still finish its 
time-quanta or we could just do a schedule-out.



So the semantics guaranteed to the host would be that PV-MWAIT returns 
after >= max_delay OR with the *monitor_address changed.




Ankur


Re: [PATCH] sched: introduce configurable delay before entering idle

2019-05-16 Thread Ankur Arora

On 2019-05-15 6:07 p.m., Wanpeng Li wrote:

On Thu, 16 May 2019 at 02:42, Ankur Arora  wrote:


On 5/14/19 6:50 AM, Marcelo Tosatti wrote:

On Mon, May 13, 2019 at 05:20:37PM +0800, Wanpeng Li wrote:

On Wed, 8 May 2019 at 02:57, Marcelo Tosatti  wrote:



Certain workloads perform poorly on KVM compared to baremetal
due to baremetal's ability to perform mwait on NEED_RESCHED
bit of task flags (therefore skipping the IPI).


KVM supports expose mwait to the guest, if it can solve this?

Regards,
Wanpeng Li


Unfortunately mwait in guest is not feasible (incompatible with multiple
guests). Checking whether a paravirt solution is possible.


Hi Marcelo,

I was also looking at making MWAIT available to guests in a safe manner:
whether through emulation or a PV-MWAIT. My (unsolicited) thoughts


MWAIT emulation is not simple; here is some research:
https://www.contrib.andrew.cmu.edu/~somlo/OSXKVM/mwait.html

Agreed. I had outlined my attempt to do that below and came
to the conclusion that we would need a PV-MWAIT.

Ankur



Regards,
Wanpeng Li


follow.

We basically want to handle this sequence:

  monitor(monitor_address);
  if (*monitor_address == base_value)
   mwaitx(max_delay);

Emulation seems problematic because, AFAICS this would happen:

  guest   hypervisor
  =   

  monitor(monitor_address);
  vmexit  ===>monitor(monitor_address)
  if (*monitor_address == base_value)
   mwait();
vmexit>   mwait()

There's a context switch back to the guest in this sequence which seems
problematic. Both the AMD and Intel specs list system calls and
far calls as events which would lead to the MWAIT being woken up:
"Voluntary transitions due to fast system call and far calls (occurring
prior to issuing MWAIT but after setting the monitor)".


We could do this instead:

  guest   hypervisor
  =   

  monitor(monitor_address);
  vmexit  ===>cache monitor_address
  if (*monitor_address == base_value)
   mwait();
vmexit>  monitor(monitor_address)
 mwait()

But, this would miss the "if (*monitor_address == base_value)" check in
the host which is problematic if *monitor_address changed simultaneously
when monitor was executed.
(Similar problem if we cache both the monitor_address and
*monitor_address.)


So, AFAICS, the only thing that would work is the guest offloading the
whole PV-MWAIT operation.

AFAICS, that could be a paravirt operation which needs three parameters:
(monitor_address, base_value, max_delay.)

This would allow the guest to offload this whole operation to
the host:
  monitor(monitor_address);
  if (*monitor_address == base_value)
   mwaitx(max_delay);

I'm guessing you are thinking on similar lines?


High level semantics: If the CPU doesn't have any runnable threads, then
we actually do this version of PV-MWAIT -- arming a timer if necessary
so we only sleep until the time-slice expires or the MWAIT max_delay does.

If the CPU has any runnable threads then this could still finish its
time-quanta or we could just do a schedule-out.


So the semantics guaranteed to the host would be that PV-MWAIT returns
after >= max_delay OR with the *monitor_address changed.



Ankur




Re: [PATCH] sched: introduce configurable delay before entering idle

2019-05-16 Thread Ankur Arora

On 2019-05-15 1:43 p.m., Marcelo Tosatti wrote:

On Wed, May 15, 2019 at 11:42:56AM -0700, Ankur Arora wrote:

On 5/14/19 6:50 AM, Marcelo Tosatti wrote:

On Mon, May 13, 2019 at 05:20:37PM +0800, Wanpeng Li wrote:

On Wed, 8 May 2019 at 02:57, Marcelo Tosatti  wrote:



Certain workloads perform poorly on KVM compared to baremetal
due to baremetal's ability to perform mwait on NEED_RESCHED
bit of task flags (therefore skipping the IPI).


KVM supports expose mwait to the guest, if it can solve this?

Regards,
Wanpeng Li


Unfortunately mwait in guest is not feasible (incompatible with multiple
guests). Checking whether a paravirt solution is possible.


Hi Ankur,



Hi Marcelo,

I was also looking at making MWAIT available to guests in a safe manner:
whether through emulation or a PV-MWAIT. My (unsolicited) thoughts


What use-case are you interested in?

Currently Oracle does not make MWAIT available to guests in cloud
environments. My interest is to 1) allow guests to avoid the IPI and
2) allow the waiting to be done in deeper C-states so that other cores
can get the benefit of turbo-boost etc.






We basically want to handle this sequence:

 monitor(monitor_address);
 if (*monitor_address == base_value)
  mwaitx(max_delay);

Emulation seems problematic because, AFAICS this would happen:

 guest   hypervisor
 =   

 monitor(monitor_address);
 vmexit  ===>monitor(monitor_address)
 if (*monitor_address == base_value)
  mwait();
   vmexit>   mwait()

There's a context switch back to the guest in this sequence which seems
problematic. Both the AMD and Intel specs list system calls and
far calls as events which would lead to the MWAIT being woken up:
"Voluntary transitions due to fast system call and far calls
(occurring prior to issuing MWAIT but after setting the monitor)".


We could do this instead:

 guest   hypervisor
 =   

 monitor(monitor_address);
 vmexit  ===>cache monitor_address
 if (*monitor_address == base_value)
  mwait();
   vmexit>  monitor(monitor_address)
mwait()

But, this would miss the "if (*monitor_address == base_value)" check in
the host which is problematic if *monitor_address changed simultaneously
when monitor was executed.
(Similar problem if we cache both the monitor_address and
*monitor_address.)


So, AFAICS, the only thing that would work is the guest offloading the
whole PV-MWAIT operation.

AFAICS, that could be a paravirt operation which needs three parameters:
(monitor_address, base_value, max_delay.)

This would allow the guest to offload this whole operation to
the host:
 monitor(monitor_address);
 if (*monitor_address == base_value)
  mwaitx(max_delay);

I'm guessing you are thinking on similar lines?


Sort of: only trying to avoid the IPI to wake a remote vCPU.

Problem is that MWAIT works only on a contiguous range
of bits in memory (512 bits max on current CPUs).

So if you execute mwait on the host on behalf of the guest,
the region of memory monitored must include both host
and guest bits.

Yeah, an MWAITv would have come in pretty handy here ;).

My idea of PV-MWAIT didn't include waiting on behalf of the host. I
was thinking of waiting in the host but exclusively on behalf of the
guest, until the guest is woken up or until its time quantum expires.

Waiting on behalf of both the guest and the host would clearly be better.

If we can do mwait for both the guest and host (say they share a 512
bit region), then the host will need some protection from the guest.
Maybe the waking guest-thread could just do a hypercall to wake up
the remote vCPU? Or maybe it could poke the monitored region,
but that is handled as a special page-fault?
The hypercall-to-wake would also allow us to move guest-threads across
CPUs. That said, I'm not sure how expensive either of these would be.

Assuming host/guest can share a monitored region safely, the host's
idle could monitor some region other than its &thread_info->flags.
Maybe we could setup a mwait notifier with a percpu waiting area which
could be registered by idle, guests etc.

Though on second thoughts, if the remote thread will do a
hypercall/page-fault then the handling could just as easily be: mark
the guest's remote thread runnable and set the resched bit.






High level semantics: If the CPU doesn't have any runnable threads, then
we actually do this version of PV-MWAIT -- arming a timer if necessary
so we only sleep until the time-slice expires or the MWAIT max_delay does.


That would kill the sched_wake_idle_without_ipi optimization for the
host.

Ye

Re: [Xen-devel] [RFC PATCH 04/16] x86/xen: hypercall support for xenhost_t

2019-06-14 Thread Ankur Arora

On 2019-06-12 2:15 p.m., Andrew Cooper wrote:

On 09/05/2019 18:25, Ankur Arora wrote:

Allow for different hypercall implementations for different xenhost types.
Nested xenhost, which has two underlying xenhosts, can use both
simultaneously.

The hypercall macros (HYPERVISOR_*) implicitly use the default xenhost.
A new macro (hypervisor_*) takes xenhost_t * as a parameter and does the
right thing.

TODO:
   - Multicalls for now assume the default xenhost
   - xen_hypercall_* symbols are only generated for the default xenhost.

Signed-off-by: Ankur Arora 


Again, what is the hypervisor nesting and/or guest layout here?

Two hypervisors, L0 and L1, and the guest is a child of the L1
hypervisor but could have PV devices attached to both L0 and L1
hypervisors.



I can't think of any case where a single piece of software can
legitimately have two hypercall pages, because if it has one working
one, it is by definition a guest, and therefore not privileged enough to
use the outer one.

Depending on which hypercall page is used, the hypercall would
(eventually) land in the corresponding hypervisor.

Juergen elsewhere pointed out that proxying hypercalls is a better approach,
so I'm not really considering this any more. But, given this layout, and
assuming that the hypercall pages could be encoded differently, would it
still not work?

Ankur



~Andrew






Re: [RFC PATCH 09/16] xen/evtchn: support evtchn in xenhost_t

2019-06-16 Thread Ankur Arora

On 2019-06-14 5:04 a.m., Juergen Gross wrote:

On 09.05.19 19:25, Ankur Arora wrote:

Largely mechanical patch that adds a new param, xenhost_t * to the
evtchn interfaces. The evtchn port instead of being domain unique, is
now scoped to xenhost_t.

As part of upcall handling we now look at all the xenhosts and, for
evtchn_2l, the xenhost's shared_info and vcpu_info. Other than this
event handling is largely unchanged.

Note that the IPI, timer, VIRQ, FUNCTION, PMU etc vectors remain
attached to xh_default. Only interdomain evtchns are allowable as
xh_remote.


I'd do only the interface changes for now (including evtchn FIFO).

Looking at this patch again, it seems to me that it would be best to
limit the interface change (to take the xenhost_t * parameter) only to
bind_interdomain_*. That also happily limits the change to the drivers/
subtree.



The main difference will be how to call the hypervisor for sending an
event (either direct or via a passthrough-hypercall).

Yeah, though, this would depend on how the evtchns are mapped (if it's
the L1-Xen which is responsible for mapping the evtchn on behalf of the 
L0-Xen, then notify_remote_via_evtchn() could just stay the same.)

Still, I'll add a send interface (perhaps just an inline function) to
the xenhost interface for this.
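
Roughly along these lines (sketch; the op and field names are illustrative,
not the posted interface):

	/* Sketch only: per-xenhost notify hook. */
	struct xenhost_ops {
		/* ... */
		void (*notify_evtchn)(xenhost_t *xh, evtchn_port_t port);
	};

	static inline void xenhost_notify_remote_via_evtchn(xenhost_t *xh,
							    evtchn_port_t port)
	{
		xh->ops->notify_evtchn(xh, port);
	}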

Ankur




Juergen




Re: [RFC PATCH 07/16] x86/xen: make vcpu_info part of xenhost_t

2019-06-16 Thread Ankur Arora

On 2019-06-14 4:53 a.m., Juergen Gross wrote:

On 09.05.19 19:25, Ankur Arora wrote:

Abstract out xen_vcpu_id probing via (*probe_vcpu_id)(). Once that is
available, the vcpu_info registration happens via the VCPUOP hypercall.

Note that for the nested case, there are two vcpu_ids, and two vcpu_info
areas, one each for the default xenhost and the remote xenhost.
The vcpu_info is used via pv_irq_ops, and evtchn signaling.

The other VCPUOP hypercalls are used for management (and scheduling)
which is expected to be done purely in the default hypervisor.
However, scheduling of L1-guest does imply L0-Xen-vcpu_info switching,
which might mean that the remote hypervisor needs some visibility
into related events/hypercalls in the default hypervisor.


Another candidate for dropping due to layering violation, I guess.

Yeah, a more narrowly tailored interface, where perhaps the L1-Xen
maps events for L0-Xen, makes sense.
Also, just realized that given that L0-Xen has no control over
scheduling of L1-Xen's guests (some of which it might want to
send events to), it makes sense for L1-Xen to have some state
for guest evtchns which pertain to L0-Xen.


Ankur




Juergen




Re: [Xen-devel] [PATCH 0/5] xen/pvh*: Support > 32 VCPUs at restore

2017-06-08 Thread Ankur Arora

On 2017-06-08 03:53 PM, Konrad Rzeszutek Wilk wrote:

On Thu, Jun 08, 2017 at 10:28:15AM +0200, Juergen Gross wrote:

On 03/06/17 02:05, Ankur Arora wrote:

This patch series fixes a bunch of issues in the xen_vcpu setup
logic.

Simplify xen_vcpu related code: code refactoring in advance of the
rest of the patch series.

Support > 32 VCPUs at restore: unify all vcpu restore logic in
xen_vcpu_restore() and support > 32 VCPUs for PVH*.

Remove vcpu info placement from restore (!SMP): some pv_ops are
marked RO after init so lets not redo xen_setup_vcpu_info_placement
at restore.

Handle xen_vcpu_setup() failure in hotplug: handle vcpu_info
registration failures by propagating them from the cpuhp-prepare
callback back up to the cpuhp logic.

Handle xen_vcpu_setup() failure at boot: pull CPUs (> MAX_VIRT_CPUS)
down if we fall back to xen_have_vcpu_info_placement = 0.

Tested with various combinations of PV/PVHv2/PVHVM save/restore
and cpu-hotadd-hotremove. Also tested by simulating failure in
VCPUOP_register_vcpu_info.

Please review.


Just a question regarding the sequence of tags (Reviewed-by: and
Signed-off-by:) in the patches:

It seems a little bit odd to have the Reviewed-by: tag before the
S-o-b: tag. This suggests the review was done before you wrote the
patches, which is hard to believe. :-)

Heh :). As Konrad surmises, I was unsure of the order and manually
ordered them to comport with Linux style. (Now that I see arch/x86/xen/,
I see that Xen puts them in time-order.)

Happy to reorder in case of V2.

Ankur



That is how Linux orders the tags; just do 'git log' and you
will see that pattern.

So please reorder the tags in future patches to be in their logical
sequence.


While Xen orders it in the other order (SoB first, then Reviewed-by).



I can fix this up in this series in case there is no need for V2.


Juergen



Ankur Arora (5):
   xen/vcpu: Simplify xen_vcpu related code
   xen/pvh*: Support > 32 VCPUs at domain restore
   xen/pv: Fix OOPS on restore for a PV, !SMP domain
   xen/vcpu: Handle xen_vcpu_setup() failure in hotplug
   xen/vcpu: Handle xen_vcpu_setup() failure at boot

  arch/x86/xen/enlighten.c | 154 +++
  arch/x86/xen/enlighten_hvm.c |  33 --
  arch/x86/xen/enlighten_pv.c  |  87 +++-
  arch/x86/xen/smp.c   |  31 +
  arch/x86/xen/smp.h   |   2 +
  arch/x86/xen/smp_hvm.c   |  14 +++-
  arch/x86/xen/smp_pv.c|   6 +-
  arch/x86/xen/suspend_hvm.c   |  11 +---
  arch/x86/xen/xen-ops.h   |   3 +-
  include/xen/xen-ops.h|   2 +
  10 files changed, 218 insertions(+), 125 deletions(-)






[PATCH 1/5] xen/vcpu: Simplify xen_vcpu related code

2017-06-02 Thread Ankur Arora
Largely mechanical changes to aid unification of xen_vcpu_restore()
logic for PV, PVH and PVHVM.

xen_vcpu_setup(): the only change in logic is that clamp_max_cpus()
is now handled inside the "if (!xen_have_vcpu_info_placement)" block.

xen_vcpu_restore(): code movement from enlighten_pv.c to enlighten.c.

xen_vcpu_info_reset(): pulls together all the code where xen_vcpu
is set to default.

Reviewed-by: Boris Ostrovsky 
Signed-off-by: Ankur Arora 
---
 arch/x86/xen/enlighten.c | 101 +++
 arch/x86/xen/enlighten_hvm.c |   6 +--
 arch/x86/xen/enlighten_pv.c  |  47 +---
 arch/x86/xen/smp_hvm.c   |   3 +-
 arch/x86/xen/xen-ops.h   |   1 +
 5 files changed, 89 insertions(+), 69 deletions(-)

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index a5ffcbb20cc0..96b745e3f56c 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -106,6 +106,35 @@ int xen_cpuhp_setup(int (*cpu_up_prepare_cb)(unsigned int),
return rc >= 0 ? 0 : rc;
 }
 
+/*
+ * On restore, set the vcpu placement up again.
+ * If it fails, then we're in a bad state, since
+ * we can't back out from using it...
+ */
+void xen_vcpu_restore(void)
+{
+   int cpu;
+
+   for_each_possible_cpu(cpu) {
+   bool other_cpu = (cpu != smp_processor_id());
+   bool is_up = HYPERVISOR_vcpu_op(VCPUOP_is_up, xen_vcpu_nr(cpu),
+   NULL);
+
+   if (other_cpu && is_up &&
+   HYPERVISOR_vcpu_op(VCPUOP_down, xen_vcpu_nr(cpu), NULL))
+   BUG();
+
+   xen_setup_runstate_info(cpu);
+
+   if (xen_have_vcpu_info_placement)
+   xen_vcpu_setup(cpu);
+
+   if (other_cpu && is_up &&
+   HYPERVISOR_vcpu_op(VCPUOP_up, xen_vcpu_nr(cpu), NULL))
+   BUG();
+   }
+}
+
 static void clamp_max_cpus(void)
 {
 #ifdef CONFIG_SMP
@@ -114,6 +143,17 @@ static void clamp_max_cpus(void)
 #endif
 }
 
+void xen_vcpu_info_reset(int cpu)
+{
+   if (xen_vcpu_nr(cpu) < MAX_VIRT_CPUS) {
+   per_cpu(xen_vcpu, cpu) =
+   &HYPERVISOR_shared_info->vcpu_info[xen_vcpu_nr(cpu)];
+   } else {
+   /* Set to NULL so that if somebody accesses it we get an OOPS */
+   per_cpu(xen_vcpu, cpu) = NULL;
+   }
+}
+
 void xen_vcpu_setup(int cpu)
 {
struct vcpu_register_vcpu_info info;
@@ -137,40 +177,45 @@ void xen_vcpu_setup(int cpu)
if (per_cpu(xen_vcpu, cpu) == &per_cpu(xen_vcpu_info, cpu))
return;
}
-   if (xen_vcpu_nr(cpu) < MAX_VIRT_CPUS)
-   per_cpu(xen_vcpu, cpu) =
-   &HYPERVISOR_shared_info->vcpu_info[xen_vcpu_nr(cpu)];
+
+   xen_vcpu_info_reset(cpu);
+
+   if (xen_have_vcpu_info_placement) {
+   vcpup = &per_cpu(xen_vcpu_info, cpu);
+   info.mfn = arbitrary_virt_to_mfn(vcpup);
+   info.offset = offset_in_page(vcpup);
+
+   /*
+* Check to see if the hypervisor will put the vcpu_info
+* structure where we want it, which allows direct access via
+* a percpu-variable.
+* N.B. This hypercall can _only_ be called once per CPU.
+* Subsequent calls will error out with -EINVAL. This is due to
+* the fact that hypervisor has no unregister variant and this
+* hypercall does not allow to over-write info.mfn and
+* info.offset.
+*/
+   err = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info,
+xen_vcpu_nr(cpu), &info);
+
+   if (err) {
+			pr_warn_once("register_vcpu_info failed: cpu=%d err=%d\n",
+				     cpu, err);
+   xen_have_vcpu_info_placement = 0;
+   } else {
+   /*
+* This cpu is using the registered vcpu info, even if
+* later ones fail to.
+*/
+   per_cpu(xen_vcpu, cpu) = vcpup;
+   }
+   }
 
if (!xen_have_vcpu_info_placement) {
if (cpu >= MAX_VIRT_CPUS)
clamp_max_cpus();
return;
}
-
-   vcpup = &per_cpu(xen_vcpu_info, cpu);
-   info.mfn = arbitrary_virt_to_mfn(vcpup);
-   info.offset = offset_in_page(vcpup);
-
-   /* Check to see if the hypervisor will put the vcpu_info
-  structure where we want it, which allows direct access via
-  a percpu-variable.
-  N.B. This hypercall can _only_ be called once per CPU. Subsequent
-  calls will error out with -EINV

[PATCH 2/5] xen/pvh*: Support > 32 VCPUs at domain restore

2017-06-02 Thread Ankur Arora
When Xen restores a PVHVM or PVH guest, its shared_info only holds
up to 32 CPUs. The hypercall VCPUOP_register_vcpu_info allows
us to setup per-page areas for VCPUs. This means we can boot
PVH* guests with more than 32 VCPUs. During restore the per-cpu
structure is allocated freshly by the hypervisor (vcpu_info_mfn is
set to INVALID_MFN) so that the newly restored guest can make a
VCPUOP_register_vcpu_info hypercall.

However, we end up triggering this condition in Xen:
/* Run this command on yourself or on other offline VCPUS. */
 if ( (v != current) && !test_bit(_VPF_down, &v->pause_flags) )

which means we are unable to set up the per-cpu VCPU structures
for running VCPUS. The Linux PV code paths make this work by
iterating over cpu_possible in xen_vcpu_restore() with:

 1) is target CPU up (VCPUOP_is_up hypercall?)
 2) if yes, then VCPUOP_down to pause it
 3) VCPUOP_register_vcpu_info
 4) if it was down, then VCPUOP_up to bring it back up

With Xen commit 192df6f9122d ("xen/x86: allow HVM guests to use
hypercalls to bring up vCPUs") this is available for non-PV guests.
As such first check if VCPUOP_is_up is actually possible before
trying this dance.

As most of this dance code is done already in xen_vcpu_restore()
let's make it callable on PV, PVH and PVHVM.

Based-on-patch-by: Konrad Wilk 
Reviewed-by: Boris Ostrovsky 
Signed-off-by: Ankur Arora 
---
 arch/x86/xen/enlighten.c | 45 +++-
 arch/x86/xen/enlighten_hvm.c | 20 +++-
 arch/x86/xen/smp_hvm.c   | 10 ++
 arch/x86/xen/suspend_hvm.c   | 11 +++
 include/xen/xen-ops.h|  2 ++
 5 files changed, 54 insertions(+), 34 deletions(-)

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 96b745e3f56c..276cc21619ec 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -106,6 +106,21 @@ int xen_cpuhp_setup(int (*cpu_up_prepare_cb)(unsigned int),
return rc >= 0 ? 0 : rc;
 }
 
+static void xen_vcpu_setup_restore(int cpu)
+{
+   /* Any per_cpu(xen_vcpu) is stale, so reset it */
+   xen_vcpu_info_reset(cpu);
+
+   /*
+* For PVH and PVHVM, setup online VCPUs only. The rest will
+* be handled by hotplug.
+*/
+   if (xen_pv_domain() ||
+   (xen_hvm_domain() && cpu_online(cpu))) {
+   xen_vcpu_setup(cpu);
+   }
+}
+
 /*
  * On restore, set the vcpu placement up again.
  * If it fails, then we're in a bad state, since
@@ -117,17 +132,23 @@ void xen_vcpu_restore(void)
 
for_each_possible_cpu(cpu) {
bool other_cpu = (cpu != smp_processor_id());
-   bool is_up = HYPERVISOR_vcpu_op(VCPUOP_is_up, xen_vcpu_nr(cpu),
-   NULL);
+   bool is_up;
+
+   if (xen_vcpu_nr(cpu) == XEN_VCPU_ID_INVALID)
+   continue;
+
+   /* Only Xen 4.5 and higher support this. */
+   is_up = HYPERVISOR_vcpu_op(VCPUOP_is_up,
+  xen_vcpu_nr(cpu), NULL) > 0;
 
if (other_cpu && is_up &&
HYPERVISOR_vcpu_op(VCPUOP_down, xen_vcpu_nr(cpu), NULL))
BUG();
 
-   xen_setup_runstate_info(cpu);
+   if (xen_pv_domain() || xen_feature(XENFEAT_hvm_safe_pvclock))
+   xen_setup_runstate_info(cpu);
 
-   if (xen_have_vcpu_info_placement)
-   xen_vcpu_setup(cpu);
+   xen_vcpu_setup_restore(cpu);
 
if (other_cpu && is_up &&
HYPERVISOR_vcpu_op(VCPUOP_up, xen_vcpu_nr(cpu), NULL))
@@ -163,11 +184,11 @@ void xen_vcpu_setup(int cpu)
BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
 
/*
-* This path is called twice on PVHVM - first during bootup via
-* smp_init -> xen_hvm_cpu_notify, and then if the VCPU is being
-* hotplugged: cpu_up -> xen_hvm_cpu_notify.
-* As we can only do the VCPUOP_register_vcpu_info once lets
-* not over-write its result.
+* This path is called on PVHVM at bootup (xen_hvm_smp_prepare_boot_cpu)
+* and at restore (xen_vcpu_restore). Also called for hotplugged
+* VCPUs (cpu_init -> xen_hvm_cpu_prepare_hvm).
+* However, the hypercall can only be done once (see below) so if a VCPU
+* is offlined and comes back online then let's not redo the hypercall.
 *
 * For PV it is called during restore (xen_vcpu_restore) and bootup
 * (xen_setup_vcpu_info_placement). The hotplug mechanism does not
@@ -178,8 +199,6 @@ void xen_vcpu_setup(int cpu)
return;
}
 
-   xen_vcpu_info_reset(cpu);
-
if (xen_have_vcpu_info_placement) {
vcpup = &per_cpu(xen_vcpu_info, cpu);
 

[PATCH 4/5] xen/vcpu: Handle xen_vcpu_setup() failure in hotplug

2017-06-02 Thread Ankur Arora
The hypercall VCPUOP_register_vcpu_info can fail. This failure is
handled by making per_cpu(xen_vcpu, cpu) point to its shared_info
slot and those without one (cpu >= MAX_VIRT_CPUS) be NULL.

For PVH/PVHVM, this is not enough, because we also need to pull
these VCPUs out of circulation.

Fix for PVH/PVHVM: on registration failure in the cpuhp prepare
callback (xen_cpu_up_prepare_hvm()), return an error to the cpuhp
state-machine so it can fail the CPU init.

Fix for PV: the registration happens before smp_init(), so, in the
failure case we clamp setup_max_cpus and limit the number of VCPUs
that smp_init() will bring-up to MAX_VIRT_CPUS.
This is functionally correct but it makes the code a bit simpler
if we get rid of this explicit clamping: for VCPUs that don't have
valid xen_vcpu, fail the CPU init in the cpuhp prepare callback
(xen_cpu_up_prepare_pv()).

Reviewed-by: Boris Ostrovsky 
Signed-off-by: Ankur Arora 
---
 arch/x86/xen/enlighten.c | 46 +---
 arch/x86/xen/enlighten_hvm.c |  9 +
 arch/x86/xen/enlighten_pv.c  | 14 +-
 arch/x86/xen/xen-ops.h   |  2 +-
 4 files changed, 45 insertions(+), 26 deletions(-)

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 276cc21619ec..0e7ef69e8531 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -106,8 +106,10 @@ int xen_cpuhp_setup(int (*cpu_up_prepare_cb)(unsigned int),
return rc >= 0 ? 0 : rc;
 }
 
-static void xen_vcpu_setup_restore(int cpu)
+static int xen_vcpu_setup_restore(int cpu)
 {
+   int rc = 0;
+
/* Any per_cpu(xen_vcpu) is stale, so reset it */
xen_vcpu_info_reset(cpu);
 
@@ -117,8 +119,10 @@ static void xen_vcpu_setup_restore(int cpu)
 */
if (xen_pv_domain() ||
(xen_hvm_domain() && cpu_online(cpu))) {
-   xen_vcpu_setup(cpu);
+   rc = xen_vcpu_setup(cpu);
}
+
+   return rc;
 }
 
 /*
@@ -128,7 +132,7 @@ static void xen_vcpu_setup_restore(int cpu)
  */
 void xen_vcpu_restore(void)
 {
-   int cpu;
+   int cpu, rc;
 
for_each_possible_cpu(cpu) {
bool other_cpu = (cpu != smp_processor_id());
@@ -148,22 +152,25 @@ void xen_vcpu_restore(void)
if (xen_pv_domain() || xen_feature(XENFEAT_hvm_safe_pvclock))
xen_setup_runstate_info(cpu);
 
-   xen_vcpu_setup_restore(cpu);
-
-   if (other_cpu && is_up &&
+   rc = xen_vcpu_setup_restore(cpu);
+   if (rc)
+   pr_emerg_once("vcpu restore failed for cpu=%d err=%d. "
+   "System will hang.\n", cpu, rc);
+   /*
+* In case xen_vcpu_setup_restore() fails, do not bring up the
+* VCPU. This helps us avoid the resulting OOPS when the VCPU
+* accesses pvclock_vcpu_time via xen_vcpu (which is NULL.)
+* Note that this does not improve the situation much -- now the
+* VM hangs instead of OOPSing -- with the VCPUs that did not
+* fail, spinning in stop_machine(), waiting for the failed
+* VCPUs to come up.
+*/
+   if (other_cpu && is_up && (rc == 0) &&
HYPERVISOR_vcpu_op(VCPUOP_up, xen_vcpu_nr(cpu), NULL))
BUG();
}
 }
 
-static void clamp_max_cpus(void)
-{
-#ifdef CONFIG_SMP
-   if (setup_max_cpus > MAX_VIRT_CPUS)
-   setup_max_cpus = MAX_VIRT_CPUS;
-#endif
-}
-
 void xen_vcpu_info_reset(int cpu)
 {
if (xen_vcpu_nr(cpu) < MAX_VIRT_CPUS) {
@@ -175,7 +182,7 @@ void xen_vcpu_info_reset(int cpu)
}
 }
 
-void xen_vcpu_setup(int cpu)
+int xen_vcpu_setup(int cpu)
 {
struct vcpu_register_vcpu_info info;
int err;
@@ -196,7 +203,7 @@ void xen_vcpu_setup(int cpu)
 */
if (xen_hvm_domain()) {
if (per_cpu(xen_vcpu, cpu) == &per_cpu(xen_vcpu_info, cpu))
-   return;
+   return 0;
}
 
if (xen_have_vcpu_info_placement) {
@@ -230,11 +237,10 @@ void xen_vcpu_setup(int cpu)
}
}
 
-   if (!xen_have_vcpu_info_placement) {
-   if (cpu >= MAX_VIRT_CPUS)
-   clamp_max_cpus();
+   if (!xen_have_vcpu_info_placement)
xen_vcpu_info_reset(cpu);
-   }
+
+   return ((per_cpu(xen_vcpu, cpu) == NULL) ? -ENODEV : 0);
 }
 
 void xen_reboot(int reason)
diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c
index ba1afadb2512..13b5fa1a211c 100644
--- a/arch/x86/xen/enlighten_hvm.c
+++ b/arch/x86/xen/enlighten_hvm.c
@@ -89,7 +89,7 @@ static void xen_hvm_crash_shutdown(struct pt_regs *regs)
 
 static int xen_cpu_up_prepare_hvm(unsigned int cpu)
 {
-   int rc;

[PATCH 5/5] xen/vcpu: Handle xen_vcpu_setup() failure at boot

2017-06-02 Thread Ankur Arora
On PVH, PVHVM, at failure in the VCPUOP_register_vcpu_info hypercall
we limit the number of cpus to MAX_VIRT_CPUS. However, if this
failure had occurred for a cpu beyond MAX_VIRT_CPUS, we continue
to function with > MAX_VIRT_CPUS.

This leads to problems at the next save/restore cycle when there
are > MAX_VIRT_CPUS threads going into stop_machine() but coming
back up there's valid state for only the first MAX_VIRT_CPUS.

This patch pulls the excess CPUs down via cpu_down().

Reviewed-by: Boris Ostrovsky 
Signed-off-by: Ankur Arora 
---
 arch/x86/xen/smp.c | 31 +++
 arch/x86/xen/smp.h |  2 ++
 arch/x86/xen/smp_hvm.c |  1 +
 arch/x86/xen/smp_pv.c  |  6 +-
 4 files changed, 35 insertions(+), 5 deletions(-)

diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 82ac611f2fc1..e7f02eb73727 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -1,4 +1,5 @@
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -114,6 +115,36 @@ int xen_smp_intr_init(unsigned int cpu)
return rc;
 }
 
+void __init xen_smp_cpus_done(unsigned int max_cpus)
+{
+   int cpu, rc, count = 0;
+
+   if (xen_hvm_domain())
+   native_smp_cpus_done(max_cpus);
+
+   if (xen_have_vcpu_info_placement)
+   return;
+
+   for_each_online_cpu(cpu) {
+   if (xen_vcpu_nr(cpu) < MAX_VIRT_CPUS)
+   continue;
+
+   rc = cpu_down(cpu);
+
+   if (rc == 0) {
+   /*
+* Reset vcpu_info so this cpu cannot be onlined again.
+*/
+   xen_vcpu_info_reset(cpu);
+   count++;
+   } else {
+   pr_warn("%s: failed to bring CPU %d down, error %d\n",
+   __func__, cpu, rc);
+   }
+   }
+   WARN(count, "%s: brought %d CPUs offline\n", __func__, count);
+}
+
 void xen_smp_send_reschedule(int cpu)
 {
xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR);
diff --git a/arch/x86/xen/smp.h b/arch/x86/xen/smp.h
index 8ebb6acca64a..87d3c76cba37 100644
--- a/arch/x86/xen/smp.h
+++ b/arch/x86/xen/smp.h
@@ -14,6 +14,8 @@ extern void xen_smp_intr_free(unsigned int cpu);
 int xen_smp_intr_init_pv(unsigned int cpu);
 void xen_smp_intr_free_pv(unsigned int cpu);
 
+void xen_smp_cpus_done(unsigned int max_cpus);
+
 void xen_smp_send_reschedule(int cpu);
 void xen_smp_send_call_function_ipi(const struct cpumask *mask);
 void xen_smp_send_call_function_single_ipi(int cpu);
diff --git a/arch/x86/xen/smp_hvm.c b/arch/x86/xen/smp_hvm.c
index 6c8a805819ff..fd60abedf658 100644
--- a/arch/x86/xen/smp_hvm.c
+++ b/arch/x86/xen/smp_hvm.c
@@ -71,4 +71,5 @@ void __init xen_hvm_smp_init(void)
smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi;
	smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi;
smp_ops.smp_prepare_boot_cpu = xen_hvm_smp_prepare_boot_cpu;
+   smp_ops.smp_cpus_done = xen_smp_cpus_done;
 }
diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c
index aae32535f4ec..1ea598e5f030 100644
--- a/arch/x86/xen/smp_pv.c
+++ b/arch/x86/xen/smp_pv.c
@@ -371,10 +371,6 @@ static int xen_pv_cpu_up(unsigned int cpu, struct task_struct *idle)
return 0;
 }
 
-static void xen_pv_smp_cpus_done(unsigned int max_cpus)
-{
-}
-
 #ifdef CONFIG_HOTPLUG_CPU
 static int xen_pv_cpu_disable(void)
 {
@@ -469,7 +465,7 @@ static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id)
 static const struct smp_ops xen_smp_ops __initconst = {
.smp_prepare_boot_cpu = xen_pv_smp_prepare_boot_cpu,
.smp_prepare_cpus = xen_pv_smp_prepare_cpus,
-   .smp_cpus_done = xen_pv_smp_cpus_done,
+   .smp_cpus_done = xen_smp_cpus_done,
 
.cpu_up = xen_pv_cpu_up,
.cpu_die = xen_pv_cpu_die,
-- 
2.7.4



[PATCH 0/5] xen/pvh*: Support > 32 VCPUs at restore

2017-06-02 Thread Ankur Arora
This patch series fixes a bunch of issues in the xen_vcpu setup
logic.

Simplify xen_vcpu related code: code refactoring in advance of the
rest of the patch series.

Support > 32 VCPUs at restore: unify all vcpu restore logic in
xen_vcpu_restore() and support > 32 VCPUs for PVH*.

Remove vcpu info placement from restore (!SMP): some pv_ops are
marked RO after init so lets not redo xen_setup_vcpu_info_placement
at restore.

Handle xen_vcpu_setup() failure in hotplug: handle vcpu_info
registration failures by propagating them from the cpuhp-prepare
callback back up to the cpuhp logic.

Handle xen_vcpu_setup() failure at boot: pull CPUs (> MAX_VIRT_CPUS)
down if we fall back to xen_have_vcpu_info_placement = 0.

Tested with various combinations of PV/PVHv2/PVHVM save/restore
and cpu-hotadd-hotremove. Also tested by simulating failure in
VCPUOP_register_vcpu_info.

Please review.

Ankur Arora (5):
  xen/vcpu: Simplify xen_vcpu related code
  xen/pvh*: Support > 32 VCPUs at domain restore
  xen/pv: Fix OOPS on restore for a PV, !SMP domain
  xen/vcpu: Handle xen_vcpu_setup() failure in hotplug
  xen/vcpu: Handle xen_vcpu_setup() failure at boot

 arch/x86/xen/enlighten.c | 154 +++
 arch/x86/xen/enlighten_hvm.c |  33 --
 arch/x86/xen/enlighten_pv.c  |  87 +++-
 arch/x86/xen/smp.c   |  31 +
 arch/x86/xen/smp.h   |   2 +
 arch/x86/xen/smp_hvm.c   |  14 +++-
 arch/x86/xen/smp_pv.c|   6 +-
 arch/x86/xen/suspend_hvm.c   |  11 +---
 arch/x86/xen/xen-ops.h   |   3 +-
 include/xen/xen-ops.h|   2 +
 10 files changed, 218 insertions(+), 125 deletions(-)

-- 
2.7.4



[PATCH 3/5] xen/pv: Fix OOPS on restore for a PV, !SMP domain

2017-06-02 Thread Ankur Arora
If CONFIG_SMP is disabled, xen_setup_vcpu_info_placement() is called from
xen_setup_shared_info(). This is fine as far as boot goes, but it means
that we also call it in the restore path. This results in an OOPS
because we assign to pv_mmu_ops.read_cr2 which is __ro_after_init.

Also, though less problematically, this means we call xen_vcpu_setup()
twice at restore -- once from the vcpu info placement call and the
second time from xen_vcpu_restore().

Fix by calling xen_setup_vcpu_info_placement() at boot only.

Reviewed-by: Boris Ostrovsky 
Signed-off-by: Ankur Arora 
---
 arch/x86/xen/enlighten_pv.c | 26 +++---
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index f51e48299692..29cad193db53 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -938,23 +938,27 @@ void xen_setup_shared_info(void)
HYPERVISOR_shared_info =
(struct shared_info *)__va(xen_start_info->shared_info);
 
-#ifndef CONFIG_SMP
-   /* In UP this is as good a place as any to set up shared info */
-   xen_setup_vcpu_info_placement();
-#endif
-
xen_setup_mfn_list_list();
 
-   /*
-* Now that shared info is set up we can start using routines that
-* point to pvclock area.
-*/
-   if (system_state == SYSTEM_BOOTING)
+   if (system_state == SYSTEM_BOOTING) {
+#ifndef CONFIG_SMP
+   /*
+* In UP this is as good a place as any to set up shared info.
+* Limit this to boot only, at restore vcpu setup is done via
+* xen_vcpu_restore().
+*/
+   xen_setup_vcpu_info_placement();
+#endif
+   /*
+* Now that shared info is set up we can start using routines
+* that point to pvclock area.
+*/
xen_init_time_ops();
+   }
 }
 
 /* This is called once we have the cpu_possible_mask */
-void xen_setup_vcpu_info_placement(void)
+void __ref xen_setup_vcpu_info_placement(void)
 {
int cpu;
 
-- 
2.7.4