Re: [RFC PATCH 07/86] Revert "livepatch,sched: Add livepatch task switching to cond_resched()"
Steven Rostedt writes: > On Tue, 7 Nov 2023 13:56:53 -0800 > Ankur Arora wrote: > >> This reverts commit e3ff7c609f39671d1aaff4fb4a8594e14f3e03f8. >> >> Note that removing this commit reintroduces "live patches failing to >> complete within a reasonable amount of time due to CPU-bound kthreads." >> >> Unfortunately this fix depends quite critically on PREEMPT_DYNAMIC and >> existence of cond_resched() so this will need an alternate fix. >> > > Then it would probably be a good idea to Cc the live patching maintainers! Indeed. Could have sworn that I had. But clearly not. Apologies and thanks for adding them. >> Signed-off-by: Ankur Arora >> --- >> include/linux/livepatch.h | 1 - >> include/linux/livepatch_sched.h | 29 - >> include/linux/sched.h | 20 ++ >> kernel/livepatch/core.c | 1 - >> kernel/livepatch/transition.c | 107 +--- >> kernel/sched/core.c | 64 +++ >> 6 files changed, 28 insertions(+), 194 deletions(-) >> delete mode 100644 include/linux/livepatch_sched.h >> >> diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h >> index 9b9b38e89563..293e29960c6e 100644 >> --- a/include/linux/livepatch.h >> +++ b/include/linux/livepatch.h >> @@ -13,7 +13,6 @@ >> #include >> #include >> #include >> -#include >> >> #if IS_ENABLED(CONFIG_LIVEPATCH) >> >> diff --git a/include/linux/livepatch_sched.h >> b/include/linux/livepatch_sched.h >> deleted file mode 100644 >> index 013794fb5da0.. >> --- a/include/linux/livepatch_sched.h >> +++ /dev/null >> @@ -1,29 +0,0 @@ >> -/* SPDX-License-Identifier: GPL-2.0-or-later */ >> -#ifndef _LINUX_LIVEPATCH_SCHED_H_ >> -#define _LINUX_LIVEPATCH_SCHED_H_ >> - >> -#include >> -#include >> - >> -#ifdef CONFIG_LIVEPATCH >> - >> -void __klp_sched_try_switch(void); >> - >> -#if !defined(CONFIG_PREEMPT_DYNAMIC) || >> !defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) >> - >> -DECLARE_STATIC_KEY_FALSE(klp_sched_try_switch_key); >> - >> -static __always_inline void klp_sched_try_switch(void) >> -{ >> -if (static_branch_unlikely(&klp_sched_try_switch_key)) >> -__klp_sched_try_switch(); >> -} >> - >> -#endif /* !CONFIG_PREEMPT_DYNAMIC || !CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */ >> - >> -#else /* !CONFIG_LIVEPATCH */ >> -static inline void klp_sched_try_switch(void) {} >> -static inline void __klp_sched_try_switch(void) {} >> -#endif /* CONFIG_LIVEPATCH */ >> - >> -#endif /* _LINUX_LIVEPATCH_SCHED_H_ */ >> diff --git a/include/linux/sched.h b/include/linux/sched.h >> index 5bdf80136e42..c5b0ef1ecfe4 100644 >> --- a/include/linux/sched.h >> +++ b/include/linux/sched.h >> @@ -36,7 +36,6 @@ >> #include >> #include >> #include >> -#include >> #include >> >> /* task_struct member predeclarations (sorted alphabetically): */ >> @@ -2087,9 +2086,6 @@ extern int __cond_resched(void); >> >> #if defined(CONFIG_PREEMPT_DYNAMIC) && >> defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) >> >> -void sched_dynamic_klp_enable(void); >> -void sched_dynamic_klp_disable(void); >> - >> DECLARE_STATIC_CALL(cond_resched, __cond_resched); >> >> static __always_inline int _cond_resched(void) >> @@ -2098,7 +2094,6 @@ static __always_inline int _cond_resched(void) >> } >> >> #elif defined(CONFIG_PREEMPT_DYNAMIC) && >> defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) >> - >> extern int dynamic_cond_resched(void); >> >> static __always_inline int _cond_resched(void) >> @@ -2106,25 +2101,20 @@ static __always_inline int _cond_resched(void) >> return dynamic_cond_resched(); >> } >> >> -#else /* !CONFIG_PREEMPTION */ >> +#else >> >> static inline int _cond_resched(void) >> { >> -klp_sched_try_switch(); >> return 
__cond_resched(); >> } >> >> -#endif /* PREEMPT_DYNAMIC && CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */ >> +#endif /* CONFIG_PREEMPT_DYNAMIC */ >> >> -#else /* CONFIG_PREEMPTION && !CONFIG_PREEMPT_DYNAMIC */ >> +#else >> >> -static inline int _cond_resched(void) >> -{ >> -klp_sched_try_switch(); >> -return 0; >> -} >> +static inline int _cond_resched(void) { return 0; } >> >> -#endif /* !CONFIG_PREEMPTION || CONFIG_PRE
Re: [RFC PATCH 07/86] Revert "livepatch,sched: Add livepatch task switching to cond_resched()"
Josh Poimboeuf writes: > On Thu, Nov 09, 2023 at 12:31:47PM -0500, Steven Rostedt wrote: >> On Thu, 9 Nov 2023 09:26:37 -0800 >> Josh Poimboeuf wrote: >> >> > On Tue, Nov 07, 2023 at 06:16:09PM -0500, Steven Rostedt wrote: >> > > On Tue, 7 Nov 2023 13:56:53 -0800 >> > > Ankur Arora wrote: >> > > >> > > > This reverts commit e3ff7c609f39671d1aaff4fb4a8594e14f3e03f8. >> > > > >> > > > Note that removing this commit reintroduces "live patches failing to >> > > > complete within a reasonable amount of time due to CPU-bound kthreads." >> > > > >> > > > Unfortunately this fix depends quite critically on PREEMPT_DYNAMIC and >> > > > existence of cond_resched() so this will need an alternate fix. >> > >> > We definitely don't want to introduce a regression, something will need >> > to be figured out before removing cond_resched(). >> > >> > We could hook into preempt_schedule_irq(), but that wouldn't work for >> > non-ORC. >> > >> > Another option would be to hook into schedule(). Then livepatch could >> > set TIF_NEED_RESCHED on remaining unpatched tasks. But again if they go >> > through the preemption path then we have the same problem for non-ORC. >> > >> > Worst case we'll need to sprinkle cond_livepatch() everywhere :-/ >> > >> >> I guess I'm not fully understanding what the cond rescheds are for. But >> would an IPI to all CPUs setting NEED_RESCHED, fix it? Yeah. We could just temporarily toggle to full preemption, when NEED_RESCHED_LAZY is always upgraded to NEED_RESCHED which will then send IPIs. > If all livepatch arches had the ORC unwinder, yes. > > The problem is that frame pointer (and similar) unwinders can't reliably > unwind past an interrupt frame. Ah, I wonder if we could just disable the preempt_schedule_irq() path temporarily? Hooking into schedule() alongside something like this: @@ -379,7 +379,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) void irqentry_exit_cond_resched(void) { - if (!preempt_count()) { + if (klp_cond_resched_disable() && !preempt_count()) { The problem would be tasks that don't go through any preemptible sections. -- ankur
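A rough sketch of what that gate could look like (klp_cond_resched_disable() is the hypothetical helper from the hunk above; none of this is from the posted series):

DEFINE_STATIC_KEY_FALSE(klp_transition_in_progress);

/*
 * True when it is safe to preempt from irqentry_exit_cond_resched(); the
 * livepatch core would flip the key while a transition is pending so that
 * frame-pointer unwinds of interrupted tasks stay reliable.
 */
static __always_inline bool klp_cond_resched_disable(void)
{
	return !static_branch_unlikely(&klp_transition_in_progress);
}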
Re: [RFC PATCH 07/86] Revert "livepatch,sched: Add livepatch task switching to cond_resched()"
Josh Poimboeuf writes: > On Thu, Nov 09, 2023 at 02:50:48PM -0800, Ankur Arora wrote: >> >> I guess I'm not fully understanding what the cond rescheds are for. But >> >> would an IPI to all CPUs setting NEED_RESCHED, fix it? >> >> Yeah. We could just temporarily toggle to full preemption, when >> NEED_RESCHED_LAZY is always upgraded to NEED_RESCHED which will >> then send IPIs. >> >> > If all livepatch arches had the ORC unwinder, yes. >> > >> > The problem is that frame pointer (and similar) unwinders can't reliably >> > unwind past an interrupt frame. >> >> Ah, I wonder if we could just disable the preempt_schedule_irq() path >> temporarily? Hooking into schedule() alongside something like this: >> >> @@ -379,7 +379,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs >> *regs) >> >> void irqentry_exit_cond_resched(void) >> { >> - if (!preempt_count()) { >> + if (klp_cond_resched_disable() && !preempt_count()) { >> >> The problem would be tasks that don't go through any preemptible >> sections. > > Let me back up a bit and explain what klp is trying to do. > > When a livepatch is applied, klp needs to unwind all the tasks, > preferably within a reasonable amount of time. > > We can't unwind task A from task B while task A is running, since task A > could be changing the stack during the unwind. So task A needs to be > blocked or asleep. The only exception to that is if the unwind happens > in the context of task A itself. > The problem we were seeing was CPU-bound kthreads (e.g., vhost_worker) > not getting patched within a reasonable amount of time. We fixed it by > hooking the klp unwind into cond_resched() so it can unwind from the > task itself. Right, so the task calls schedule() itself via cond_resched() and that works. If the task schedules out by calling preempt_enable() that presumably works as well. So, that leaves two paths where we can't unwind: 1. a task never entering or leaving preemptible sections 2. or, a task which gets preempted in irqentry_exit_cond_resched() This we could disable temporarily. > It only worked because we had a non-preempted hook (because non-ORC > unwinders can't unwind reliably through preemption) which called klp to > unwind from the context of the task. > > Without something to hook into, we have a problem. We could of course > hook into schedule(), but if the kthread never calls schedule() from a > non-preempted context then it still doesn't help. Yeah agreed. The first one is a problem. And, that's a problem with the removal of cond_resched() generally. Because the way to fix case 1 was typically to add a cond_resched() when softlockups were seen or in code review. -- ankur
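A sketch of the schedule() hook being discussed, reusing the helpers from the reverted patch (the SM_PREEMPT check and the placement are assumptions, not merged code): only attempt the transition when the task entered the scheduler voluntarily, so that a frame-pointer unwind of it is still reliable.

/* Called from __schedule(); sched_mode is the scheduler-internal entry flag. */
static __always_inline void klp_try_switch_on_voluntary_schedule(unsigned int sched_mode)
{
	if (IS_ENABLED(CONFIG_LIVEPATCH) && !(sched_mode & SM_PREEMPT))
		klp_sched_try_switch();	/* static-key gated, usually a no-op */
}

This still leaves case 1 above: a CPU-bound kthread that never blocks voluntarily won't pass through here either.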
Re: vfio-pci: protect remap_pfn_range() from simultaneous calls
Hi Bharat, Can you test the patch below to see if it works for you? Also could you add some more detail to your earlier description of the bug? In particular, AFAICS you are using ODP (-DPDK?) with multiple threads touching this region. From your stack, it looks like the fault was user-space generated, and I'm guessing you were not using the VFIO_IOMMU_MAP_DMA. Ankur -- >8 -- Subject: [PATCH] vfio-pci: protect io_remap_pfn_range() from simultaneous calls vfio_pci_mmap_fault() maps the complete VMA on fault. With concurrent faults, this would result in multiple calls to io_remap_pfn_range(), where it would hit a BUG_ON(!pte_none(*pte)) in remap_pte_range(). (It would also link the same VMA multiple times in vdev->vma_list but given the BUG_ON that is less serious.) Normally, however, this won't happen -- at least with vfio_iommu_type1 -- the VFIO_IOMMU_MAP_DMA path is protected by iommu->lock. If, however, we are using some kind of parallelization mechanism like this one with ktask under discussion [1], we would hit this. Even if we were doing this serially, given that vfio-pci remaps a larger extent than strictly necessary it should internally enforce coherence of its data structures. Handle this by using the VMA's presence in the vdev->vma_list as indicative of a fully mapped VMA and returning success early to all but the first VMA fault. Note that this is clearly optimstic given that the mapping is ongoing, and might mean that the caller sees more faults until the remap is done. [1] https://lore.kernel.org/linux-mm/20181105145141.6f993...@w520.home/ Signed-off-by: Ankur Arora --- drivers/vfio/pci/vfio_pci.c | 25 - 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 65e7e6b44578..b9f509863db1 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -1573,6 +1573,11 @@ static int __vfio_pci_add_vma(struct vfio_pci_device *vdev, { struct vfio_pci_mmap_vma *mmap_vma; + list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) { + if (mmap_vma->vma == vma) + return 1; + } + mmap_vma = kmalloc(sizeof(*mmap_vma), GFP_KERNEL); if (!mmap_vma) return -ENOMEM; @@ -1613,6 +1618,7 @@ static vm_fault_t vfio_pci_mmap_fault(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; struct vfio_pci_device *vdev = vma->vm_private_data; vm_fault_t ret = VM_FAULT_NOPAGE; + int vma_present; mutex_lock(&vdev->vma_lock); down_read(&vdev->memory_lock); @@ -1623,7 +1629,21 @@ static vm_fault_t vfio_pci_mmap_fault(struct vm_fault *vmf) goto up_out; } - if (__vfio_pci_add_vma(vdev, vma)) { + /* +* __vfio_pci_add_vma() either adds the vma to the vdev->vma_list +* (vma_present == 0), or indicates that the vma is already present +* on the list (vma_present == 1). +* +* Overload the meaning of this flag to also imply that the vma is +* fully mapped. This allows us to serialize the mapping -- ensuring +* that simultaneous faults will not both try to call +* io_remap_pfn_range(). +* +* However, this might mean that callers to which we returned success +* optimistically will see more faults until the remap is complete. 
+*/ + vma_present = __vfio_pci_add_vma(vdev, vma); + if (vma_present < 0) { ret = VM_FAULT_OOM; mutex_unlock(&vdev->vma_lock); goto up_out; @@ -1631,6 +1651,9 @@ static vm_fault_t vfio_pci_mmap_fault(struct vm_fault *vmf) mutex_unlock(&vdev->vma_lock); + if (vma_present) + goto up_out; + if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, vma->vm_end - vma->vm_start, vma->vm_page_prot)) ret = VM_FAULT_SIGBUS; -- 2.29.2
Re: [PATCH RFC 10/39] KVM: x86/xen: support upcall vector
On 2020-12-08 8:08 a.m., David Woodhouse wrote: On Wed, 2020-12-02 at 19:02 +, David Woodhouse wrote: I feel we could just accommodate it as subtype in KVM_XEN_ATTR_TYPE_CALLBACK_VIA. Don't see the adavantage in having another xen attr type. Yeah, fair enough. But kinda have mixed feelings in having kernel handling all event channels ABI, as opposed to only the ones userspace asked to offload. It looks a tad unncessary besides the added gain to VMMs that don't need to care about how the internals of event channels. But performance-wise it wouldn't bring anything better. But maybe, the former is reason enough to consider it. Yeah, we'll see. Especially when it comes to implementing FIFO event channels, I'd rather just do it in one place — and if the kernel does it anyway then it's hardly difficult to hook into that. But I've been about as coherent as I can be in email, and I think we're generally aligned on the direction. I'll do some more experiments and see what I can get working, and what it looks like. So... I did some more typing, and revived our completely userspace based implementation of event channels. I wanted to declare that such was *possible*, and that using the kernel for IPI and VIRQ was just a very desirable optimisation. It looks like Linux doesn't use the per-vCPU upcall vector that you called 'KVM_XEN_CALLBACK_VIA_EVTCHN'. So I'm delivering interrupts via KVM_INTERRUPT as if they were ExtINT ... except I'm not. Because the kernel really does expect that to be an ExtINT from a legacy PIC, and kvm_apic_accept_pic_intr() only returns true if LVT0 is set up for EXTINT and unmasked. I messed around with this hack and increasingly desperate variations on the theme (since this one doesn't cause the IRQ window to be opened to userspace in the first place), but couldn't get anything working: Increasingly desperate variations, about sums up my process as well while trying to get the upcall vector working. --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2380,6 +2380,9 @@ int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu) if ((lvt0 & APIC_LVT_MASKED) == 0 && GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT) r = 1; + /* Shoot me. */ + if (vcpu->arch.pending_external_vector == 243) + r = 1; return r; } Eventually I resorted to delivering the interrupt through the lapic *anyway* (through KVM_SIGNAL_MSI with an MSI message constructed for the appropriate vCPU/vector) and the following hack to auto-EOI: --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2416,7 +2419,7 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) */ apic_clear_irr(vector, apic); - if (test_bit(vector, vcpu_to_synic(vcpu)->auto_eoi_bitmap)) { + if (vector == 243 || test_bit(vector, vcpu_to_synic(vcpu)->auto_eoi_bitmap)) { /* * For auto-EOI interrupts, there might be another pending * interrupt above PPR, so check whether to raise another That works, and now my guest finishes the SMP bringup (and gets as far as waiting on the XenStore implementation that I haven't put back yet). So I think we need at least a tiny amount of support in-kernel for delivering event channel interrupt vectors, even if we wanted to allow for a completely userspace implementation. Unless I'm missing something? I did use the auto_eoi hack as well. So, yeah, I don't see any way of getting around this. Also, IIRC we had eventually gotten rid of the auto_eoi approach because that wouldn't work with APICv. At that point we resorted to direct queuing for vectored callbacks which was a hack that I never grew fond of... 
I will get on with implementing the in-kernel handling with IRQ routing entries targeting a given { port, vcpu }. And I'm kind of vacillating about whether the mode/vector should be separately configured, or whether they might as well be in the IRQ routing table too, even if it's kind of redundant because it's specified the same for *every* port targeting the same vCPU. I *think* I prefer that redundancy over having a separate configuration mechanism to set the vector for each vCPU. But we'll see what happens when my fingers do the typing... Good luck to your fingers! Ankur
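For reference, a userspace sketch of the KVM_SIGNAL_MSI delivery described above (vm_fd, apic_id and the vector -- 243, as in the hack -- are stand-ins; error handling is minimal):

#include <err.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Synthesize an MSI aimed at the target vCPU's (xAPIC) APIC ID. */
static void kick_upcall(int vm_fd, uint32_t apic_id, uint8_t vector)
{
	struct kvm_msi msi = {
		.address_lo = 0xfee00000 | (apic_id << 12),	/* physical dest */
		.address_hi = 0,
		.data = vector,					/* fixed delivery, edge */
	};

	if (ioctl(vm_fd, KVM_SIGNAL_MSI, &msi) < 0)
		err(1, "KVM_SIGNAL_MSI");
}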
Re: [PATCH RFC 03/39] KVM: x86/xen: register shared_info page
On 2020-12-01 5:07 a.m., David Woodhouse wrote:
On Wed, 2019-02-20 at 20:15 +, Joao Martins wrote:

+static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
+{
+	struct shared_info *shared_info;
+	struct page *page;
+
+	page = gfn_to_page(kvm, gfn);
+	if (is_error_page(page))
+		return -EINVAL;
+
+	kvm->arch.xen.shinfo_addr = gfn;
+
+	shared_info = page_to_virt(page);
+	memset(shared_info, 0, sizeof(struct shared_info));
+	kvm->arch.xen.shinfo = shared_info;
+	return 0;
+}
+

Hm. How come we get to pin the page and directly dereference it every time, while kvm_setup_pvclock_page() has to use kvm_write_guest_cached() instead?

Looking at my WIP trees from the time, this is something that we went back and forth on as well: use a pinned page, or a persistent kvm_vcpu_map().

I remember treating shared_info/vcpu_info differently from kvm_setup_pvclock_page() because shared_info is created early and is not expected to change during the lifetime of the guest. That didn't seem true for MSR_KVM_SYSTEM_TIME (or MSR_KVM_STEAL_TIME), which would either need a kvm_vcpu_map()/kvm_vcpu_unmap() dance or some kind of synchronization.

That said, I don't think this code explicitly disallows any updates to shared_info.

If that was allowed, wouldn't it have been a much simpler fix for CVE-2019-3016? What am I missing?

Agreed. Perhaps Paolo can chime in on why KVM never uses a pinned page and always prefers cached mappings instead?

Should I rework these to use kvm_write_guest_cached()?

kvm_vcpu_map() would be better. The event channel logic does RMW operations on shared_info->vcpu_info.

Ankur
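A sketch of the kvm_vcpu_map() variant (in-kernel API as of this thread's era; the function name, vcpu_info_gpa and port_word_bit are illustrative, and the shared_info pending/mask handling is omitted):

static int set_evtchn_pending_sketch(struct kvm_vcpu *vcpu, gpa_t vcpu_info_gpa,
				     int port_word_bit)
{
	struct kvm_host_map map;
	struct vcpu_info *vi;

	if (kvm_vcpu_map(vcpu, gpa_to_gfn(vcpu_info_gpa), &map))
		return -EFAULT;

	vi = map.hva + offset_in_page(vcpu_info_gpa);

	/* The RMW must be atomic: another vCPU may be updating the same words. */
	if (!test_and_set_bit(port_word_bit,
			      (unsigned long *)&vi->evtchn_pending_sel))
		WRITE_ONCE(vi->evtchn_upcall_pending, 1);

	kvm_vcpu_unmap(vcpu, &map, true);	/* mark the page dirty */
	return 0;
}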
Re: [PATCH RFC 02/39] KVM: x86/xen: intercept xen hypercalls if enabled
On 2020-12-01 1:48 a.m., David Woodhouse wrote: On Wed, 2019-02-20 at 20:15 +, Joao Martins wrote: Add a new exit reason for emulator to handle Xen hypercalls. Albeit these are injected only if guest has initialized the Xen hypercall page I've reworked this a little. I didn't like the inconsistency of allowing userspace to provide the hypercall pages even though the ABI is now defined by the kernel and it *has* to be VMCALL/VMMCALL. So I switched it to generate the hypercall page directly from the kernel, just like we do for the Hyper-V hypercall page. I introduced a new flag in the xen_hvm_config struct to enable this behaviour, and advertised it in the KVM_CAP_XEN_HVM return value. I also added the cpl and support for 6-argument hypercalls, and made it check the guest RIP when completing the call as discussed (although I still think that probably ought to be a generic thing). I adjusted the test case from my version of the patch, and added support for actually testing the hypercall page MSR. https://git.infradead.org/users/dwmw2/linux.git/shortlog/refs/heads/xenpv I'll go through and rebase your patch series at least up to patch 16 and collect them in that tree, then probably post them for real once I've got everything working locally. === From c037c329c8867b219afe2100e383c62e9db7b06d Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Wed, 13 Jun 2018 09:55:44 -0400 Subject: [PATCH] KVM: x86/xen: intercept xen hypercalls if enabled Add a new exit reason for emulator to handle Xen hypercalls. Since this means KVM owns the ABI, dispense with the facility for the VMM to provide its own copy of the hypercall pages; just fill them in directly using VMCALL/VMMCALL as we do for the Hyper-V hypercall page. This behaviour is enabled by a new INTERCEPT_HCALL flag in the KVM_XEN_HVM_CONFIG ioctl structure, and advertised by the same flag being returned from the KVM_CAP_XEN_HVM check. Add a test case and shift xen_hvm_config() to the nascent xen.c while we're at it. Signed-off-by: Joao Martins Signed-off-by: David Woodhouse --- arch/x86/include/asm/kvm_host.h | 6 + arch/x86/kvm/Makefile | 2 +- arch/x86/kvm/trace.h | 36 + arch/x86/kvm/x86.c| 46 +++--- arch/x86/kvm/xen.c| 140 ++ arch/x86/kvm/xen.h| 21 +++ include/uapi/linux/kvm.h | 19 +++ tools/testing/selftests/kvm/Makefile | 1 + tools/testing/selftests/kvm/lib/kvm_util.c| 1 + .../selftests/kvm/x86_64/xen_vmcall_test.c| 123 +++ 10 files changed, 365 insertions(+), 30 deletions(-) create mode 100644 arch/x86/kvm/xen.c create mode 100644 arch/x86/kvm/xen.h create mode 100644 tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c new file mode 100644 index ..6400a4bc8480 --- /dev/null +++ b/arch/x86/kvm/xen.c @@ -0,0 +1,140 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright © 2019 Oracle and/or its affiliates. All rights reserved. + * Copyright © 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * KVM Xen emulation + */ + +#include "x86.h" +#include "xen.h" + +#include + +#include + +#include "trace.h" + +int kvm_xen_hvm_config(struct kvm_vcpu *vcpu, u64 data) + { + struct kvm *kvm = vcpu->kvm; + u32 page_num = data & ~PAGE_MASK; + u64 page_addr = data & PAGE_MASK; + + /* +* If Xen hypercall intercept is enabled, fill the hypercall +* page with VMCALL/VMMCALL instructions since that's what +* we catch. Else the VMM has provided the hypercall pages +* with instructions of its own choosing, so use those. 
+*/ + if (kvm_xen_hypercall_enabled(kvm)) { + u8 instructions[32]; + int i; + + if (page_num) + return 1; + + /* mov imm32, %eax */ + instructions[0] = 0xb8; + + /* vmcall / vmmcall */ + kvm_x86_ops.patch_hypercall(vcpu, instructions + 5); + + /* ret */ + instructions[8] = 0xc3; + + /* int3 to pad */ + memset(instructions + 9, 0xcc, sizeof(instructions) - 9); + + for (i = 0; i < PAGE_SIZE / sizeof(instructions); i++) { + *(u32 *)&instructions[1] = i; + if (kvm_vcpu_write_guest(vcpu, +page_addr + (i * sizeof(instructions)), +instructions, sizeof(instructions))) + return 1; + } HYPERVISOR_iret isn't supported on 64bit so should be ud2 instead. Ankur + } else { + int lm = is_long_mode(vcpu); + u8 *blob_
Re: [PATCH RFC 03/39] KVM: x86/xen: register shared_info page
On 2020-12-01 5:26 p.m., David Woodhouse wrote On Tue, 2020-12-01 at 16:40 -0800, Ankur Arora wrote: On 2020-12-01 5:07 a.m., David Woodhouse wrote: On Wed, 2019-02-20 at 20:15 +, Joao Martins wrote: +static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn) +{ + struct shared_info *shared_info; + struct page *page; + + page = gfn_to_page(kvm, gfn); + if (is_error_page(page)) + return -EINVAL; + + kvm->arch.xen.shinfo_addr = gfn; + + shared_info = page_to_virt(page); + memset(shared_info, 0, sizeof(struct shared_info)); + kvm->arch.xen.shinfo = shared_info; + return 0; +} + Hm. How come we get to pin the page and directly dereference it every time, while kvm_setup_pvclock_page() has to use kvm_write_guest_cached() instead? So looking at my WIP trees from the time, this is something that we went back and forth on as well with using just a pinned page or a persistent kvm_vcpu_map(). OK, thanks. I remember distinguishing shared_info/vcpu_info from kvm_setup_pvclock_page() as shared_info is created early and is not expected to change during the lifetime of the guest which didn't seem true for MSR_KVM_SYSTEM_TIME (or MSR_KVM_STEAL_TIME) so that would either need to do a kvm_vcpu_map() kvm_vcpu_unmap() dance or do some kind of synchronization. That said, I don't think this code explicitly disallows any updates to shared_info. It needs to allow updates as well as disabling the shared_info pages. We're going to need that to implement SHUTDOWN_soft_reset for kexec. True. If that was allowed, wouldn't it have been a much simpler fix for CVE-2019-3016? What am I missing? Agreed. Perhaps, Paolo can chime in with why KVM never uses pinned page and always prefers to do cached mappings instead? Should I rework these to use kvm_write_guest_cached()? kvm_vcpu_map() would be better. The event channel logic does RMW operations on shared_info->vcpu_info. I've ported the shared_info/vcpu_info parts and made a test case, and was going back through to make it use kvm_write_guest_cached(). But I should probably plug on through the evtchn bits before I do that. I also don't see much locking on the cache, and can't convince myself that accessing the shared_info page from multiple CPUs with kvm_write_guest_cached() or kvm_map_gfn() is sane unless they each have their own cache. I think you could get a VCPU specific cache with kvm_vcpu_map(). What I have so far is at https://git.infradead.org/users/dwmw2/linux.git/shortlog/refs/heads/xenpv Thanks. Will take a look there. Ankur I'll do the event channel support tomorrow and hook it up to my actual VMM to give it some more serious testing.
Re: [PATCH RFC 02/39] KVM: x86/xen: intercept xen hypercalls if enabled
On 2020-12-02 12:03 a.m., David Woodhouse wrote:
On Tue, 2020-12-01 at 21:19 -0800, Ankur Arora wrote:

+		for (i = 0; i < PAGE_SIZE / sizeof(instructions); i++) {
+			*(u32 *)&instructions[1] = i;
+			if (kvm_vcpu_write_guest(vcpu,
+						 page_addr + (i * sizeof(instructions)),
+						 instructions, sizeof(instructions)))
+				return 1;
+		}

HYPERVISOR_iret isn't supported on 64bit so should be ud2 instead.

Yeah, I got part way through typing that part but concluded it probably wasn't a fast path that absolutely needed to be emulated in the kernel. The VMM can inject the UD# when it receives the hypercall.

That would work as well but if it's a straight ud2 on the hypercall page, wouldn't the guest just execute it when/if it does a HYPERVISOR_iret?

Ankur

I appreciate it *is* a guest-visible difference, if we're being really pedantic, but I don't think we were even going to be able to 100% hide the fact that it's not actually Xen.
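For concreteness, a sketch of the ud2 suggestion (a hunk that could slot into the loop quoted above; not the posted patch):

		if (i == __HYPERVISOR_iret) {
			u8 iret_blob[sizeof(instructions)];

			/* ud2, padded with int3: iret can't go via VMCALL */
			iret_blob[0] = 0x0f;
			iret_blob[1] = 0x0b;
			memset(iret_blob + 2, 0xcc, sizeof(iret_blob) - 2);

			if (kvm_vcpu_write_guest(vcpu,
						 page_addr + (i * sizeof(instructions)),
						 iret_blob, sizeof(iret_blob)))
				return 1;
			continue;
		}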
Re: [PATCH RFC 03/39] KVM: x86/xen: register shared_info page
On 2020-12-02 4:20 a.m., David Woodhouse wrote: On Wed, 2020-12-02 at 10:44 +, Joao Martins wrote: [late response - was on holiday yesterday] On 12/2/20 12:40 AM, Ankur Arora wrote: On 2020-12-01 5:07 a.m., David Woodhouse wrote: On Wed, 2019-02-20 at 20:15 +, Joao Martins wrote: +static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn) +{ + struct shared_info *shared_info; + struct page *page; + + page = gfn_to_page(kvm, gfn); + if (is_error_page(page)) + return -EINVAL; + + kvm->arch.xen.shinfo_addr = gfn; + + shared_info = page_to_virt(page); + memset(shared_info, 0, sizeof(struct shared_info)); + kvm->arch.xen.shinfo = shared_info; + return 0; +} + Hm. How come we get to pin the page and directly dereference it every time, while kvm_setup_pvclock_page() has to use kvm_write_guest_cached() instead? So looking at my WIP trees from the time, this is something that we went back and forth on as well with using just a pinned page or a persistent kvm_vcpu_map(). I remember distinguishing shared_info/vcpu_info from kvm_setup_pvclock_page() as shared_info is created early and is not expected to change during the lifetime of the guest which didn't seem true for MSR_KVM_SYSTEM_TIME (or MSR_KVM_STEAL_TIME) so that would either need to do a kvm_vcpu_map() kvm_vcpu_unmap() dance or do some kind of synchronization. That said, I don't think this code explicitly disallows any updates to shared_info. If that was allowed, wouldn't it have been a much simpler fix for CVE-2019-3016? What am I missing? Agreed. Perhaps, Paolo can chime in with why KVM never uses pinned page and always prefers to do cached mappings instead? Part of the CVE fix to not use cached versions. It's not a longterm pin of the page unlike we try to do here (partly due to the nature of the pages we are mapping) but we still we map the gpa, RMW the steal time struct, and then unmap the page. See record_steal_time() -- but more specifically commit b043138246 ("x86/KVM: Make sure KVM_VCPU_FLUSH_TLB flag is not missed"). But I am not sure it's a good idea to follow the same as record_steal_time() given that this is a fairly sensitive code path for event channels. Right. We definitely need to use atomic RMW operations (like the CVE fix did) so the page needs to be *mapped*. My question was about a permanent pinned mapping vs the map/unmap as we need it that record_steal_time() does. On IRC, Paolo told me that permanent pinning causes problems for memory hotplug, and pointed me at the trick we do with an MMU notifier and kvm_vcpu_reload_apic_access_page(). Okay that answers my question. Thanks for clearing that up. Not sure of a good place to document this but it would be good to have this written down somewhere. Maybe kvm_map_gfn()? I'm going to stick with the pinning we have for the moment, and just fix up the fact that it leaks the pinned pages if the guest sets the shared_info address more than once. At some point the apic page MMU notifier thing can be made generic, and we can use that for this and for KVM steal time too. Yeah, that's something that'll definitely be good to have. Ankur
Re: [PATCH RFC 03/39] KVM: x86/xen: register shared_info page
On 2020-12-02 2:44 a.m., Joao Martins wrote: [late response - was on holiday yesterday] On 12/2/20 12:40 AM, Ankur Arora wrote: On 2020-12-01 5:07 a.m., David Woodhouse wrote: On Wed, 2019-02-20 at 20:15 +, Joao Martins wrote: +static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn) +{ + struct shared_info *shared_info; + struct page *page; + + page = gfn_to_page(kvm, gfn); + if (is_error_page(page)) + return -EINVAL; + + kvm->arch.xen.shinfo_addr = gfn; + + shared_info = page_to_virt(page); + memset(shared_info, 0, sizeof(struct shared_info)); + kvm->arch.xen.shinfo = shared_info; + return 0; +} + Hm. How come we get to pin the page and directly dereference it every time, while kvm_setup_pvclock_page() has to use kvm_write_guest_cached() instead? So looking at my WIP trees from the time, this is something that we went back and forth on as well with using just a pinned page or a persistent kvm_vcpu_map(). I remember distinguishing shared_info/vcpu_info from kvm_setup_pvclock_page() as shared_info is created early and is not expected to change during the lifetime of the guest which didn't seem true for MSR_KVM_SYSTEM_TIME (or MSR_KVM_STEAL_TIME) so that would either need to do a kvm_vcpu_map() kvm_vcpu_unmap() dance or do some kind of synchronization. That said, I don't think this code explicitly disallows any updates to shared_info. If that was allowed, wouldn't it have been a much simpler fix for CVE-2019-3016? What am I missing? Agreed. Perhaps, Paolo can chime in with why KVM never uses pinned page and always prefers to do cached mappings instead? Part of the CVE fix to not use cached versions. It's not a longterm pin of the page unlike we try to do here (partly due to the nature of the pages we are mapping) but we still we map the gpa, RMW the steal time struct, and then unmap the page. See record_steal_time() -- but more specifically commit b043138246 ("x86/KVM: Make sure KVM_VCPU_FLUSH_TLB flag is not missed"). But I am not sure it's a good idea to follow the same as record_steal_time() given that this is a fairly sensitive code path for event channels. Should I rework these to use kvm_write_guest_cached()? kvm_vcpu_map() would be better. The event channel logic does RMW operations on shared_info->vcpu_info. Indeed, yes. Ankur IIRC, we saw missed event channels notifications when we were using the {write,read}_cached() version of the patch. But I can't remember the reason it was due to, either the evtchn_pending or the mask word -- which would make it not inject an upcall. If memory serves, it was the mask. Though I don't think that we had kvm_{write,read}_cached in use at that point -- given that they were definitely not RMW safe. Ankur Joao
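The pattern that is not RMW-safe, as a sketch (ghc is assumed to cache the word being updated): two vCPUs doing this concurrently against the same shared_info word can each overwrite the other's update, so a just-set pending bit or a cleared mask bit is silently lost and no upcall gets injected.

static void racy_set_bit(struct kvm *kvm, struct gfn_to_hva_cache *ghc, int bit)
{
	unsigned long word;

	kvm_read_guest_cached(kvm, ghc, &word, sizeof(word));	/* read   */
	word |= 1UL << bit;					/* modify */
	kvm_write_guest_cached(kvm, ghc, &word, sizeof(word));	/* write  */
}

With a persistently mapped page the same update is a single test_and_set_bit() on the mapping, which is why kvm_vcpu_map() (or pinning) is preferable here.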
Re: [PATCH RFC 10/39] KVM: x86/xen: support upcall vector
On 2020-12-02 11:02 a.m., David Woodhouse wrote: On Wed, 2020-12-02 at 18:34 +, Joao Martins wrote: On 12/2/20 4:47 PM, David Woodhouse wrote: On Wed, 2020-12-02 at 13:12 +, Joao Martins wrote: On 12/2/20 11:17 AM, David Woodhouse wrote: I might be more inclined to go for a model where the kernel handles the evtchn_pending/evtchn_mask for us. What would go into the irq routing table is { vcpu, port# } which get passed to kvm_xen_evtchn_send(). But passing port to the routing and handling the sending of events wouldn't it lead to unnecessary handling of event channels which aren't handled by the kernel, compared to just injecting caring about the upcall? Well, I'm generally in favour of *not* doing things in the kernel that don't need to be there. But if the kernel is going to short-circuit the IPIs and VIRQs, then it's already going to have to handle the evtchn_pending/evtchn_mask bitmaps, and actually injecting interrupts. Right. I was trying to point that out in the discussion we had in next patch. But true be told, more about touting the idea of kernel knowing if a given event channel is registered for userspace handling, rather than fully handling the event channel. I suppose we are able to provide both options to the VMM anyway i.e. 1) letting them handle it enterily in userspace by intercepting EVTCHNOP_send, or through the irq route if we want kernel to offload it. Right. The kernel takes what it knows about and anything else goes up to userspace. I do like the way you've handled the vcpu binding in userspace, and the kernel just knows that a given port goes to a given target CPU. For the VMM API I think we should follow the Xen model, mixing the domain-wide and per-vCPU configuration. It's the best way to faithfully model the behaviour a true Xen guest would experience. So KVM_XEN_ATTR_TYPE_CALLBACK_VIA can be used to set one of • HVMIRQ_callback_vector, taking a vector# • HVMIRQ_callback_gsi for the in-kernel irqchip, taking a GSI# And *maybe* in a later patch it could also handle • HVMIRQ_callback_gsi for split-irqchip, taking an eventfd • HVMIRQ_callback_pci_intx, taking an eventfd (or a pair, for EOI?) Most of the Xen versions we were caring had callback_vector and vcpu callback vector (despite Linux not using the latter). But if you're dating back to 3.2 and 4.1 well (or certain Windows drivers), I suppose gsi and pci-intx are must-haves. Note sure about GSI but PCI-INTX is definitely something I've seen in active use by customers recently. I think SLES10 will use that. I feel we could just accommodate it as subtype in KVM_XEN_ATTR_TYPE_CALLBACK_VIA. Don't see the adavantage in having another xen attr type. Yeah, fair enough. But kinda have mixed feelings in having kernel handling all event channels ABI, as opposed to only the ones userspace asked to offload. It looks a tad unncessary besides the added gain to VMMs that don't need to care about how the internals of event channels. But performance-wise it wouldn't bring anything better. But maybe, the former is reason enough to consider it. Yeah, we'll see. Especially when it comes to implementing FIFO event channels, I'd rather just do it in one place — and if the kernel does it anyway then it's hardly difficult to hook into that. Sorry I'm late to this conversation. Not a whole lot to add to what Joao said. I would only differ with him on how much to offload. Given that we need the fast path in the kernel anyway, I think it's simpler to do all the event-channel bitmap only in the kernel. 
This would also simplify using the kernel Xen drivers if someone eventually decides to use them.

Ankur

But I've been about as coherent as I can be in email, and I think we're generally aligned on the direction. I'll do some more experiments and see what I can get working, and what it looks like.

I'm focusing on making the shinfo stuff all use kvm_map_gfn() first.
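One possible shape for the { port, vcpu } routing entry discussed above -- illustrative only, since the uAPI hadn't been settled at this point in the thread -- showing how little per-port state the kernel needs:

struct kvm_irq_routing_xen_evtchn {
	__u32 port;	/* guest event-channel port */
	__u32 vcpu;	/* target vCPU index */
	__u32 priority;	/* reserved for FIFO event channels; 0 for 2-level */
};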
Re: vfio-pci: protect remap_pfn_range() from simultaneous calls
Hi Bharat, So I don't have a final patch for you, but can you test the one below the scissors mark? (The patch is correct, but I'm not happy that it introduces a new lock.) Ankur -- >8 -- Date: Thu, 21 Jan 2021 00:04:36 + Subject: vfio-pci: protect io_remap_pfn_range() from simultaneous calls A fix for CVE-2020-12888 changed the mmap to be dynamic so it gets zapped out and faulted back in based on the state of the PCI_COMMAND register. The original code flow was: vfio_iommu_type1::vfio_device_mmap() vfio_pci::vfio_pci_mmap() remap_pfn_range(vma->vm_start .. vma->vm_end); vfio_iommu_type1::vfio_iommu_type1_ioctl(VFIO_PIN_MAP_DMA) get_user_pages_remote() iommu_map() Which became: vfio_iommu_type1::vfio_device_mmap() vfio_pci::vfio_pci_mmap() /* Setup vm->vm_ops */ vfio_iommu_type1::vfio_iommu_type1_ioctl(VFIO_PIN_MAP_DMA) get_user_pages_remote() follow_fault_pfn(vma, vaddr); /* For missing pages */ fixup_user_fault() handle_mm_fault() vfio_pci::vfio_pci_mmap_fault() io_remap_pfn_range(vma->vm_start .. vma->vm_end); iommu_map() With this, simultaneous calls to VFIO_PIN_MAP_DMA for an overlapping region, would mean potentially multiple calls to io_remap_pfn_range() -- each of which try to remap the full extent. This ends up hitting BUG_ON(!pte_none(*pte)) in remap_pte_range() because we are mapping an already mapped pte. The fix is to elide calls to io_remap_pfn_range() if the VMA is already mapped. Fixes: abafbc551fdd ("vfio-pci: Invalidate mmaps and block MMIO access on disabled memory") Signed-off-by: Ankur Arora --- drivers/vfio/pci/vfio_pci.c | 48 ++--- drivers/vfio/pci/vfio_pci_private.h | 2 ++ 2 files changed, 47 insertions(+), 3 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 706de3ef94bb..db7a2a716f51 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -64,6 +64,11 @@ static bool disable_denylist; module_param(disable_denylist, bool, 0444); MODULE_PARM_DESC(disable_denylist, "Disable use of device denylist. 
Disabling the denylist allows binding to devices with known errata that may lead to exploitable stability or security issues when accessed by untrusted users."); +struct vdev_vma_priv { + struct vfio_pci_device *vdev; + bool vma_mapped; +}; + static inline bool vfio_vga_disabled(void) { #ifdef CONFIG_VFIO_PCI_VGA @@ -1527,15 +1532,20 @@ static int vfio_pci_zap_and_vma_lock(struct vfio_pci_device *vdev, bool try) list_for_each_entry_safe(mmap_vma, tmp, &vdev->vma_list, vma_next) { struct vm_area_struct *vma = mmap_vma->vma; + struct vdev_vma_priv *p; if (vma->vm_mm != mm) continue; list_del(&mmap_vma->vma_next); kfree(mmap_vma); + p = vma->vm_private_data; + mutex_lock(&vdev->map_lock); zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); + p->vma_mapped = false; + mutex_unlock(&vdev->map_lock); } mutex_unlock(&vdev->vma_lock); mmap_read_unlock(mm); @@ -1591,12 +1601,19 @@ static int __vfio_pci_add_vma(struct vfio_pci_device *vdev, */ static void vfio_pci_mmap_open(struct vm_area_struct *vma) { + struct vdev_vma_priv *p = vma->vm_private_data; + struct vfio_pci_device *vdev = p->vdev; + + mutex_lock(&vdev->map_lock); + p->vma_mapped = false; zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); + mutex_unlock(&vdev->map_lock); } static void vfio_pci_mmap_close(struct vm_area_struct *vma) { - struct vfio_pci_device *vdev = vma->vm_private_data; + struct vdev_vma_priv *p = vma->vm_private_data; + struct vfio_pci_device *vdev = p->vdev; struct vfio_pci_mmap_vma *mmap_vma; mutex_lock(&vdev->vma_lock); @@ -1604,6 +1621,7 @@ static void vfio_pci_mmap_close(struct vm_area_struct *vma) if (mmap_vma->vma == vma) { list_del(&mmap_vma->vma_next); kfree(mmap_vma); + kfree(p); break; } } @@ -1613,7 +1631,8 @@ static void vfio_pci_mmap_close(struct vm_area_struct *vma) static vm_fault_t vfio_pci_mmap_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - struct vfio_pci_device *vdev = vma->vm_private_data; + struct vdev_vma_priv
Re: [EXT] Re: vfio-pci: protect remap_pfn_range() from simultaneous calls
On 2021-03-02 4:47 a.m., Bharat Bhushan wrote: Hi Ankur, -Original Message- From: Ankur Arora Sent: Friday, February 26, 2021 6:24 AM To: Bharat Bhushan Cc: alex.william...@redhat.com; ankur.a.ar...@oracle.com; linux- ker...@vger.kernel.org; Sunil Kovvuri Goutham ; termi...@gmail.com Subject: [EXT] Re: vfio-pci: protect remap_pfn_range() from simultaneous calls External Email -- Hi Bharat, Can you test the patch below to see if it works for you? Sorry for late reply, I actually missed this email. Reproducibility of the issue was low in my test scenario, one out of ~15 runs. I run it multiple times, overnight and observed no issues. Awesome. Would you mind giving me your Tested-by for the patch? Also could you add some more detail to your earlier description of the bug? Our test case is running ODP multi-threaded application, where parent process maps (yes it uses MAP_DMA) the region and then child processes access same. As a workaround we tried accessing the region once by parent process before creating other accessor threads and it worked as expected. Thanks for the detail. So if the child processes start early -- they might fault while the VFIO_IOMMU_MAP_DMA was going on. And, since they only acquire mmap_lock in RO mode, both paths would end up calling io_remap_pfn_range() via the fault handler. Thanks Ankur Thanks -Bharat In particular, AFAICS you are using ODP (-DPDK?) with multiple threads touching this region. From your stack, it looks like the fault was user-space generated, and I'm guessing you were not using the VFIO_IOMMU_MAP_DMA. Ankur -- >8 -- Subject: [PATCH] vfio-pci: protect io_remap_pfn_range() from simultaneous calls vfio_pci_mmap_fault() maps the complete VMA on fault. With concurrent faults, this would result in multiple calls to io_remap_pfn_range(), where it would hit a BUG_ON(!pte_none(*pte)) in remap_pte_range(). (It would also link the same VMA multiple times in vdev->vma_list but given the BUG_ON that is less serious.) Normally, however, this won't happen -- at least with vfio_iommu_type1 -- the VFIO_IOMMU_MAP_DMA path is protected by iommu->lock. If, however, we are using some kind of parallelization mechanism like this one with ktask under discussion [1], we would hit this. Even if we were doing this serially, given that vfio-pci remaps a larger extent than strictly necessary it should internally enforce coherence of its data structures. Handle this by using the VMA's presence in the vdev->vma_list as indicative of a fully mapped VMA and returning success early to all but the first VMA fault. Note that this is clearly optimstic given that the mapping is ongoing, and might mean that the caller sees more faults until the remap is done. 
[1] https://urldefense.proofpoint.com/v2/url?u=https-3A__lore.kernel.org_linux- 2Dmm_20181105145141.6f9937f6- 40w520.home_&d=DwIDAg&c=nKjWec2b6R0mOyPaz7xtfQ&r=PAAlWswPe7d8gHl GbCLmy2YezyK7O3Hv_t2heGnouBw&m=3ZDXqnn9xNUCjgXwN9mHIKT7oyXu55P U7yV2j0b-5hw&s=hiICkNtrcH4AbAWRrbkvMUylp7Bv0YHFCjxNGC6CGOk&e= Signed-off-by: Ankur Arora --- drivers/vfio/pci/vfio_pci.c | 25 - 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 65e7e6b44578..b9f509863db1 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -1573,6 +1573,11 @@ static int __vfio_pci_add_vma(struct vfio_pci_device *vdev, { struct vfio_pci_mmap_vma *mmap_vma; + list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) { + if (mmap_vma->vma == vma) + return 1; + } + mmap_vma = kmalloc(sizeof(*mmap_vma), GFP_KERNEL); if (!mmap_vma) return -ENOMEM; @@ -1613,6 +1618,7 @@ static vm_fault_t vfio_pci_mmap_fault(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; struct vfio_pci_device *vdev = vma->vm_private_data; vm_fault_t ret = VM_FAULT_NOPAGE; + int vma_present; mutex_lock(&vdev->vma_lock); down_read(&vdev->memory_lock); @@ -1623,7 +1629,21 @@ static vm_fault_t vfio_pci_mmap_fault(struct vm_fault *vmf) goto up_out; } - if (__vfio_pci_add_vma(vdev, vma)) { + /* +* __vfio_pci_add_vma() either adds the vma to the vdev->vma_list +* (vma_present == 0), or indicates that the vma is already present +* on the list (vma_present == 1). +* +* Overload the meaning of this flag to also imply that the vma is +* fully mapped. This allows us to serialize the mapping -- ensuring +* that simultaneous faults will not both try to call +* io_remap_pfn_range(). +* +* However, this might mean that callers to which we returne
Re: vfio-pci: protect remap_pfn_range() from simultaneous calls
On 2021-01-06 8:17 a.m., Bharat Bhushan wrote: Hi Ankur, We are observing below BUG_ON() with latest kernel [10011.321645] [ cut here ] [10011.322262] kernel BUG at mm/memory.c:1816! [10011.323793] Internal error: Oops - BUG: 0 [#1] PREEMPT SMP [10011.326108] CPU: 2 PID: 1147 Comm: odp_l2fwd Not tainted 5.4.74-05938-gb9598e49fe61 #15 [10011.328272] Hardware name: Marvell CN106XX board (DT) [10011.330328] pstate: 8049 (Nzcv daif +PAN -UAO) [10011.332402] pc : remap_pfn_range+0x1a4/0x260 [10011.334383] lr : remap_pfn_range+0x14c/0x260 [10011.335911] sp : 8000156afc10 [10011.337360] x29: 8000156afc10 x28: ffdffa24 [10011.339671] x27: 00014a241000 x26: 00218200 [10011.341984] x25: 0001489fbe00 x24: 00218204 [10011.344279] x23: 00218204 x22: 00680fc3 [10011.346539] x21: 00218204 x20: 000149d70860 [10011.348846] x19: 0041 x18: [10011.351064] x17: x16: [10011.353304] x15: x14: [10011.355519] x13: x12: [10011.357812] x11: x10: ffdfffe0 [10011.360136] x9 : x8 : [10011.362414] x7 : x6 : 04218200 [10011.364773] x5 : 0001 x4 : [10011.367103] x3 : ffe000328928 x2 : 016800017c240fc3 [10011.369462] x1 : x0 : ffe000328928 [10011.371694] Call trace: [10011.373510] remap_pfn_range+0x1a4/0x260 [10011.375386] vfio_pci_mmap_fault+0x9c/0x114 [10011.377346] __do_fault+0x38/0x100 [10011.379253] __handle_mm_fault+0x81c/0xce4 [10011.381247] handle_mm_fault+0xb4/0x17c [10011.383220] do_page_fault+0x110/0x430 [10011.385188] do_translation_fault+0x80/0x90 [10011.387069] do_mem_abort+0x3c/0xa0 [10011.388852] el0_da+0x20/0x24 [10011.391239] Code: eb1a02ff 5480 f9400362 b4fffe42 (d421) [10011.393306] ---[ end trace ae8b75b32426d53c ]--- [10011.395140] note: odp_l2fwd[1147] exited with preempt_count 2 This is observed after patch "vfio-pci: Fault mmaps to enable vma tracking" where actual mapping delayed on page fault. When address of same page accessed by multiple threads at/around same time by threads running on different cores causes page fault for same page on multiple cores at same time. One of the fault hander creates mapping while second hander find that page-table mapping already exists and leads to above kernel BUG_ON(). Yeah, that's what my fix addressed as well. While article https://lwn.net/Articles/828536/ suggest that you have already faced and fixed this issue "- vfio-pci: protect remap_pfn_range() from simultaneous calls (Ankur Arora) [Orabug: 31663628] {CVE-2020-12888} {CVE-2020-12888}" But I do not see any patch submitted or under review in upstream, hopefully I did not missed some discussion. Please let us know in case you already submitted or planning to submit fix or someone else fixed same. No you haven't missed a discussion on this. For upstream this was more of a theoretical race so I dallied a bit before sending the patch upstream. I'll submit a patch soon. Also, would you mind if I ask you to run this failing test before submission? Thanks Ankur Thanks -Bharat
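The failing test boils down to something like the following (illustrative: `base` is a freshly mmap()ed, not-yet-touched vfio-pci BAR of at least two pages; the device open/mmap plumbing is omitted):

#include <pthread.h>
#include <stdint.h>

static void *touch(void *p)
{
	*(volatile uint32_t *)p;	/* first access faults into vfio_pci_mmap_fault() */
	return NULL;
}

static void race(void *base)
{
	pthread_t t1, t2;

	/* two concurrent faults in the same VMA, different pages */
	pthread_create(&t1, NULL, touch, base);
	pthread_create(&t2, NULL, touch, (uint8_t *)base + 4096);
	pthread_join(t1, NULL);
	pthread_join(t2, NULL);
}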
Re: [PATCH RFC 00/39] x86/KVM: Xen HVM guest support
On 2/20/19 1:09 PM, Paolo Bonzini wrote: On 20/02/19 21:15, Joao Martins wrote: 2. PV Driver support (patches 17 - 39) We start by redirecting hypercalls from the backend to routines which emulate the behaviour that PV backends expect i.e. grant table and interdomain events. Next, we add support for late initialization of xenbus, followed by implementing frontend/backend communication mechanisms (i.e. grant tables and interdomain event channels). Finally, introduce xen-shim.ko, which will setup a limited Xen environment. This uses the added functionality of Xen specific shared memory (grant tables) and notifications (event channels). I am a bit worried by the last patches, they seem really brittle and prone to breakage. I don't know Xen well enough to understand if the lack of support for GNTMAP_host_map is fixable, but if not, you have to define a completely different hypercall. I assume you are aware of most of this, but just in case, here's the flow when a backend driver wants to map a grant-reference in the host: it allocates an empty struct page (via ballooning) and does a map_grant_ref(GNTMAP_host_map) hypercall. In response, Xen validates the grant-reference and maps it onto the address associated with the struct page. After this, from the POV of the underlying network/block drivers, these struct pages can be used as just regular pages. To support this in a KVM environment, where AFAICS no remapping of pages is possible, the idea was to make minimal changes to the backend drivers such that map_grant_ref() could just return the PFN from which the backend could derive the struct page. To ensure that backends -- when running in this environment -- have been modified to deal with these new semantics, our map_grant_ref() implementation explicitly disallows the GNTMAP_host_map flag. Now if I'm reading you right, you would prefer something more straightforward -- perhaps similar semantics but a new flag that makes this behaviour explicit? Of course, tests are missing. You should use the tools/testing/selftests/kvm/ framework, and ideally each patch should come with coverage for the newly-added code. Agreed. Thanks Ankur Thanks, Paolo
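For concreteness, a very rough sketch of the map_grant_ref() semantics described above (shim_lookup_grant() and the use of dev_bus_addr to carry the frame are assumptions for illustration, not the code from the series):

static int shim_map_grant_ref(struct gnttab_map_grant_ref *op)
{
	unsigned long gfn;

	/* Backends must be converted to the new semantics. */
	if (op->flags & GNTMAP_host_map) {
		op->status = GNTST_general_error;
		return 0;
	}

	gfn = shim_lookup_grant(op->dom, op->ref);	/* hypothetical helper */
	if (!gfn) {
		op->status = GNTST_bad_gntref;
		return 0;
	}

	/* Hand the frame back; the backend derives the struct page itself. */
	op->dev_bus_addr = (u64)gfn << PAGE_SHIFT;
	op->status = GNTST_okay;
	return 0;
}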
Re: [Xen-devel] [PATCH RFC 00/39] x86/KVM: Xen HVM guest support
On 2/20/19 3:39 PM, Marek Marczykowski-Górecki wrote: On Wed, Feb 20, 2019 at 08:15:30PM +, Joao Martins wrote: 2. PV Driver support (patches 17 - 39) We start by redirecting hypercalls from the backend to routines which emulate the behaviour that PV backends expect i.e. grant table and interdomain events. Next, we add support for late initialization of xenbus, followed by implementing frontend/backend communication mechanisms (i.e. grant tables and interdomain event channels). Finally, introduce xen-shim.ko, which will setup a limited Xen environment. This uses the added functionality of Xen specific shared memory (grant tables) and notifications (event channels). Does it mean backends could be run in another guest, similarly as on real Xen? AFAIK virtio doesn't allow that as virtio backends need I'm afraid not. For now grant operations (map/unmap) can only be done by backends to the local KVM instance. Ankur arbitrary write access to guest memory. But grant tables provide enough abstraction to do that safely.
[RFC PATCH 10/16] xen/balloon: support ballooning in xenhost_t
Xen ballooning uses hollow struct pages (with the underlying GFNs being populated/unpopulated via hypercalls) which are used by the grant logic to map grants from other domains. This patch allows the default xenhost to provide an alternate ballooning allocation mechanism. This is expected to be useful for local xenhosts (type xenhost_r0) because unlike Xen, where there is an external hypervisor which can change the memory underneath a GFN, that is not possible when the hypervisor is running in the same address space as the entity doing the ballooning. Co-developed-by: Ankur Arora Signed-off-by: Joao Martins Signed-off-by: Ankur Arora --- arch/x86/xen/enlighten_hvm.c | 7 +++ arch/x86/xen/enlighten_pv.c| 8 drivers/xen/balloon.c | 19 --- drivers/xen/grant-table.c | 4 ++-- drivers/xen/privcmd.c | 4 ++-- drivers/xen/xen-selfballoon.c | 2 ++ drivers/xen/xenbus/xenbus_client.c | 6 +++--- drivers/xen/xlate_mmu.c| 4 ++-- include/xen/balloon.h | 4 ++-- include/xen/xenhost.h | 19 +++ 10 files changed, 63 insertions(+), 14 deletions(-) diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index efe483ceeb9a..a371bb9ee478 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -130,9 +130,16 @@ xenhost_ops_t xh_hvm_ops = { .setup_shared_info = xen_hvm_init_shared_info, .reset_shared_info = xen_hvm_reset_shared_info, .probe_vcpu_id = xen_hvm_probe_vcpu_id, + + /* We just use the default method of ballooning. */ + .alloc_ballooned_pages = NULL, + .free_ballooned_pages = NULL, }; xenhost_ops_t xh_hvm_nested_ops = { + /* Nested xenhosts, are disallowed ballooning */ + .alloc_ballooned_pages = NULL, + .free_ballooned_pages = NULL, }; static void __init init_hvm_pv_info(void) diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 77b1a0d4aef2..2e94e02cdbb4 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -1247,11 +1247,19 @@ xenhost_ops_t xh_pv_ops = { .reset_shared_info = xen_pv_reset_shared_info, .probe_vcpu_id = xen_pv_probe_vcpu_id, + + /* We just use the default method of ballooning. */ + .alloc_ballooned_pages = NULL, + .free_ballooned_pages = NULL, }; xenhost_ops_t xh_pv_nested_ops = { .cpuid_base = xen_pv_nested_cpuid_base, .setup_hypercall_page = NULL, + + /* Nested xenhosts, are disallowed ballooning */ + .alloc_ballooned_pages = NULL, + .free_ballooned_pages = NULL, }; /* First C function to be called on Xen boot */ diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 5ef4d6ad920d..08becf574743 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -63,6 +63,7 @@ #include #include +#include #include #include @@ -583,12 +584,21 @@ static int add_ballooned_pages(int nr_pages) * @pages: pages returned * @return 0 on success, error otherwise */ -int alloc_xenballooned_pages(int nr_pages, struct page **pages) +int alloc_xenballooned_pages(xenhost_t *xh, int nr_pages, struct page **pages) { int pgno = 0; struct page *page; int ret; + /* +* xenmem transactions for remote xenhost are disallowed. 
+*/ + if (xh->type == xenhost_r2) + return -EINVAL; + + if (xh->ops->alloc_ballooned_pages) + return xh->ops->alloc_ballooned_pages(xh, nr_pages, pages); + mutex_lock(&balloon_mutex); balloon_stats.target_unpopulated += nr_pages; @@ -620,7 +630,7 @@ int alloc_xenballooned_pages(int nr_pages, struct page **pages) return 0; out_undo: mutex_unlock(&balloon_mutex); - free_xenballooned_pages(pgno, pages); + free_xenballooned_pages(xh, pgno, pages); return ret; } EXPORT_SYMBOL(alloc_xenballooned_pages); @@ -630,10 +640,13 @@ EXPORT_SYMBOL(alloc_xenballooned_pages); * @nr_pages: Number of pages * @pages: pages to return */ -void free_xenballooned_pages(int nr_pages, struct page **pages) +void free_xenballooned_pages(xenhost_t *xh, int nr_pages, struct page **pages) { int i; + if (xh->ops->free_ballooned_pages) + return xh->ops->free_ballooned_pages(xh, nr_pages, pages); + mutex_lock(&balloon_mutex); for (i = 0; i < nr_pages; i++) { diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 98af259d0d4f..ec90769907a4 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -804,7 +804,7 @@ int gnttab_alloc_pages(int nr_pages, struct page **pages) { int ret; - ret = alloc_xenballooned_pages(nr_pages, pages); + ret = alloc_xenballooned_pages(xh_default, nr_pages, pages); if (ret < 0)
[RFC PATCH 13/16] drivers/xen: gnttab, evtchn, xenbus API changes
Mechanical changes, now most of these calls take xenhost_t * as parameter. Co-developed-by: Joao Martins Signed-off-by: Ankur Arora --- drivers/xen/cpu_hotplug.c | 14 ++--- drivers/xen/gntalloc.c| 13 drivers/xen/gntdev.c | 16 +++ drivers/xen/manage.c | 37 ++- drivers/xen/platform-pci.c| 12 +++- drivers/xen/sys-hypervisor.c | 12 drivers/xen/xen-balloon.c | 10 +++--- drivers/xen/xenfs/xenstored.c | 7 --- 8 files changed, 73 insertions(+), 48 deletions(-) diff --git a/drivers/xen/cpu_hotplug.c b/drivers/xen/cpu_hotplug.c index afeb94446d34..4a05bc028956 100644 --- a/drivers/xen/cpu_hotplug.c +++ b/drivers/xen/cpu_hotplug.c @@ -31,13 +31,13 @@ static void disable_hotplug_cpu(int cpu) unlock_device_hotplug(); } -static int vcpu_online(unsigned int cpu) +static int vcpu_online(xenhost_t *xh, unsigned int cpu) { int err; char dir[16], state[16]; sprintf(dir, "cpu/%u", cpu); - err = xenbus_scanf(xh_default, XBT_NIL, dir, "availability", "%15s", state); + err = xenbus_scanf(xh, XBT_NIL, dir, "availability", "%15s", state); if (err != 1) { if (!xen_initial_domain()) pr_err("Unable to read cpu state\n"); @@ -52,12 +52,12 @@ static int vcpu_online(unsigned int cpu) pr_err("unknown state(%s) on CPU%d\n", state, cpu); return -EINVAL; } -static void vcpu_hotplug(unsigned int cpu) +static void vcpu_hotplug(xenhost_t *xh, unsigned int cpu) { if (!cpu_possible(cpu)) return; - switch (vcpu_online(cpu)) { + switch (vcpu_online(xh, cpu)) { case 1: enable_hotplug_cpu(cpu); break; @@ -78,7 +78,7 @@ static void handle_vcpu_hotplug_event(struct xenbus_watch *watch, cpustr = strstr(path, "cpu/"); if (cpustr != NULL) { sscanf(cpustr, "cpu/%u", &cpu); - vcpu_hotplug(cpu); + vcpu_hotplug(watch->xh, cpu); } } @@ -93,7 +93,7 @@ static int setup_cpu_watcher(struct notifier_block *notifier, (void)register_xenbus_watch(xh_default, &cpu_watch); for_each_possible_cpu(cpu) { - if (vcpu_online(cpu) == 0) { + if (vcpu_online(cpu_watch.xh, cpu) == 0) { (void)cpu_down(cpu); set_cpu_present(cpu, false); } @@ -114,7 +114,7 @@ static int __init setup_vcpu_hotplug_event(void) #endif return -ENODEV; - register_xenstore_notifier(&xsn_cpu); + register_xenstore_notifier(xh_default, &xsn_cpu); return 0; } diff --git a/drivers/xen/gntalloc.c b/drivers/xen/gntalloc.c index e07823886fa8..a490e4e8c854 100644 --- a/drivers/xen/gntalloc.c +++ b/drivers/xen/gntalloc.c @@ -79,6 +79,8 @@ static LIST_HEAD(gref_list); static DEFINE_MUTEX(gref_mutex); static int gref_size; +static xenhost_t *xh; + struct notify_info { uint16_t pgoff:12;/* Bits 0-11: Offset of the byte to clear */ uint16_t flags:2; /* Bits 12-13: Unmap notification flags */ @@ -144,7 +146,7 @@ static int add_grefs(struct ioctl_gntalloc_alloc_gref *op, } /* Grant foreign access to the page. */ - rc = gnttab_grant_foreign_access(op->domid, + rc = gnttab_grant_foreign_access(xh, op->domid, xen_page_to_gfn(gref->page), readonly); if (rc < 0) @@ -196,13 +198,13 @@ static void __del_gref(struct gntalloc_gref *gref) gref->notify.flags = 0; if (gref->gref_id) { - if (gnttab_query_foreign_access(gref->gref_id)) + if (gnttab_query_foreign_access(xh, gref->gref_id)) return; - if (!gnttab_end_foreign_access_ref(gref->gref_id, 0)) + if (!gnttab_end_foreign_access_ref(xh, gref->gref_id, 0)) return; - gnttab_free_grant_reference(gref->gref_id); + gnttab_free_grant_reference(xh, gref->gref_id); } gref_size--; @@ -586,6 +588,9 @@ static int __init gntalloc_init(void) if (!xen_domain()) return -ENODEV; + /* Limit to default xenhost for now. 
*/ + xh = xh_default; + err = misc_register(&gntalloc_miscdev); if (err != 0) { pr_err("Could not register misc gntalloc device\n"); diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index 0f0c951cd5b1..40a42abe2dd0 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -67,6 +67,8 @@ static atomic_t pa
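To make the conversion pattern easier to see outside of the diff context: drivers that are not yet multi-xenhost aware pin themselves to xh_default at init time and then pass that handle into every grant-table call they make, while xenbus watches carry their xenhost in watch->xh. Below is a minimal compilable sketch of the gntalloc-style variant; the stub types and the helper around the gnttab call are illustrative, not the exact kernel declarations.

#include <stdint.h>

typedef struct xenhost xenhost_t;
extern xenhost_t *xh_default;

/* declaration shape after this patch: the first argument is the xenhost */
int gnttab_grant_foreign_access(xenhost_t *xh, uint16_t domid,
                                unsigned long gfn, int readonly);

static xenhost_t *xh;                   /* module-wide handle, as in gntalloc */

static int gntalloc_style_init(void)
{
        xh = xh_default;                /* limit to the default xenhost for now */
        return 0;
}

static int grant_one_page(uint16_t domid, unsigned long gfn, int readonly)
{
        /* every call site now names the xenhost explicitly */
        return gnttab_grant_foreign_access(xh, domid, gfn, readonly);
}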
[RFC PATCH 14/16] xen/blk: gnttab, evtchn, xenbus API changes
For the most part, we now pass xenhost_t * as a parameter. Co-developed-by: Joao Martins Signed-off-by: Ankur Arora --- drivers/block/xen-blkback/blkback.c | 34 + drivers/block/xen-blkback/common.h | 2 +- drivers/block/xen-blkback/xenbus.c | 63 - drivers/block/xen-blkfront.c| 103 +++- 4 files changed, 107 insertions(+), 95 deletions(-) diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 7ad4423c24b8..d366a17a4bd8 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -142,7 +142,7 @@ static inline bool persistent_gnt_timeout(struct persistent_gnt *persistent_gnt) HZ * xen_blkif_pgrant_timeout); } -static inline int get_free_page(struct xen_blkif_ring *ring, struct page **page) +static inline int get_free_page(xenhost_t *xh, struct xen_blkif_ring *ring, struct page **page) { unsigned long flags; @@ -150,7 +150,7 @@ static inline int get_free_page(struct xen_blkif_ring *ring, struct page **page) if (list_empty(&ring->free_pages)) { BUG_ON(ring->free_pages_num != 0); spin_unlock_irqrestore(&ring->free_pages_lock, flags); - return gnttab_alloc_pages(1, page); + return gnttab_alloc_pages(xh, 1, page); } BUG_ON(ring->free_pages_num == 0); page[0] = list_first_entry(&ring->free_pages, struct page, lru); @@ -174,7 +174,7 @@ static inline void put_free_pages(struct xen_blkif_ring *ring, struct page **pag spin_unlock_irqrestore(&ring->free_pages_lock, flags); } -static inline void shrink_free_pagepool(struct xen_blkif_ring *ring, int num) +static inline void shrink_free_pagepool(xenhost_t *xh, struct xen_blkif_ring *ring, int num) { /* Remove requested pages in batches of NUM_BATCH_FREE_PAGES */ struct page *page[NUM_BATCH_FREE_PAGES]; @@ -190,14 +190,14 @@ static inline void shrink_free_pagepool(struct xen_blkif_ring *ring, int num) ring->free_pages_num--; if (++num_pages == NUM_BATCH_FREE_PAGES) { spin_unlock_irqrestore(&ring->free_pages_lock, flags); - gnttab_free_pages(num_pages, page); + gnttab_free_pages(xh, num_pages, page); spin_lock_irqsave(&ring->free_pages_lock, flags); num_pages = 0; } } spin_unlock_irqrestore(&ring->free_pages_lock, flags); if (num_pages != 0) - gnttab_free_pages(num_pages, page); + gnttab_free_pages(xh, num_pages, page); } #define vaddr(page) ((unsigned long)pfn_to_kaddr(page_to_pfn(page))) @@ -301,8 +301,8 @@ static void put_persistent_gnt(struct xen_blkif_ring *ring, atomic_dec(&ring->persistent_gnt_in_use); } -static void free_persistent_gnts(struct xen_blkif_ring *ring, struct rb_root *root, - unsigned int num) +static void free_persistent_gnts(xenhost_t *xh, struct xen_blkif_ring *ring, + struct rb_root *root, unsigned int num) { struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST]; @@ -314,6 +314,7 @@ static void free_persistent_gnts(struct xen_blkif_ring *ring, struct rb_root *ro unmap_data.pages = pages; unmap_data.unmap_ops = unmap; unmap_data.kunmap_ops = NULL; + unmap_data.xh = xh; foreach_grant_safe(persistent_gnt, n, root, node) { BUG_ON(persistent_gnt->handle == @@ -351,10 +352,12 @@ void xen_blkbk_unmap_purged_grants(struct work_struct *work) int segs_to_unmap = 0; struct xen_blkif_ring *ring = container_of(work, typeof(*ring), persistent_purge_work); struct gntab_unmap_queue_data unmap_data; + struct xenbus_device *dev = xen_blkbk_xenbus(ring->blkif->be); unmap_data.pages = pages; unmap_data.unmap_ops = unmap; unmap_data.kunmap_ops = NULL; + unmap_data.xh = dev->xh; 
while(!list_empty(&ring->persistent_purge_list)) { persistent_gnt = list_first_entry(&ring->persistent_purge_list, @@ -615,6 +618,7 @@ int xen_blkif_schedule(void *arg) struct xen_vbd *vbd = &blkif->vbd; unsigned long timeout; int ret; + struct xenbus_device *dev = xen_blkbk_xenbus(blkif->be); set_freezable(); while (!kthread_should_stop()) { @@ -657,7 +661,7 @@ int xen_blkif_schedule(void *arg) } /* Shrink if we have more than xen_blkif_max_buffer_pages */ - shrink_free_pagepool(ring, xen_blkif_max_buffer_pages); + shrink_free_pagepool(dev->xh,
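The same idea, seen from a backend driver: blkback does not keep a module-wide handle but derives the xenhost from the xenbus device behind the ring (dev->xh), and batch descriptors such as gntab_unmap_queue_data now carry it too. A small sketch with stubbed types; xen_blkbk_xenbus() is the accessor used in the diff above.

typedef struct xenhost xenhost_t;

struct xenbus_device { xenhost_t *xh; /* ... */ };
struct gntab_unmap_queue_data { xenhost_t *xh; /* ... */ };

struct backend_info;
struct xenbus_device *xen_blkbk_xenbus(struct backend_info *be);

static void prepare_unmap(struct backend_info *be,
                          struct gntab_unmap_queue_data *unmap_data)
{
        struct xenbus_device *dev = xen_blkbk_xenbus(be);

        /* the batch remembers which xenhost its grants belong to */
        unmap_data->xh = dev->xh;
}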
[RFC PATCH 03/16] x86/xen: make hypercall_page generic
Make hypercall_page a generic interface which can be implemented by other hypervisors. With this change, hypercall_page now points to the newly introduced xen_hypercall_page which is seeded by Xen, or to one that is filled in by a different hypervisor. Signed-off-by: Ankur Arora --- arch/x86/include/asm/xen/hypercall.h | 12 +++- arch/x86/xen/enlighten.c | 1 + arch/x86/xen/enlighten_hvm.c | 3 ++- arch/x86/xen/enlighten_pv.c | 1 + arch/x86/xen/enlighten_pvh.c | 3 ++- arch/x86/xen/xen-asm_32.S| 2 +- arch/x86/xen/xen-asm_64.S| 2 +- arch/x86/xen/xen-head.S | 8 8 files changed, 19 insertions(+), 13 deletions(-) diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index ef05bea7010d..1a3cd6680e6f 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -86,11 +86,13 @@ struct xen_dm_op_buf; * there aren't more than 5 arguments...) */ -extern struct { char _entry[32]; } hypercall_page[]; +struct hypercall_entry { char _entry[32]; }; +extern struct hypercall_entry xen_hypercall_page[128]; +extern struct hypercall_entry *hypercall_page; -#define __HYPERCALL"call hypercall_page+%c[offset]" +#define __HYPERCALLCALL_NOSPEC #define __HYPERCALL_ENTRY(x) \ - [offset] "i" (__HYPERVISOR_##x * sizeof(hypercall_page[0])) + [thunk_target] "0" (hypercall_page + __HYPERVISOR_##x) #ifdef CONFIG_X86_32 #define __HYPERCALL_RETREG "eax" @@ -116,7 +118,7 @@ extern struct { char _entry[32]; } hypercall_page[]; register unsigned long __arg4 asm(__HYPERCALL_ARG4REG) = __arg4; \ register unsigned long __arg5 asm(__HYPERCALL_ARG5REG) = __arg5; -#define __HYPERCALL_0PARAM "=r" (__res), ASM_CALL_CONSTRAINT +#define __HYPERCALL_0PARAM "=&r" (__res), ASM_CALL_CONSTRAINT #define __HYPERCALL_1PARAM __HYPERCALL_0PARAM, "+r" (__arg1) #define __HYPERCALL_2PARAM __HYPERCALL_1PARAM, "+r" (__arg2) #define __HYPERCALL_3PARAM __HYPERCALL_2PARAM, "+r" (__arg3) @@ -208,7 +210,7 @@ xen_single_call(unsigned int call, asm volatile(CALL_NOSPEC : __HYPERCALL_5PARAM -: [thunk_target] "a" (&hypercall_page[call]) +: [thunk_target] "0" (hypercall_page + call) : __HYPERCALL_CLOBBER5); return (long)__res; diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 750f46ad018a..e9dc92e79afa 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -20,6 +20,7 @@ #include "smp.h" #include "pmu.h" +struct hypercall_entry *hypercall_page; EXPORT_SYMBOL_GPL(hypercall_page); /* diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index ffc5791675b2..4d85cd2ff261 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -115,8 +115,9 @@ static void __init init_hvm_pv_info(void) pv_info.name = "Xen HVM"; msr = cpuid_ebx(base + 2); - pfn = __pa(hypercall_page); + pfn = __pa(xen_hypercall_page); wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); + hypercall_page = xen_hypercall_page; } xen_setup_features(); diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index a4e04b0cc596..3239e8452ede 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -1217,6 +1217,7 @@ asmlinkage __visible void __init xen_start_kernel(void) if (!xen_start_info) return; + hypercall_page = xen_hypercall_page; xenhost_register(xenhost_r1, &xh_pv_ops); diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c index c07eba169572..e47866fcb7ea 100644 --- a/arch/x86/xen/enlighten_pvh.c +++ b/arch/x86/xen/enlighten_pvh.c @@ -46,8 +46,9 @@ void __init xen_pvh_init(void) xen_start_flags = 
pvh_start_info.flags; msr = cpuid_ebx(xen_cpuid_base() + 2); - pfn = __pa(hypercall_page); + pfn = __pa(xen_hypercall_page); wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); + hypercall_page = xen_hypercall_page; } void __init mem_map_via_hcall(struct boot_params *boot_params_p) diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S index c15db060a242..ee4998055ea9 100644 --- a/arch/x86/xen/xen-asm_32.S +++ b/arch/x86/xen/xen-asm_32.S @@ -121,7 +121,7 @@ xen_iret_end_crit: hyper_iret: /* put this out of line since its very rarely used */ - jmp hypercall_page + __HYPERVISOR_iret * 32 + jmp xen_hypercall_page + __HYPERVISOR_iret * 32 .globl xen_iret_start_crit, xen_iret_end_crit d
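Summarizing the indirection in plain C (the real code stays in inline asm behind CALL_NOSPEC): callers no longer reference a fixed hypercall_page symbol, they go through a pointer that each guest type aims at its own page during early bring-up. xen_hypercall_page is the Xen-seeded page added by this patch; the init helper name below is illustrative.

struct hypercall_entry { char _entry[32]; };

/* page filled in by Xen via the MSR write in init_hvm_pv_info()/xen_pvh_init() */
extern struct hypercall_entry xen_hypercall_page[128];

/* generic handle used by the __HYPERCALL machinery after this patch */
struct hypercall_entry *hypercall_page;

static void point_hypercall_page_at_xen(void)
{
        /* a different hypervisor could point this at its own page instead */
        hypercall_page = xen_hypercall_page;
}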
[RFC PATCH 16/16] xen/grant-table: host_addr fixup in mapping on xenhost_r0
Xenhost type xenhost_r0 does not support standard GNTTABOP_map_grant_ref semantics (map a gref onto a specified host_addr). That's because since the hypervisor is local (same address space as the caller of GNTTABOP_map_grant_ref), there is no external entity that could map an arbitrary page underneath an arbitrary address. To handle this, the GNTTABOP_map_grant_ref hypercall on xenhost_r0 treats the host_addr as an OUT parameter instead of IN and expects the gnttab_map_refs() and similar to fixup any state that caches the value of host_addr from before the hypercall. Accordingly gnttab_map_refs() now adds two parameters, a fixup function and a pointer to cached maps to fixup: int gnttab_map_refs(xenhost_t *xh, struct gnttab_map_grant_ref *map_ops, struct gnttab_map_grant_ref *kmap_ops, - struct page **pages, unsigned int count) + struct page **pages, gnttab_map_fixup_t map_fixup_fn, + void **map_fixup[], unsigned int count) The reason we use a fixup function and not an additional mapping op in the xenhost_t is because, depending on the caller, what we are fixing might be different: blkback, netback for instance cache host_addr in via a struct page *, while __xenbus_map_ring() caches a phys_addr. This patch fixes up xen-blkback and xen-gntdev drivers. TODO: - also rewrite gnttab_batch_map() and __xenbus_map_ring(). - modify xen-netback, scsiback, pciback etc Co-developed-by: Joao Martins Signed-off-by: Ankur Arora --- drivers/block/xen-blkback/blkback.c | 14 +- drivers/xen/gntdev.c| 2 +- drivers/xen/grant-table.c | 20 ++-- include/xen/grant_table.h | 11 ++- 4 files changed, 38 insertions(+), 9 deletions(-) diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index d366a17a4bd8..50ce40ba35e5 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -806,11 +806,18 @@ static void xen_blkbk_unmap(struct xen_blkif_ring *ring, } } +static void blkbk_map_fixup(uint64_t host_addr, void **fixup) +{ + struct page **pg = (struct page **)fixup; + *pg = virt_to_page(host_addr); +} + static int xen_blkbk_map(struct xen_blkif_ring *ring, struct grant_page *pages[], int num, bool ro) { struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + struct page **map_fixup[BLKIF_MAX_SEGMENTS_PER_REQUEST]; struct page *pages_to_gnt[BLKIF_MAX_SEGMENTS_PER_REQUEST]; struct persistent_gnt *persistent_gnt = NULL; phys_addr_t addr = 0; @@ -858,6 +865,9 @@ static int xen_blkbk_map(struct xen_blkif_ring *ring, gnttab_set_map_op(&map[segs_to_map++], addr, flags, pages[i]->gref, blkif->domid); + + if (gnttab_map_fixup(dev->xh)) + map_fixup[i] = &pages[i]->page; } map_until = i + 1; if (segs_to_map == BLKIF_MAX_SEGMENTS_PER_REQUEST) @@ -865,7 +875,9 @@ static int xen_blkbk_map(struct xen_blkif_ring *ring, } if (segs_to_map) { - ret = gnttab_map_refs(dev->xh, map, NULL, pages_to_gnt, segs_to_map); + ret = gnttab_map_refs(dev->xh, map, NULL, pages_to_gnt, + gnttab_map_fixup(dev->xh) ? blkbk_map_fixup : NULL, + (void ***) map_fixup, segs_to_map); BUG_ON(ret); } diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index 40a42abe2dd0..32c6471834ba 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -342,7 +342,7 @@ int gntdev_map_grant_pages(struct gntdev_grant_map *map) pr_debug("map %d+%d\n", map->index, map->count); err = gnttab_map_refs(xh, map->map_ops, use_ptemod ? 
map->kmap_ops : NULL, - map->pages, map->count); + map->pages, NULL, NULL, map->count); if (err) return err; diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 959b81ade113..2f3a0a4a2660 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -1084,7 +1084,8 @@ void gnttab_foreach_grant(struct page **pages, int gnttab_map_refs(xenhost_t *xh, struct gnttab_map_grant_ref *map_ops, struct gnttab_map_grant_ref *kmap_ops, - struct page **pages, unsigned int count) + struct page **pages, gnttab_map_fixup_t map_fixup_fn, + void **map_fixup[], unsigned int count) { int i, ret; @@ -1096,12 +1097,19 @@ int gnttab_map_refs(xenhost_t *xh, struct gnttab_map_grant_ref *map_ops,
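In miniature, the contract the new parameters describe: on xenhost_r0 the "map" picks the host_addr, so once the batch completes each caller-side cache of that address is refreshed through a caller-supplied callback. A minimal sketch with a stub map-op type; the callback typedef matches the shape of blkbk_map_fixup() above.

#include <stdint.h>

typedef void (*gnttab_map_fixup_t)(uint64_t host_addr, void **fixup);

struct map_op { uint64_t host_addr; /* OUT rather than IN on xenhost_r0 */ };

static void apply_map_fixups(struct map_op *ops, gnttab_map_fixup_t fixup_fn,
                             void **fixup[], unsigned int count)
{
        unsigned int i;

        if (!fixup_fn)          /* regular xenhosts pass NULL and skip this */
                return;

        for (i = 0; i < count; i++)
                /* e.g. blkback: *(struct page **)fixup[i] = virt_to_page(addr) */
                fixup_fn(ops[i].host_addr, fixup[i]);
}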
[RFC PATCH 02/16] x86/xen: cpuid support in xenhost_t
xen_cpuid_base() is used to probe and setup features early in a guest's lifetime. We want this to behave differently depending on xenhost->type: for instance, local xenhosts cannot intercept the cpuid instruction at all. Add op (*cpuid_base)() in xenhost_ops_t. Signed-off-by: Ankur Arora --- arch/x86/include/asm/xen/hypervisor.h | 2 +- arch/x86/pci/xen.c| 2 +- arch/x86/xen/enlighten_hvm.c | 7 +-- arch/x86/xen/enlighten_pv.c | 16 +++- arch/x86/xen/enlighten_pvh.c | 4 drivers/tty/hvc/hvc_xen.c | 2 +- drivers/xen/grant-table.c | 3 ++- drivers/xen/xenbus/xenbus_xs.c| 3 ++- include/xen/xenhost.h | 21 + 9 files changed, 52 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h index 39171b3646bb..6c4cdcdf997d 100644 --- a/arch/x86/include/asm/xen/hypervisor.h +++ b/arch/x86/include/asm/xen/hypervisor.h @@ -53,7 +53,7 @@ static inline bool xen_x2apic_para_available(void) #else static inline bool xen_x2apic_para_available(void) { - return (xen_cpuid_base() != 0); + return (xen_cpuid_base(NULL) != 0); } #endif diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 9112d1cb397b..d1a3b9f08289 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -431,7 +431,7 @@ void __init xen_msi_init(void) * event channels for MSI handling and instead use regular * APIC processing */ - uint32_t eax = cpuid_eax(xen_cpuid_base() + 4); + uint32_t eax = cpuid_eax(xenhost_cpuid_base(xh_default) + 4); if (((eax & XEN_HVM_CPUID_X2APIC_VIRT) && x2apic_mode) || ((eax & XEN_HVM_CPUID_APIC_ACCESS_VIRT) && boot_cpu_has(X86_FEATURE_APIC))) diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index 100452f4f44c..ffc5791675b2 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -83,7 +83,10 @@ static void __init xen_hvm_init_mem_mapping(void) xen_vcpu_info_reset(0); } +extern uint32_t xen_pv_cpuid_base(xenhost_t *xh); + xenhost_ops_t xh_hvm_ops = { + .cpuid_base = xen_pv_cpuid_base, }; xenhost_ops_t xh_hvm_nested_ops = { @@ -94,7 +97,7 @@ static void __init init_hvm_pv_info(void) int major, minor; uint32_t eax, ebx, ecx, edx, base; - base = xen_cpuid_base(); + base = xenhost_cpuid_base(xh_default); eax = cpuid_eax(base + 1); major = eax >> 16; @@ -250,7 +253,7 @@ static uint32_t __init xen_platform_hvm(void) if (xen_pv_domain() || xen_nopv) return 0; - return xen_cpuid_base(); + return xenhost_cpuid_base(xh_default); } static __init void xen_hvm_guest_late_init(void) diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index bb6e811c1525..a4e04b0cc596 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -1189,10 +1189,23 @@ static void __init xen_dom0_set_legacy_features(void) x86_platform.legacy.rtc = 1; } +uint32_t xen_pv_cpuid_base(xenhost_t *xh) +{ + return hypervisor_cpuid_base("XenVMMXenVMM", 2); +} + +uint32_t xen_pv_nested_cpuid_base(xenhost_t *xh) +{ + return hypervisor_cpuid_base("XenVMMXenVMM", + 2 /* nested specific leaf? */); +} + xenhost_ops_t xh_pv_ops = { + .cpuid_base = xen_pv_cpuid_base, }; xenhost_ops_t xh_pv_nested_ops = { + .cpuid_base = xen_pv_nested_cpuid_base, }; /* First C function to be called on Xen boot */ @@ -1469,7 +1482,8 @@ static int xen_cpu_dead_pv(unsigned int cpu) static uint32_t __init xen_platform_pv(void) { if (xen_pv_domain()) - return xen_cpuid_base(); + /* xenhost is setup in xen_start_kernel. 
*/ + return xenhost_cpuid_base(xh_default); return 0; } diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c index 826c296d27a3..c07eba169572 100644 --- a/arch/x86/xen/enlighten_pvh.c +++ b/arch/x86/xen/enlighten_pvh.c @@ -29,6 +29,10 @@ void __init xen_pvh_init(void) u32 msr; u64 pfn; + /* +* Note: we have already called xen_cpuid_base() in +* hypervisor_specific_init() +*/ xenhost_register(xenhost_r1, &xh_hvm_ops); /* diff --git a/drivers/tty/hvc/hvc_xen.c b/drivers/tty/hvc/hvc_xen.c index dc43fa96c3de..5e5ca35d7187 100644 --- a/drivers/tty/hvc/hvc_xen.c +++ b/drivers/tty/hvc/hvc_xen.c @@ -595,7 +595,7 @@ console_initcall(xen_cons_init); #ifdef CONFIG_X86 static void xen_hvm_early_write(uint32_t vtermno, const char *str, int len) { - if (xen_cpuid_base()) + if (xen_cpuid_base(xh_default)) outsb(0xe9, str, len); } #else diff --git
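The dispatch itself is a one-liner; here is a minimal sketch of the op and its wrapper, with xenhost_t reduced to the single field needed. The op name and the xenhost_cpuid_base() wrapper match the diff; everything else is illustrative.

#include <stdint.h>

typedef struct xenhost xenhost_t;

typedef struct xenhost_ops {
        uint32_t (*cpuid_base)(xenhost_t *xh);
} xenhost_ops_t;

struct xenhost {
        const xenhost_ops_t *ops;
};

static inline uint32_t xenhost_cpuid_base(xenhost_t *xh)
{
        /*
         * A local (xenhost_r0) implementation cannot trap cpuid, so its
         * op would return a known base instead of probing cpuid leaves.
         */
        return xh->ops->cpuid_base(xh);
}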
[RFC PATCH 00/16] xenhost support
Hi all,

This is an RFC for xenhost support, outlined by Juergen here: https://lkml.org/lkml/2019/4/8/67.

The high level idea is to provide an abstraction of the Xen communication interface, as a xenhost_t. xenhost_t exposes ops for communication between the guest and Xen (hypercall, cpuid, shared_info/vcpu_info, evtchn, grant-table and, on top of those, xenbus, ballooning), and these can differ based on the kind of underlying Xen: regular, local, and nested. (Since this abstraction is largely about guest -- xenhost communication, no ops are needed for timer, clock, sched, memory (MMU, P2M), VCPU mgmt. etc.)

Xenhost use-cases:

Regular-Xen: the standard Xen interface presented to a guest, specifically for communication between Lx-guest and Lx-Xen.

Local-Xen: a Xen-like interface which runs in the same address space as the guest (dom0). This can act as the default xenhost. The major ways it differs from a regular Xen interface are in presenting a different hypercall interface (call instead of a syscall/vmcall), and in an inability to do grant-mappings: since local-Xen exists in the same address space as the guest, there's no way for it to cheaply change the physical page that a GFN maps to (assuming no P2M tables.)

Nested-Xen: this channel is to Xen, one level removed: from L1-guest to L0-Xen. The use case is that we want L0-dom0-backends to talk to L1-dom0-frontend drivers which can then present PV devices which can in turn be used by the L1-dom0-backend drivers as raw underlying devices. The interfaces themselves broadly remain similar.

Note: L0-Xen, L1-Xen represent Xen running at that nesting level and L0-guest, L1-guest represent guests that are children of Xen at that nesting level. Lx represents any level.

Patches 1-7, "x86/xen: add xenhost_t interface", "x86/xen: cpuid support in xenhost_t", "x86/xen: make hypercall_page generic", "x86/xen: hypercall support for xenhost_t", "x86/xen: add feature support in xenhost_t", "x86/xen: add shared_info support to xenhost_t", "x86/xen: make vcpu_info part of xenhost_t" abstract out interfaces that set up hypercalls/cpuid/shared_info/vcpu_info etc.

Patch 8, "x86/xen: irq/upcall handling with multiple xenhosts" sets up the upcall and pv_irq ops based on vcpu_info.

Patch 9, "xen/evtchn: support evtchn in xenhost_t" adds xenhost based evtchn support for evtchn_2l.

Patches 10 and 16, "xen/balloon: support ballooning in xenhost_t" and "xen/grant-table: host_addr fixup in mapping on xenhost_r0" implement support for GNTTABOP_map_grant_ref for xenhosts of type xenhost_r0 (xenhost local.)

Patch 12, "xen/xenbus: support xenbus frontend/backend with xenhost_t" reworks xenbus so that both its frontend and backend can be bootstrapped separately via separate xenhosts.

The remaining patches, 11, 13, 14, 15: "xen/grant-table: make grant-table xenhost aware", "drivers/xen: gnttab, evtchn, xenbus API changes", "xen/blk: gnttab, evtchn, xenbus API changes", "xen/net: gnttab, evtchn, xenbus API changes" are mostly mechanical changes for APIs that now take xenhost_t * as a parameter.

The code itself is RFC quality, and is mostly meant to get feedback before proceeding further. Also note that the FIFO logic and some Xen drivers (input, pciback, scsi etc.) are mostly unchanged, so will not build.

Please take a look.
Thanks
Ankur

Ankur Arora (16):
  x86/xen: add xenhost_t interface
  x86/xen: cpuid support in xenhost_t
  x86/xen: make hypercall_page generic
  x86/xen: hypercall support for xenhost_t
  x86/xen: add feature support in xenhost_t
  x86/xen: add shared_info support to xenhost_t
  x86/xen: make vcpu_info part of xenhost_t
  x86/xen: irq/upcall handling with multiple xenhosts
  xen/evtchn: support evtchn in xenhost_t
  xen/balloon: support ballooning in xenhost_t
  xen/grant-table: make grant-table xenhost aware
  xen/xenbus: support xenbus frontend/backend with xenhost_t
  drivers/xen: gnttab, evtchn, xenbus API changes
  xen/blk: gnttab, evtchn, xenbus API changes
  xen/net: gnttab, evtchn, xenbus API changes
  xen/grant-table: host_addr fixup in mapping on xenhost_r0

 arch/x86/include/asm/xen/hypercall.h  | 239 +---
 arch/x86/include/asm/xen/hypervisor.h |   3 +-
 arch/x86/pci/xen.c                    |  18 +-
 arch/x86/xen/Makefile                 |   3 +-
 arch/x86/xen/enlighten.c              | 101 ++--
 arch/x86/xen/enlighten_hvm.c          | 185 --
 arch/x86/xen/enlighten_pv.c           | 144 -
 arch/x86/xen/enlighten_pvh.c          |  25 +-
 arch/x86/xen/grant-table.c            |  71 ++-
 arch/x86/xen/irq.c                    |  75 ++-
 arch/x86/xen/mmu_pv.c                 |   6 +-
 arch/x86/xen/p2m.c                    |  24 +-
 arch/x86/xen/pci-swiotlb-xen.c        |   1 +
 arch/x86/xen/s
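To make the description above a little more concrete, here is a much-reduced sketch of the shape of xenhost_t and the default/remote pair. The real structure in the series also carries the per-interface state (hypercall page, shared_info, vcpu_info, evtchn, grant table, xenbus); the cut-down layout below is only meant to show the type and how the two instances are named.

typedef struct xenhost_ops xenhost_ops_t;

enum xenhost_type {
        xenhost_invalid = 0,
        xenhost_r0,     /* local Xen: same address space as the guest */
        xenhost_r1,     /* regular Xen: Lx-guest to Lx-Xen */
        xenhost_r2,     /* nested: L1-guest to L0-Xen, one level removed */
};

typedef struct xenhost {
        enum xenhost_type type;
        const xenhost_ops_t *ops;
        /* hypercall page, shared_info, vcpu_info, evtchn, gnttab state ... */
} xenhost_t;

extern xenhost_t xenhosts[2];
extern xenhost_t *xh_default;   /* xenhost_r0 or xenhost_r1 */
extern xenhost_t *xh_remote;    /* xenhost_r2, only on nested driver domains */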
[RFC PATCH 07/16] x86/xen: make vcpu_info part of xenhost_t
Abstract out xen_vcpu_id probing via (*probe_vcpu_id)(). Once that is availab,e the vcpu_info registration happens via the VCPUOP hypercall. Note that for the nested case, there are two vcpu_ids, and two vcpu_info areas, one each for the default xenhost and the remote xenhost. The vcpu_info is used via pv_irq_ops, and evtchn signaling. The other VCPUOP hypercalls are used for management (and scheduling) which is expected to be done purely in the default hypervisor. However, scheduling of L1-guest does imply L0-Xen-vcpu_info switching, which might mean that the remote hypervisor needs some visibility into related events/hypercalls in the default hypervisor. TODO: - percpu data structures for xen_vcpu Signed-off-by: Ankur Arora --- arch/x86/xen/enlighten.c | 93 +--- arch/x86/xen/enlighten_hvm.c | 87 ++ arch/x86/xen/enlighten_pv.c | 60 ++--- arch/x86/xen/enlighten_pvh.c | 3 +- arch/x86/xen/irq.c | 10 ++-- arch/x86/xen/mmu_pv.c| 6 +-- arch/x86/xen/pci-swiotlb-xen.c | 1 + arch/x86/xen/setup.c | 1 + arch/x86/xen/smp.c | 9 +++- arch/x86/xen/smp_hvm.c | 17 +++--- arch/x86/xen/smp_pv.c| 12 ++--- arch/x86/xen/time.c | 23 arch/x86/xen/xen-ops.h | 5 +- drivers/xen/events/events_base.c | 14 ++--- drivers/xen/events/events_fifo.c | 2 +- drivers/xen/evtchn.c | 2 +- drivers/xen/time.c | 2 +- include/xen/xen-ops.h| 7 +-- include/xen/xenhost.h| 47 19 files changed, 240 insertions(+), 161 deletions(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 20e0de82..0dafbbc838ef 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -20,35 +20,6 @@ #include "smp.h" #include "pmu.h" -/* - * Pointer to the xen_vcpu_info structure or - * &HYPERVISOR_shared_info->vcpu_info[cpu]. See xen_hvm_init_shared_info - * and xen_vcpu_setup for details. By default it points to share_info->vcpu_info - * but if the hypervisor supports VCPUOP_register_vcpu_info then it can point - * to xen_vcpu_info. The pointer is used in __xen_evtchn_do_upcall to - * acknowledge pending events. - * Also more subtly it is used by the patched version of irq enable/disable - * e.g. xen_irq_enable_direct and xen_iret in PV mode. - * - * The desire to be able to do those mask/unmask operations as a single - * instruction by using the per-cpu offset held in %gs is the real reason - * vcpu info is in a per-cpu pointer and the original reason for this - * hypercall. - * - */ -DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu); - -/* - * Per CPU pages used if hypervisor supports VCPUOP_register_vcpu_info - * hypercall. This can be used both in PV and PVHVM mode. The structure - * overrides the default per_cpu(xen_vcpu, cpu) value. - */ -DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); - -/* Linux <-> Xen vCPU id mapping */ -DEFINE_PER_CPU(uint32_t, xen_vcpu_id); -EXPORT_PER_CPU_SYMBOL(xen_vcpu_id); - enum xen_domain_type xen_domain_type = XEN_NATIVE; EXPORT_SYMBOL_GPL(xen_domain_type); @@ -112,12 +83,12 @@ int xen_cpuhp_setup(int (*cpu_up_prepare_cb)(unsigned int), return rc >= 0 ? 0 : rc; } -static int xen_vcpu_setup_restore(int cpu) +static int xen_vcpu_setup_restore(xenhost_t *xh, int cpu) { int rc = 0; /* Any per_cpu(xen_vcpu) is stale, so reset it */ - xen_vcpu_info_reset(cpu); + xen_vcpu_info_reset(xh, cpu); /* * For PVH and PVHVM, setup online VCPUs only. 
The rest will @@ -125,7 +96,7 @@ static int xen_vcpu_setup_restore(int cpu) */ if (xen_pv_domain() || (xen_hvm_domain() && cpu_online(cpu))) { - rc = xen_vcpu_setup(cpu); + rc = xen_vcpu_setup(xh, cpu); } return rc; @@ -138,30 +109,42 @@ static int xen_vcpu_setup_restore(int cpu) */ void xen_vcpu_restore(void) { - int cpu, rc; + int cpu, rc = 0; + /* +* VCPU management is primarily the responsibility of xh_default and +* xh_remote only needs VCPUOP_register_vcpu_info. +* So, we do VPUOP_down and VCPUOP_up only on xh_default. +* +* (Currently, however, VCPUOP_register_vcpu_info is allowed only +* on VCPUs that are self or down, so we might need a new model +* there.) +*/ for_each_possible_cpu(cpu) { bool other_cpu = (cpu != smp_processor_id()); bool is_up; + xenhost_t **xh; - if (xen_vcpu_nr(cpu) == XEN_VCPU_ID_INVALID) + if (xen_vcpu_nr(xh_default, cpu) == XEN_VCPU_ID_INVALID) continue; /* Only Xen 4.5 and higher support this. */
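Reduced to its essentials, the bookkeeping this patch moves into xenhost_t looks roughly like the sketch below: each xenhost keeps its own vcpu_info pointers and its own Linux-cpu to vcpu-id mapping, and the pv_irq/evtchn paths look them up through the xenhost at hand. The xen_vcpu[] field and xen_vcpu_nr() match the diff; the array bound, the trimmed vcpu_info, and the xen_vcpu_ptr() helper are purely illustrative.

#include <stdint.h>

#define NR_CPUS 64                      /* stub bound for the sketch */

struct vcpu_info {                      /* trimmed to the fields used by irq code */
        uint8_t evtchn_upcall_pending;
        uint8_t evtchn_upcall_mask;
};

typedef struct xenhost {
        struct vcpu_info *xen_vcpu[NR_CPUS];    /* registered area, per cpu */
        uint32_t xen_vcpu_id[NR_CPUS];          /* Linux cpu -> this host's vcpu id */
} xenhost_t;

static inline uint32_t xen_vcpu_nr(xenhost_t *xh, int cpu)
{
        return xh->xen_vcpu_id[cpu];
}

static inline struct vcpu_info *xen_vcpu_ptr(xenhost_t *xh, int cpu)
{
        return xh->xen_vcpu[cpu];       /* used by pv_irq ops and evtchn code */
}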
[RFC PATCH 08/16] x86/xen: irq/upcall handling with multiple xenhosts
For configurations with multiple xenhosts, we need to handle events generated from multiple xenhosts. Having more than one upcall handler might be quite hairy, and it would be simpler if the callback from L0-Xen could be bounced via L1-Xen. This will also mean simpler pv_irq_ops code because now the IF flag maps onto the xh_default->vcpu_info->evtchn_upcall_mask. However, we still update the xh_remote->vcpu_info->evtchn_upcall_mask on a best effort basis to minimize unnecessary work in remote xenhost. TODO: - direct pv_ops.irq are disabled. Signed-off-by: Ankur Arora --- arch/x86/xen/Makefile | 2 +- arch/x86/xen/enlighten_pv.c | 4 ++- arch/x86/xen/irq.c | 69 + arch/x86/xen/smp_pv.c | 11 ++ 4 files changed, 70 insertions(+), 16 deletions(-) diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 564b4dddbc15..3c7056ad3520 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -34,7 +34,7 @@ obj-$(CONFIG_XEN_PV) += enlighten_pv.o obj-$(CONFIG_XEN_PV) += mmu_pv.o obj-$(CONFIG_XEN_PV) += irq.o obj-$(CONFIG_XEN_PV) += multicalls.o -obj-$(CONFIG_XEN_PV) += xen-asm.o +obj-n += xen-asm.o obj-$(CONFIG_XEN_PV) += xen-asm_$(BITS).o obj-$(CONFIG_XEN_PVH) += enlighten_pvh.o diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 5f6a1475ec0c..77b1a0d4aef2 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -996,8 +996,9 @@ void __init xen_setup_vcpu_info_placement(void) * xen_vcpu_setup managed to place the vcpu_info within the * percpu area for all cpus, so make use of it. */ +#if 0 + /* Disable direct access for now. */ if (xen_have_vcpu_info_placement && false) { - /* Disable direct access until we have proper pcpu data structures. */ pv_ops.irq.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct); pv_ops.irq.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct); @@ -1007,6 +1008,7 @@ void __init xen_setup_vcpu_info_placement(void) __PV_IS_CALLEE_SAVE(xen_irq_enable_direct); pv_ops.mmu.read_cr2 = xen_read_cr2_direct; } +#endif } static const struct pv_info xen_info __initconst = { diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c index 38ad1a1c4763..f760a6abfb1e 100644 --- a/arch/x86/xen/irq.c +++ b/arch/x86/xen/irq.c @@ -19,9 +19,9 @@ * callback mask. We do this in a very simple manner, by making a call * down into Xen. The pending flag will be checked by Xen on return. */ -void xen_force_evtchn_callback(void) +void xen_force_evtchn_callback(xenhost_t *xh) { - (void)HYPERVISOR_xen_version(0, NULL); + (void)hypervisor_xen_version(xh, 0, NULL); } asmlinkage __visible unsigned long xen_save_fl(void) @@ -29,6 +29,21 @@ asmlinkage __visible unsigned long xen_save_fl(void) struct vcpu_info *vcpu; unsigned long flags; + /* +* In scenarios with more than one xenhost, the primary xenhost +* is responsible for all the upcalls, with the remote xenhost +* bouncing its upcalls through it (see comment in +* cpu_initialize_context().) +* +* To minimize unnecessary upcalls, the remote xenhost still looks at +* the value of vcpu_info->evtchn_upcall_mask, so we still set and reset +* that. +* +* The fact that the upcall itself is gated by the default xenhost, +* also helps in simplifying the logic here because we don't have to +* worry about guaranteeing atomicity with updates to +* xh_remote->vcpu_info->evtchn_upcall_mask. 
+*/ vcpu = xh_default->xen_vcpu[smp_processor_id()]; /* flag has opposite sense of mask */ @@ -38,26 +53,34 @@ asmlinkage __visible unsigned long xen_save_fl(void) -0 -> 0x -1 -> 0x */ - return (-flags) & X86_EFLAGS_IF; + return ((-flags) & X86_EFLAGS_IF); } PV_CALLEE_SAVE_REGS_THUNK(xen_save_fl); __visible void xen_restore_fl(unsigned long flags) { struct vcpu_info *vcpu; + xenhost_t **xh; /* convert from IF type flag */ flags = !(flags & X86_EFLAGS_IF); /* See xen_irq_enable() for why preemption must be disabled. */ preempt_disable(); - vcpu = xh_default->xen_vcpu[smp_processor_id()]; - vcpu->evtchn_upcall_mask = flags; + for_each_xenhost(xh) { + vcpu = (*xh)->xen_vcpu[smp_processor_id()]; + vcpu->evtchn_upcall_mask = flags; + } if (flags == 0) { barrier(); /* unmask then check (avoid races) */ - if (unlikely(vcpu->evtchn_upcall_pending)) -
[RFC PATCH 09/16] xen/evtchn: support evtchn in xenhost_t
Largely mechanical patch that adds a new param, xenhost_t * to the evtchn interfaces. The evtchn port instead of being domain unique, is now scoped to xenhost_t. As part of upcall handling we now look at all the xenhosts and, for evtchn_2l, the xenhost's shared_info and vcpu_info. Other than this event handling is largley unchanged. Note that the IPI, timer, VIRQ, FUNCTION, PMU etc vectors remain attached to xh_default. Only interdomain evtchns are allowable as xh_remote. TODO: - to minimize the changes, evtchn FIFO is disabled for now. Signed-off-by: Ankur Arora --- arch/x86/pci/xen.c | 16 +- arch/x86/xen/enlighten_hvm.c | 2 +- arch/x86/xen/irq.c | 2 +- arch/x86/xen/smp.c | 16 +- arch/x86/xen/smp_pv.c | 4 +- arch/x86/xen/time.c| 5 +- arch/x86/xen/xen-ops.h | 1 + arch/x86/xen/xenhost.c | 16 + drivers/block/xen-blkback/xenbus.c | 2 +- drivers/block/xen-blkfront.c | 2 +- drivers/input/misc/xen-kbdfront.c | 2 +- drivers/net/xen-netback/interface.c| 8 +- drivers/net/xen-netfront.c | 6 +- drivers/pci/xen-pcifront.c | 2 +- drivers/xen/acpi.c | 2 + drivers/xen/balloon.c | 2 +- drivers/xen/events/Makefile| 1 - drivers/xen/events/events_2l.c | 188 +- drivers/xen/events/events_base.c | 379 - drivers/xen/events/events_fifo.c | 2 +- drivers/xen/events/events_internal.h | 78 ++--- drivers/xen/evtchn.c | 22 +- drivers/xen/fallback.c | 1 + drivers/xen/gntalloc.c | 8 +- drivers/xen/gntdev.c | 8 +- drivers/xen/mcelog.c | 2 +- drivers/xen/pcpu.c | 2 +- drivers/xen/preempt.c | 1 + drivers/xen/privcmd.c | 1 + drivers/xen/sys-hypervisor.c | 2 +- drivers/xen/time.c | 2 +- drivers/xen/xen-pciback/xenbus.c | 2 +- drivers/xen/xen-scsiback.c | 5 +- drivers/xen/xenbus/xenbus_client.c | 2 +- drivers/xen/xenbus/xenbus_comms.c | 6 +- drivers/xen/xenbus/xenbus_probe.c | 1 + drivers/xen/xenbus/xenbus_probe_backend.c | 1 + drivers/xen/xenbus/xenbus_probe_frontend.c | 1 + drivers/xen/xenbus/xenbus_xs.c | 1 + include/xen/events.h | 45 +-- include/xen/xenhost.h | 17 + 41 files changed, 483 insertions(+), 383 deletions(-) diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index d1a3b9f08289..9aa591b5fa3b 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -19,6 +19,8 @@ #include #include +#include +#include #include #include @@ -46,7 +48,7 @@ static int xen_pcifront_enable_irq(struct pci_dev *dev) if (gsi < nr_legacy_irqs()) share = 0; - rc = xen_bind_pirq_gsi_to_irq(gsi, pirq, share, "pcifront"); + rc = xen_bind_pirq_gsi_to_irq(xh_default, gsi, pirq, share, "pcifront"); if (rc < 0) { dev_warn(&dev->dev, "Xen PCI: failed to bind GSI%d (PIRQ%d) to IRQ: %d\n", gsi, pirq, rc); @@ -96,7 +98,7 @@ static int xen_register_pirq(u32 gsi, int gsi_override, int triggering, if (gsi_override >= 0) gsi = gsi_override; - irq = xen_bind_pirq_gsi_to_irq(gsi, map_irq.pirq, shareable, name); + irq = xen_bind_pirq_gsi_to_irq(xh_default, gsi, map_irq.pirq, shareable, name); if (irq < 0) goto out; @@ -180,7 +182,7 @@ static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) goto error; i = 0; for_each_pci_msi_entry(msidesc, dev) { - irq = xen_bind_pirq_msi_to_irq(dev, msidesc, v[i], + irq = xen_bind_pirq_msi_to_irq(xh_default, dev, msidesc, v[i], (type == PCI_CAP_ID_MSI) ? nvec : 1, (type == PCI_CAP_ID_MSIX) ? 
"pcifront-msi-x" : @@ -234,7 +236,7 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) return 1; for_each_pci_msi_entry(msidesc, dev) { - pirq = xen_allocate_pirq_msi(dev, msidesc); + pirq = xen_allocate_pirq_msi(xh_default, dev, msidesc); if (pirq < 0) { irq = -ENODEV; goto error; @@ -242,7 +244,7 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int
[RFC PATCH 12/16] xen/xenbus: support xenbus frontend/backend with xenhost_t
As part of xenbus init, both frontend, backend interfaces need to talk on the correct xenbus. This might be a local xenstore (backend) or might be a XS_PV/XS_HVM interface (frontend) which needs to talk over xenbus with the remote xenstored. We bootstrap all of these with evtchn/gfn parameters from (*setup_xs)(). Given this we can do appropriate device discovery (in case of frontend) and device connectivity for the backend. Once done, we stash the xenhost_t * in xen_bus_type, xenbus_device or xenbus_watch and then the frontend and backend devices implicitly use the correct interface. The rest of patch is just changing the interfaces where needed. Signed-off-by: Ankur Arora --- drivers/block/xen-blkback/blkback.c| 10 +- drivers/net/xen-netfront.c | 14 +- drivers/pci/xen-pcifront.c | 4 +- drivers/xen/cpu_hotplug.c | 4 +- drivers/xen/manage.c | 28 +-- drivers/xen/xen-balloon.c | 8 +- drivers/xen/xenbus/xenbus.h| 45 ++-- drivers/xen/xenbus/xenbus_client.c | 32 +-- drivers/xen/xenbus/xenbus_comms.c | 121 +- drivers/xen/xenbus/xenbus_dev_backend.c| 30 ++- drivers/xen/xenbus/xenbus_dev_frontend.c | 22 +- drivers/xen/xenbus/xenbus_probe.c | 246 + drivers/xen/xenbus/xenbus_probe_backend.c | 19 +- drivers/xen/xenbus/xenbus_probe_frontend.c | 65 +++--- drivers/xen/xenbus/xenbus_xs.c | 188 +--- include/xen/xen-ops.h | 3 + include/xen/xenbus.h | 54 +++-- include/xen/xenhost.h | 20 ++ 18 files changed, 536 insertions(+), 377 deletions(-) diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index fd1e19f1a49f..7ad4423c24b8 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -541,12 +541,12 @@ static void xen_vbd_resize(struct xen_blkif *blkif) pr_info("VBD Resize: new size %llu\n", new_size); vbd->size = new_size; again: - err = xenbus_transaction_start(&xbt); + err = xenbus_transaction_start(dev->xh, &xbt); if (err) { pr_warn("Error starting transaction\n"); return; } - err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu", + err = xenbus_printf(dev->xh, xbt, dev->nodename, "sectors", "%llu", (unsigned long long)vbd_sz(vbd)); if (err) { pr_warn("Error writing new size\n"); @@ -557,20 +557,20 @@ static void xen_vbd_resize(struct xen_blkif *blkif) * the front-end. If the current state is "connected" the * front-end will get the new size information online. 
*/ - err = xenbus_printf(xbt, dev->nodename, "state", "%d", dev->state); + err = xenbus_printf(dev->xh, xbt, dev->nodename, "state", "%d", dev->state); if (err) { pr_warn("Error writing the state\n"); goto abort; } - err = xenbus_transaction_end(xbt, 0); + err = xenbus_transaction_end(dev->xh, xbt, 0); if (err == -EAGAIN) goto again; if (err) pr_warn("Error ending transaction\n"); return; abort: - xenbus_transaction_end(xbt, 1); + xenbus_transaction_end(dev->xh, xbt, 1); } /* diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index 1cd0a2d2ba54..ee28e8b85406 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -1336,9 +1336,9 @@ static struct net_device *xennet_create_dev(struct xenbus_device *dev) xenbus_switch_state(dev, XenbusStateInitialising); wait_event(module_wq, - xenbus_read_driver_state(dev->otherend) != + xenbus_read_driver_state(dev, dev->otherend) != XenbusStateClosed && - xenbus_read_driver_state(dev->otherend) != + xenbus_read_driver_state(dev, dev->otherend) != XenbusStateUnknown); return netdev; @@ -2145,19 +2145,19 @@ static int xennet_remove(struct xenbus_device *dev) dev_dbg(&dev->dev, "%s\n", dev->nodename); - if (xenbus_read_driver_state(dev->otherend) != XenbusStateClosed) { + if (xenbus_read_driver_state(dev, dev->otherend) != XenbusStateClosed) { xenbus_switch_state(dev, XenbusStateClosing); wait_event(module_wq, - xenbus_read_driver_state(dev->otherend) == + xenbus_read_driver_state(dev, dev->otherend) == XenbusStateClosi
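The pattern underneath most of these hunks: the xenhost is stashed once, at discovery time, in the xenbus_device (and likewise in xen_bus_type and xenbus_watch), so callers never pick a xenstore explicitly. A minimal sketch; the xenbus_printf() argument order matches the converted calls above, while the transaction type and the helper are stubs.

typedef struct xenhost xenhost_t;
typedef int xenbus_transaction_t;       /* stub; really a small struct */

struct xenbus_device {
        xenhost_t *xh;                  /* which xenstore this device lives on */
        const char *nodename;
        /* ... */
};

int xenbus_printf(xenhost_t *xh, xenbus_transaction_t xbt,
                  const char *dir, const char *node, const char *fmt, ...);

static int write_state(struct xenbus_device *dev, xenbus_transaction_t xbt,
                       int state)
{
        /* dev remembers its xenhost, so call sites stay almost unchanged */
        return xenbus_printf(dev->xh, xbt, dev->nodename, "state", "%d", state);
}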
[RFC PATCH 06/16] x86/xen: add shared_info support to xenhost_t
HYPERVISOR_shared_info is used for irq/evtchn communication between the guest and the host. Abstract out the setup/reset in xenhost_t such that nested configurations can use both xenhosts simultaneously. In addition to irq/evtchn communication, shared_info is also used for pvclock and p2m related state. For both of those, remote xenhost is not of interest so we only use the default xenhost. Signed-off-by: Ankur Arora --- arch/x86/include/asm/xen/hypervisor.h | 1 - arch/x86/xen/enlighten.c | 10 ++- arch/x86/xen/enlighten_hvm.c | 38 +- arch/x86/xen/enlighten_pv.c | 28 --- arch/x86/xen/p2m.c| 24 - arch/x86/xen/suspend_hvm.c| 6 - arch/x86/xen/suspend_pv.c | 14 +- arch/x86/xen/time.c | 4 +-- arch/x86/xen/xen-ops.h| 2 -- arch/x86/xen/xenhost.c| 13 - drivers/xen/events/events_2l.c| 16 +-- include/xen/xenhost.h | 39 +++ 12 files changed, 138 insertions(+), 57 deletions(-) diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h index 6c4cdcdf997d..3e6bd455fbd0 100644 --- a/arch/x86/include/asm/xen/hypervisor.h +++ b/arch/x86/include/asm/xen/hypervisor.h @@ -33,7 +33,6 @@ #ifndef _ASM_X86_XEN_HYPERVISOR_H #define _ASM_X86_XEN_HYPERVISOR_H -extern struct shared_info *HYPERVISOR_shared_info; extern struct start_info *xen_start_info; #include diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index f88bb14da3f2..20e0de82 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -72,12 +72,6 @@ EXPORT_SYMBOL_GPL(xen_have_vector_callback); uint32_t xen_start_flags __attribute__((section(".data"))) = 0; EXPORT_SYMBOL(xen_start_flags); -/* - * Point at some empty memory to start with. We map the real shared_info - * page as soon as fixmap is up and running. - */ -struct shared_info *HYPERVISOR_shared_info = &xen_dummy_shared_info; - /* * Flag to determine whether vcpu info placement is available on all * VCPUs. 
We assume it is to start with, and then set it to zero on @@ -187,7 +181,7 @@ void xen_vcpu_info_reset(int cpu) { if (xen_vcpu_nr(cpu) < MAX_VIRT_CPUS) { per_cpu(xen_vcpu, cpu) = - &HYPERVISOR_shared_info->vcpu_info[xen_vcpu_nr(cpu)]; + &xh_default->HYPERVISOR_shared_info->vcpu_info[xen_vcpu_nr(cpu)]; } else { /* Set to NULL so that if somebody accesses it we get an OOPS */ per_cpu(xen_vcpu, cpu) = NULL; @@ -200,7 +194,7 @@ int xen_vcpu_setup(int cpu) int err; struct vcpu_info *vcpup; - BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); + BUG_ON(xh_default->HYPERVISOR_shared_info == &xen_dummy_shared_info); /* * This path is called on PVHVM at bootup (xen_hvm_smp_prepare_boot_cpu) diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index a118b61a1a8a..0e53363f9d1f 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -26,21 +26,25 @@ #include "mmu.h" #include "smp.h" -static unsigned long shared_info_pfn; - -void xen_hvm_init_shared_info(void) +static void xen_hvm_init_shared_info(xenhost_t *xh) { struct xen_add_to_physmap xatp; xatp.domid = DOMID_SELF; xatp.idx = 0; xatp.space = XENMAPSPACE_shared_info; - xatp.gpfn = shared_info_pfn; - if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) + xatp.gpfn = xh->shared_info_pfn; + if (hypervisor_memory_op(xh, XENMEM_add_to_physmap, &xatp)) BUG(); } -static void __init reserve_shared_info(void) +static void xen_hvm_reset_shared_info(xenhost_t *xh) +{ + early_memunmap(xh->HYPERVISOR_shared_info, PAGE_SIZE); + xh->HYPERVISOR_shared_info = __va(PFN_PHYS(xh->shared_info_pfn)); +} + +static void __init reserve_shared_info(xenhost_t *xh) { u64 pa; @@ -58,16 +62,18 @@ static void __init reserve_shared_info(void) pa += PAGE_SIZE) ; - shared_info_pfn = PHYS_PFN(pa); + xh->shared_info_pfn = PHYS_PFN(pa); memblock_reserve(pa, PAGE_SIZE); - HYPERVISOR_shared_info = early_memremap(pa, PAGE_SIZE); + xh->HYPERVISOR_shared_info = early_memremap(pa, PAGE_SIZE); } static void __init xen_hvm_init_mem_mapping(void) { - early_memunmap(HYPERVISOR_shared_info, PAGE_SIZE); - HYPERVISOR_shared_info = __va(PFN_PHYS(shared_info_pfn)); + xenhost_t **xh; + + for_each_xenhost(xh) + xenhost_reset_shared_info(*xh); /* * The virtual address of the shared_info page has changed, so @@ -79,6 +85,7 @@ static void __
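In short, each xenhost now owns its shared_info mapping and the setup/reset steps become ops, so a nested configuration can bring up both mappings with the same loop. A minimal sketch: the fields and the xenhost_reset_shared_info() wrapper mirror the diff, while the op names show the assumed shape of the abstraction.

struct shared_info;

typedef struct xenhost xenhost_t;

typedef struct xenhost_ops {
        void (*setup_shared_info)(xenhost_t *xh);
        void (*reset_shared_info)(xenhost_t *xh);
} xenhost_ops_t;

struct xenhost {
        const xenhost_ops_t *ops;
        unsigned long shared_info_pfn;
        struct shared_info *HYPERVISOR_shared_info;
};

static inline void xenhost_reset_shared_info(xenhost_t *xh)
{
        xh->ops->reset_shared_info(xh);
}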
[RFC PATCH 01/16] x86/xen: add xenhost_t interface
Add xenhost_t which will serve as an abstraction over Xen interfaces. It co-exists with the PV/HVM/PVH abstractions (x86_init, hypervisor_x86, pv_ops etc) and is meant to capture mechanisms for communication with Xen so we could have different types of underlying Xen: regular, local, and nested. Also add xenhost_register() and stub registration in the various guest types. Signed-off-by: Ankur Arora --- arch/x86/xen/Makefile| 1 + arch/x86/xen/enlighten_hvm.c | 13 + arch/x86/xen/enlighten_pv.c | 16 ++ arch/x86/xen/enlighten_pvh.c | 12 + arch/x86/xen/xenhost.c | 75 include/xen/xen.h| 3 ++ include/xen/xenhost.h| 95 7 files changed, 215 insertions(+) create mode 100644 arch/x86/xen/xenhost.c create mode 100644 include/xen/xenhost.h diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 084de77a109e..564b4dddbc15 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -18,6 +18,7 @@ obj-y += mmu.o obj-y += time.o obj-y += grant-table.o obj-y += suspend.o +obj-y += xenhost.o obj-$(CONFIG_XEN_PVHVM)+= enlighten_hvm.o obj-$(CONFIG_XEN_PVHVM)+= mmu_hvm.o diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index 0e75642d42a3..100452f4f44c 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -5,6 +5,7 @@ #include #include +#include #include #include #include @@ -82,6 +83,12 @@ static void __init xen_hvm_init_mem_mapping(void) xen_vcpu_info_reset(0); } +xenhost_ops_t xh_hvm_ops = { +}; + +xenhost_ops_t xh_hvm_nested_ops = { +}; + static void __init init_hvm_pv_info(void) { int major, minor; @@ -179,6 +186,12 @@ static void __init xen_hvm_guest_init(void) { if (xen_pv_domain()) return; + /* +* We need only xenhost_r1 for HVM guests since they cannot be +* driver domain (?) or dom0. +*/ + if (!xen_pvh_domain()) + xenhost_register(xenhost_r1, &xh_hvm_ops); init_hvm_pv_info(); diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index c54a493e139a..bb6e811c1525 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -36,6 +36,7 @@ #include #include +#include #include #include #include @@ -1188,6 +1189,12 @@ static void __init xen_dom0_set_legacy_features(void) x86_platform.legacy.rtc = 1; } +xenhost_ops_t xh_pv_ops = { +}; + +xenhost_ops_t xh_pv_nested_ops = { +}; + /* First C function to be called on Xen boot */ asmlinkage __visible void __init xen_start_kernel(void) { @@ -1198,6 +1205,15 @@ asmlinkage __visible void __init xen_start_kernel(void) if (!xen_start_info) return; + xenhost_register(xenhost_r1, &xh_pv_ops); + + /* +* Detect in some implementation defined manner whether this is +* nested or not. +*/ + if (xen_driver_domain() && xen_nested()) + xenhost_register(xenhost_r2, &xh_pv_nested_ops); + xen_domain_type = XEN_PV_DOMAIN; xen_start_flags = xen_start_info->flags; diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c index 35b7599d2d0b..826c296d27a3 100644 --- a/arch/x86/xen/enlighten_pvh.c +++ b/arch/x86/xen/enlighten_pvh.c @@ -8,6 +8,7 @@ #include #include +#include #include #include @@ -21,11 +22,22 @@ */ bool xen_pvh __attribute__((section(".data"))) = 0; +extern xenhost_ops_t xh_hvm_ops, xh_hvm_nested_ops; + void __init xen_pvh_init(void) { u32 msr; u64 pfn; + xenhost_register(xenhost_r1, &xh_hvm_ops); + + /* +* Detect in some implementation defined manner whether this is +* nested or not. 
+*/ + if (xen_driver_domain() && xen_nested()) + xenhost_register(xenhost_r2, &xh_hvm_nested_ops); + xen_pvh = 1; xen_start_flags = pvh_start_info.flags; diff --git a/arch/x86/xen/xenhost.c b/arch/x86/xen/xenhost.c new file mode 100644 index ..ca90acd7687e --- /dev/null +++ b/arch/x86/xen/xenhost.c @@ -0,0 +1,75 @@ +#include +#include +#include +#include + +xenhost_t xenhosts[2]; +/* + * xh_default: interface to the regular hypervisor. xenhost_type is xenhost_r0 + * or xenhost_r1. + * + * xh_remote: interface to remote hypervisor. Needed for PV driver support on + * L1-dom0/driver-domain for nested Xen. xenhost_type is xenhost_r2. + */ +xenhost_t *xh_default = (xenhost_t *) &xenhosts[0]; +xenhost_t *xh_remote = (xenhost_t *) &xenhosts[1]; + +/* + * Exported for use of for_each_xenhost(). + */ +EXPORT_SYMBOL_GPL(xenhosts); + +/* + * Some places refer directly to a specific ty
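For reference, a reduced sketch of what registration amounts to: guest bring-up registers the default interface (r0 or r1) first and, on a nested driver domain, a remote (r2) one as well, and the two land in fixed slots. The array/pointer names follow xenhost.c above; the body of xenhost_register() here is illustrative, not the series' exact code.

typedef struct xenhost_ops xenhost_ops_t;

enum xenhost_type { xenhost_invalid = 0, xenhost_r0, xenhost_r1, xenhost_r2 };

typedef struct {
        enum xenhost_type type;
        xenhost_ops_t *ops;
} xenhost_t;

xenhost_t xenhosts[2];
xenhost_t *xh_default = &xenhosts[0];   /* regular or local Xen */
xenhost_t *xh_remote  = &xenhosts[1];   /* L0-Xen, for nested setups */

void xenhost_register(enum xenhost_type type, xenhost_ops_t *ops)
{
        xenhost_t *xh = (type == xenhost_r2) ? xh_remote : xh_default;

        xh->type = type;
        xh->ops = ops;
}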
[RFC PATCH 11/16] xen/grant-table: make grant-table xenhost aware
Largely mechanical changes: the exported grant table symbols now take xenhost_t * as a parameter. Also, move the grant table global state inside xenhost_t. If there's more than one xenhost, then initialize both. Signed-off-by: Ankur Arora --- arch/x86/xen/grant-table.c | 71 +++-- drivers/xen/grant-table.c | 611 + include/xen/grant_table.h | 72 ++--- include/xen/xenhost.h | 11 + 4 files changed, 443 insertions(+), 322 deletions(-) diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c index ecb0d5450334..8f4b071427f9 100644 --- a/arch/x86/xen/grant-table.c +++ b/arch/x86/xen/grant-table.c @@ -23,48 +23,54 @@ #include -static struct gnttab_vm_area { +struct gnttab_vm_area { struct vm_struct *area; pte_t **ptes; -} gnttab_shared_vm_area, gnttab_status_vm_area; +}; -int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes, - unsigned long max_nr_gframes, - void **__shared) +int arch_gnttab_map_shared(xenhost_t *xh, unsigned long *frames, + unsigned long nr_gframes, + unsigned long max_nr_gframes, + void **__shared) { void *shared = *__shared; unsigned long addr; unsigned long i; if (shared == NULL) - *__shared = shared = gnttab_shared_vm_area.area->addr; + *__shared = shared = ((struct gnttab_vm_area *) + xh->gnttab_shared_vm_area)->area->addr; addr = (unsigned long)shared; for (i = 0; i < nr_gframes; i++) { - set_pte_at(&init_mm, addr, gnttab_shared_vm_area.ptes[i], - mfn_pte(frames[i], PAGE_KERNEL)); + set_pte_at(&init_mm, addr, + ((struct gnttab_vm_area *) xh->gnttab_shared_vm_area)->ptes[i], + mfn_pte(frames[i], PAGE_KERNEL)); addr += PAGE_SIZE; } return 0; } -int arch_gnttab_map_status(uint64_t *frames, unsigned long nr_gframes, - unsigned long max_nr_gframes, - grant_status_t **__shared) +int arch_gnttab_map_status(xenhost_t *xh, uint64_t *frames, + unsigned long nr_gframes, + unsigned long max_nr_gframes, + grant_status_t **__shared) { grant_status_t *shared = *__shared; unsigned long addr; unsigned long i; if (shared == NULL) - *__shared = shared = gnttab_status_vm_area.area->addr; + *__shared = shared = ((struct gnttab_vm_area *) + xh->gnttab_status_vm_area)->area->addr; addr = (unsigned long)shared; for (i = 0; i < nr_gframes; i++) { - set_pte_at(&init_mm, addr, gnttab_status_vm_area.ptes[i], + set_pte_at(&init_mm, addr, ((struct gnttab_vm_area *) + xh->gnttab_status_vm_area)->ptes[i], mfn_pte(frames[i], PAGE_KERNEL)); addr += PAGE_SIZE; } @@ -72,16 +78,17 @@ int arch_gnttab_map_status(uint64_t *frames, unsigned long nr_gframes, return 0; } -void arch_gnttab_unmap(void *shared, unsigned long nr_gframes) +void arch_gnttab_unmap(xenhost_t *xh, void *shared, unsigned long nr_gframes) { pte_t **ptes; unsigned long addr; unsigned long i; - if (shared == gnttab_status_vm_area.area->addr) - ptes = gnttab_status_vm_area.ptes; + if (shared == ((struct gnttab_vm_area *) + xh->gnttab_status_vm_area)->area->addr) + ptes = ((struct gnttab_vm_area *) xh->gnttab_status_vm_area)->ptes; else - ptes = gnttab_shared_vm_area.ptes; + ptes = ((struct gnttab_vm_area *) xh->gnttab_shared_vm_area)->ptes; addr = (unsigned long)shared; @@ -112,14 +119,15 @@ static void arch_gnttab_vfree(struct gnttab_vm_area *area) kfree(area->ptes); } -int arch_gnttab_init(unsigned long nr_shared, unsigned long nr_status) +int arch_gnttab_init(xenhost_t *xh, unsigned long nr_shared, unsigned long nr_status) { int ret; if (!xen_pv_domain()) return 0; - ret = arch_gnttab_valloc(&gnttab_shared_vm_area, nr_shared); + ret = arch_gnttab_valloc((struct gnttab_vm_area *) + xh->gnttab_shared_vm_area, 
nr_shared); if (ret < 0) return ret; @@ -127,13 +135,15 @@ int arch_gnttab_init(unsigned long nr_shared, unsigned long nr_status) * Always allocate the space for the status frames in case * we're migrated
[RFC PATCH 04/16] x86/xen: hypercall support for xenhost_t
Allow for different hypercall implementations for different xenhost types. Nested xenhost, which has two underlying xenhosts, can use both simultaneously. The hypercall macros (HYPERVISOR_*) implicitly use the default xenhost.x A new macro (hypervisor_*) takes xenhost_t * as a parameter and does the right thing. TODO: - Multicalls for now assume the default xenhost - xen_hypercall_* symbols are only generated for the default xenhost. Signed-off-by: Ankur Arora --- arch/x86/include/asm/xen/hypercall.h | 233 ++- arch/x86/xen/enlighten.c | 3 - arch/x86/xen/enlighten_hvm.c | 23 ++- arch/x86/xen/enlighten_pv.c | 13 +- arch/x86/xen/enlighten_pvh.c | 9 +- arch/x86/xen/xen-head.S | 3 + drivers/xen/fallback.c | 8 +- include/xen/xenhost.h| 23 +++ 8 files changed, 218 insertions(+), 97 deletions(-) diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index 1a3cd6680e6f..e138f9c36a5a 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -51,6 +51,7 @@ #include #include #include +#include struct xen_dm_op_buf; @@ -88,11 +89,11 @@ struct xen_dm_op_buf; struct hypercall_entry { char _entry[32]; }; extern struct hypercall_entry xen_hypercall_page[128]; -extern struct hypercall_entry *hypercall_page; +extern struct hypercall_entry xen_hypercall_page2[128]; #define __HYPERCALLCALL_NOSPEC -#define __HYPERCALL_ENTRY(x) \ - [thunk_target] "0" (hypercall_page + __HYPERVISOR_##x) +#define __HYPERCALL_ENTRY(xh, x) \ + [thunk_target] "0" (xh->hypercall_page + __HYPERVISOR_##x) #ifdef CONFIG_X86_32 #define __HYPERCALL_RETREG "eax" @@ -144,57 +145,57 @@ extern struct hypercall_entry *hypercall_page; #define __HYPERCALL_CLOBBER1 __HYPERCALL_CLOBBER2, __HYPERCALL_ARG2REG #define __HYPERCALL_CLOBBER0 __HYPERCALL_CLOBBER1, __HYPERCALL_ARG1REG -#define _hypercall0(type, name) \ +#define _hypercall0(xh, type, name)\ ({ \ __HYPERCALL_DECLS; \ __HYPERCALL_0ARG(); \ asm volatile (__HYPERCALL \ : __HYPERCALL_0PARAM \ - : __HYPERCALL_ENTRY(name) \ + : __HYPERCALL_ENTRY(xh, name) \ : __HYPERCALL_CLOBBER0); \ (type)__res;\ }) -#define _hypercall1(type, name, a1)\ +#define _hypercall1(xh, type, name, a1) \ ({ \ __HYPERCALL_DECLS; \ __HYPERCALL_1ARG(a1); \ asm volatile (__HYPERCALL \ : __HYPERCALL_1PARAM \ - : __HYPERCALL_ENTRY(name) \ + : __HYPERCALL_ENTRY(xh, name) \ : __HYPERCALL_CLOBBER1); \ (type)__res;\ }) -#define _hypercall2(type, name, a1, a2) \ +#define _hypercall2(xh, type, name, a1, a2)\ ({ \ __HYPERCALL_DECLS; \ __HYPERCALL_2ARG(a1, a2); \ asm volatile (__HYPERCALL \ : __HYPERCALL_2PARAM \ - : __HYPERCALL_ENTRY(name) \ + : __HYPERCALL_ENTRY(xh, name) \ : __HYPERCALL_CLOBBER2); \ (type)__res;\ }) -#define _hypercall3(type, name, a1, a2, a3)\ +#define _hypercall3(xh, type, name, a1, a2, a3) \ ({ \ __HYPERCALL_DECLS; \ __HYPERCALL_3ARG(a1, a2, a3); \
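The end result is easiest to see as two layers: the new lower-case hypervisor_*() helpers take an explicit xenhost and index into that xenhost's hypercall page, while the classic upper-case HYPERVISOR_*() names keep their old signatures by defaulting to xh_default. A plain-C sketch of one such pair: the call through the page is shown as a function pointer purely for illustration, since the real macros use inline asm; the hypercall number is the one from xen/interface/xen.h.

struct hypercall_entry { char _entry[32]; };

typedef struct xenhost {
        struct hypercall_entry *hypercall_page;
} xenhost_t;

extern xenhost_t *xh_default;

#define __HYPERVISOR_xen_version 17     /* from xen/interface/xen.h */

static inline long hypervisor_xen_version(xenhost_t *xh, int cmd, void *arg)
{
        long (*fn)(int, void *) = (long (*)(int, void *))
                (xh->hypercall_page + __HYPERVISOR_xen_version);

        return fn(cmd, arg);            /* stands in for the __HYPERCALL asm */
}

#define HYPERVISOR_xen_version(cmd, arg) \
        hypervisor_xen_version(xh_default, (cmd), (arg))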
[RFC PATCH 05/16] x86/xen: add feature support in xenhost_t
With nested xenhosts, both the xenhosts could have different supported xen_features. Add support for probing both. In addition, validate that features are compatible across xenhosts. For runtime feature checking, the code uses xen_feature() with the default xenhost. This should be good enough because we do feature validation early which guarantees that the features of interest are compatible. Features not of interest, are related to MMU, clock, pirq, etc where the interface to L0-Xen should not matter. Signed-off-by: Ankur Arora --- arch/x86/xen/enlighten_hvm.c | 15 +++ arch/x86/xen/enlighten_pv.c | 14 ++ drivers/xen/features.c | 33 +++-- include/xen/features.h | 17 ++--- include/xen/xenhost.h| 10 ++ 5 files changed, 72 insertions(+), 17 deletions(-) diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index f84941d6944e..a118b61a1a8a 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -119,17 +119,24 @@ static void __init init_hvm_pv_info(void) xen_domain_type = XEN_HVM_DOMAIN; - /* PVH set up hypercall page in xen_prepare_pvh(). */ if (xen_pvh_domain()) pv_info.name = "Xen PVH"; - else { + else pv_info.name = "Xen HVM"; - for_each_xenhost(xh) + for_each_xenhost(xh) { + /* PVH set up hypercall page in xen_prepare_pvh(). */ + if (!xen_pvh_domain()) xenhost_setup_hypercall_page(*xh); + xen_setup_features(*xh); } - xen_setup_features(); + /* +* Check if features are compatible across L1-Xen and L0-Xen; +* If not, get rid of xenhost_r2. +*/ + if (xen_validate_features() == false) + __xenhost_unregister(xenhost_r2); cpuid(base + 4, &eax, &ebx, &ecx, &edx); if (eax & XEN_HVM_CPUID_VCPU_ID_PRESENT) diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index a2c07cc71498..484968ff16a4 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -1236,13 +1236,19 @@ asmlinkage __visible void __init xen_start_kernel(void) if (xen_driver_domain() && xen_nested()) xenhost_register(xenhost_r2, &xh_pv_nested_ops); - for_each_xenhost(xh) - xenhost_setup_hypercall_page(*xh); - xen_domain_type = XEN_PV_DOMAIN; xen_start_flags = xen_start_info->flags; - xen_setup_features(); + for_each_xenhost(xh) { + xenhost_setup_hypercall_page(*xh); + xen_setup_features(*xh); + } + /* +* Check if features are compatible across L1-Xen and L0-Xen; +* If not, get rid of xenhost_r2. +*/ + if (xen_validate_features() == false) + __xenhost_unregister(xenhost_r2); /* Install Xen paravirt ops */ pv_info = xen_info; diff --git a/drivers/xen/features.c b/drivers/xen/features.c index d7d34fdfc993..b4fba808ebae 100644 --- a/drivers/xen/features.c +++ b/drivers/xen/features.c @@ -15,19 +15,40 @@ #include #include -u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly; -EXPORT_SYMBOL_GPL(xen_features); - -void xen_setup_features(void) +void xen_setup_features(xenhost_t *xh) { struct xen_feature_info fi; int i, j; for (i = 0; i < XENFEAT_NR_SUBMAPS; i++) { fi.submap_idx = i; - if (HYPERVISOR_xen_version(XENVER_get_features, &fi) < 0) + if (hypervisor_xen_version(xh, XENVER_get_features, &fi) < 0) break; for (j = 0; j < 32; j++) - xen_features[i * 32 + j] = !!(fi.submap & 1<features[i * 32 + j] = !!(fi.submap & 1<features and xh_remote->features for +* compatibility. Relevant features should be compatible +* or we are asking for trouble. +*/ + fail += __xen_feature(xh_default, XENFEAT_auto_translated_physmap) != + __xen_feature(xh_remote, XENFEAT_auto_translated_physmap); + + /* We would like callbacks via hvm_callback_vector. 
*/ + fail += __xen_feature(xh_default, XENFEAT_hvm_callback_vector) == 0; + fail += __xen_feature(xh_remote, XENFEAT_hvm_callback_vector) == 0; + + if (fail) + return false; + } + + return fail ? false : true; +} diff --git a/include/xen/features.h b/include/xen/features.h index e4cb464386a9..63e6735ed6a3 100644 --- a/include/xen/features.h +++ b/include/xen/features.h @@ -11,14 +11,25 @@ #define __XEN_FEATURES_H__ #include +#include -void xen_setup_features(void); +void xen_setup_features(xenhost_t *xh); -extern u8 xe
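Condensed, the feature handling becomes: each xenhost carries its own feature bitmap filled from its own XENVER_get_features, runtime checks keep using the default xenhost, and a one-time validation insists that the features a nested setup depends on agree. A sketch with the relevant flags; the numeric values are the ones from xen/interface/features.h, everything else is reduced for illustration.

#include <stdbool.h>
#include <stdint.h>

#define XENFEAT_NR_SUBMAPS              1
#define XENFEAT_auto_translated_physmap 2       /* values as in Xen's features.h */
#define XENFEAT_hvm_callback_vector     8

typedef struct xenhost {
        uint8_t features[XENFEAT_NR_SUBMAPS * 32];
} xenhost_t;

extern xenhost_t *xh_default, *xh_remote;

static inline int __xen_feature(xenhost_t *xh, int flag)
{
        return xh->features[flag];
}

#define xen_feature(flag) __xen_feature(xh_default, (flag))

static bool features_compatible(void)
{
        /* both hosts must agree on auto-translation and offer vector callbacks */
        return __xen_feature(xh_default, XENFEAT_auto_translated_physmap) ==
               __xen_feature(xh_remote,  XENFEAT_auto_translated_physmap) &&
               __xen_feature(xh_default, XENFEAT_hvm_callback_vector) &&
               __xen_feature(xh_remote,  XENFEAT_hvm_callback_vector);
}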
Re: [RFC PATCH 10/16] xen/balloon: support ballooning in xenhost_t
On 6/17/19 2:28 AM, Juergen Gross wrote: On 09.05.19 19:25, Ankur Arora wrote: Xen ballooning uses hollow struct pages (with the underlying GFNs being populated/unpopulated via hypercalls) which are used by the grant logic to map grants from other domains. This patch allows the default xenhost to provide an alternate ballooning allocation mechanism. This is expected to be useful for local xenhosts (type xenhost_r0) because unlike Xen, where there is an external hypervisor which can change the memory underneath a GFN, that is not possible when the hypervisor is running in the same address space as the entity doing the ballooning. Co-developed-by: Ankur Arora Signed-off-by: Joao Martins Signed-off-by: Ankur Arora --- arch/x86/xen/enlighten_hvm.c | 7 +++ arch/x86/xen/enlighten_pv.c | 8 drivers/xen/balloon.c | 19 --- drivers/xen/grant-table.c | 4 ++-- drivers/xen/privcmd.c | 4 ++-- drivers/xen/xen-selfballoon.c | 2 ++ drivers/xen/xenbus/xenbus_client.c | 6 +++--- drivers/xen/xlate_mmu.c | 4 ++-- include/xen/balloon.h | 4 ++-- include/xen/xenhost.h | 19 +++ 10 files changed, 63 insertions(+), 14 deletions(-) diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 5ef4d6ad920d..08becf574743 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -63,6 +63,7 @@ #include #include +#include #include #include @@ -583,12 +584,21 @@ static int add_ballooned_pages(int nr_pages) * @pages: pages returned * @return 0 on success, error otherwise */ -int alloc_xenballooned_pages(int nr_pages, struct page **pages) +int alloc_xenballooned_pages(xenhost_t *xh, int nr_pages, struct page **pages) { int pgno = 0; struct page *page; int ret; + /* + * xenmem transactions for remote xenhost are disallowed. + */ + if (xh->type == xenhost_r2) + return -EINVAL; Why don't you set a dummy function returning -EINVAL into the xenhost_r2 structure instead? Will do. And, same for the two comments below. Ankur + + if (xh->ops->alloc_ballooned_pages) + return xh->ops->alloc_ballooned_pages(xh, nr_pages, pages); + Please make alloc_xenballooned_pages() an inline wrapper and use the current implmentaion as the default. This avoids another if (). mutex_lock(&balloon_mutex); balloon_stats.target_unpopulated += nr_pages; @@ -620,7 +630,7 @@ int alloc_xenballooned_pages(int nr_pages, struct page **pages) return 0; out_undo: mutex_unlock(&balloon_mutex); - free_xenballooned_pages(pgno, pages); + free_xenballooned_pages(xh, pgno, pages); return ret; } EXPORT_SYMBOL(alloc_xenballooned_pages); @@ -630,10 +640,13 @@ EXPORT_SYMBOL(alloc_xenballooned_pages); * @nr_pages: Number of pages * @pages: pages to return */ -void free_xenballooned_pages(int nr_pages, struct page **pages) +void free_xenballooned_pages(xenhost_t *xh, int nr_pages, struct page **pages) { int i; + if (xh->ops->free_ballooned_pages) + return xh->ops->free_ballooned_pages(xh, nr_pages, pages); + Same again: please use an inline wrapper. Juergen
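Making Juergen's two suggestions concrete, a minimal sketch could look like the following: an inline wrapper that keeps today's implementation as the default path, plus a stub xenhost_r2 op that rejects ballooning. The name xen_do_alloc_ballooned_pages() is an assumption standing in for the current implementation.

/* Sketch only; xen_do_alloc_ballooned_pages() is a placeholder for the
 * existing balloon.c implementation. */
static inline int alloc_xenballooned_pages(xenhost_t *xh, int nr_pages,
					   struct page **pages)
{
	if (xh->ops->alloc_ballooned_pages)
		return xh->ops->alloc_ballooned_pages(xh, nr_pages, pages);

	return xen_do_alloc_ballooned_pages(nr_pages, pages);
}

/* xenhost_r2: xenmem transactions are disallowed, so install a stub op. */
static int xenhost_r2_alloc_ballooned_pages(xenhost_t *xh, int nr_pages,
					    struct page **pages)
{
	return -EINVAL;
}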
Re: [RFC PATCH 11/16] xen/grant-table: make grant-table xenhost aware
On 6/17/19 2:36 AM, Juergen Gross wrote: On 09.05.19 19:25, Ankur Arora wrote: Largely mechanical changes: the exported grant table symbols now take xenhost_t * as a parameter. Also, move the grant table global state inside xenhost_t. If there's more than one xenhost, then initialize both. Signed-off-by: Ankur Arora --- arch/x86/xen/grant-table.c | 71 +++-- drivers/xen/grant-table.c | 611 + include/xen/grant_table.h | 72 ++--- include/xen/xenhost.h | 11 + 4 files changed, 443 insertions(+), 322 deletions(-) diff --git a/include/xen/xenhost.h b/include/xen/xenhost.h index 9e08627a9e3e..acee0c7872b6 100644 --- a/include/xen/xenhost.h +++ b/include/xen/xenhost.h @@ -129,6 +129,17 @@ typedef struct { const struct evtchn_ops *evtchn_ops; int **evtchn_to_irq; }; + + /* grant table private state */ + struct { + /* private to drivers/xen/grant-table.c */ + void *gnttab_private; + + /* x86/xen/grant-table.c */ + void *gnttab_shared_vm_area; + void *gnttab_status_vm_area; + void *auto_xlat_grant_frames; Please use proper types here instead of void *. This avoids lots of casts. It is okay to just add anonymous struct definitions and keep the real struct layout local to grant table code. Will fix. Ankur Juergen
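A sketch of the typed layout Juergen is asking for: forward declarations in include/xen/xenhost.h, with the real struct definitions kept local to the grant-table code. The struct tags and field types below are assumptions.

/* include/xen/xenhost.h -- sketch only, tags and types are assumptions */
struct gnttab_private;		/* layout local to drivers/xen/grant-table.c */
struct gnttab_vm_area;		/* layout local to arch/x86/xen/grant-table.c */

	/* grant table private state inside xenhost_t */
	struct {
		struct gnttab_private *gnttab_private;
		struct gnttab_vm_area *gnttab_shared_vm_area;
		struct gnttab_vm_area *gnttab_status_vm_area;
		struct grant_frames   *auto_xlat_grant_frames;
	};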
Re: [RFC PATCH 12/16] xen/xenbus: support xenbus frontend/backend with xenhost_t
On 6/17/19 2:50 AM, Juergen Gross wrote: On 09.05.19 19:25, Ankur Arora wrote: As part of xenbus init, both frontend, backend interfaces need to talk on the correct xenbus. This might be a local xenstore (backend) or might be a XS_PV/XS_HVM interface (frontend) which needs to talk over xenbus with the remote xenstored. We bootstrap all of these with evtchn/gfn parameters from (*setup_xs)(). Given this we can do appropriate device discovery (in case of frontend) and device connectivity for the backend. Once done, we stash the xenhost_t * in xen_bus_type, xenbus_device or xenbus_watch and then the frontend and backend devices implicitly use the correct interface. The rest of patch is just changing the interfaces where needed. Signed-off-by: Ankur Arora --- drivers/block/xen-blkback/blkback.c | 10 +- drivers/net/xen-netfront.c | 14 +- drivers/pci/xen-pcifront.c | 4 +- drivers/xen/cpu_hotplug.c | 4 +- drivers/xen/manage.c | 28 +-- drivers/xen/xen-balloon.c | 8 +- drivers/xen/xenbus/xenbus.h | 45 ++-- drivers/xen/xenbus/xenbus_client.c | 32 +-- drivers/xen/xenbus/xenbus_comms.c | 121 +- drivers/xen/xenbus/xenbus_dev_backend.c | 30 ++- drivers/xen/xenbus/xenbus_dev_frontend.c | 22 +- drivers/xen/xenbus/xenbus_probe.c | 246 + drivers/xen/xenbus/xenbus_probe_backend.c | 19 +- drivers/xen/xenbus/xenbus_probe_frontend.c | 65 +++--- drivers/xen/xenbus/xenbus_xs.c | 188 +--- include/xen/xen-ops.h | 3 + include/xen/xenbus.h | 54 +++-- include/xen/xenhost.h | 20 ++ 18 files changed, 536 insertions(+), 377 deletions(-) diff --git a/drivers/xen/xenbus/xenbus_dev_frontend.c b/drivers/xen/xenbus/xenbus_dev_frontend.c index c3e201025ef0..d6e0c397c6a0 100644 --- a/drivers/xen/xenbus/xenbus_dev_frontend.c +++ b/drivers/xen/xenbus/xenbus_dev_frontend.c @@ -58,10 +58,14 @@ #include #include +#include +#include #include #include "xenbus.h" +static xenhost_t *xh; + /* * An element of a list of outstanding transactions, for which we're * still waiting a reply. 
@@ -312,13 +316,13 @@ static void xenbus_file_free(struct kref *kref) */ list_for_each_entry_safe(trans, tmp, &u->transactions, list) { - xenbus_transaction_end(trans->handle, 1); + xenbus_transaction_end(xh, trans->handle, 1); list_del(&trans->list); kfree(trans); } list_for_each_entry_safe(watch, tmp_watch, &u->watches, list) { - unregister_xenbus_watch(&watch->watch); + unregister_xenbus_watch(xh, &watch->watch); list_del(&watch->list); free_watch_adapter(watch); } @@ -450,7 +454,7 @@ static int xenbus_write_transaction(unsigned msg_type, (!strcmp(msg->body, "T") || !strcmp(msg->body, "F" return xenbus_command_reply(u, XS_ERROR, "EINVAL"); - rc = xenbus_dev_request_and_reply(&msg->hdr, u); + rc = xenbus_dev_request_and_reply(xh, &msg->hdr, u); if (rc && trans) { list_del(&trans->list); kfree(trans); @@ -489,7 +493,7 @@ static int xenbus_write_watch(unsigned msg_type, struct xenbus_file_priv *u) watch->watch.callback = watch_fired; watch->dev_data = u; - err = register_xenbus_watch(&watch->watch); + err = register_xenbus_watch(xh, &watch->watch); if (err) { free_watch_adapter(watch); rc = err; @@ -500,7 +504,7 @@ static int xenbus_write_watch(unsigned msg_type, struct xenbus_file_priv *u) list_for_each_entry(watch, &u->watches, list) { if (!strcmp(watch->token, token) && !strcmp(watch->watch.node, path)) { - unregister_xenbus_watch(&watch->watch); + unregister_xenbus_watch(xh, &watch->watch); list_del(&watch->list); free_watch_adapter(watch); break; @@ -618,8 +622,9 @@ static ssize_t xenbus_file_write(struct file *filp, static int xenbus_file_open(struct inode *inode, struct file *filp) { struct xenbus_file_priv *u; + struct xenstore_private *xs = xs_priv(xh); - if (xen_store_evtchn == 0) + if (xs->store_evtchn == 0) return -ENOENT; nonseekable_open(inode, filp); @@ -687,6 +692,11 @@ static int __init xenbus_init(void) if (!xen_domain()) return -ENODEV; + if (xen_driver_domain() && xen_nested()) + xh = xh_remote; + else + xh = xh_default; This precludes any mixed use of L0 and L1 frontends. With this
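For context, the "stash the xenhost_t *" pattern the changelog describes boils down to something like the sketch below; the exact field placement is an assumption.

/* Sketch: each bus remembers which xenstore it talks to, so devices and
 * watches registered on that bus resolve their xenhost implicitly. */
struct xen_bus_type {
	char *root;
	unsigned int levels;
	xenhost_t *xh;		/* xenstore this bus talks over */
	struct bus_type bus;
};

At init time the frontend/backend bus would get bound to the xenhost chosen in xenbus_init() (e.g. xenbus_frontend.xh = xh), after which register_xenbus_watch() and the probe paths can pick it up from there.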
Re: [RFC PATCH 13/16] drivers/xen: gnttab, evtchn, xenbus API changes
On 6/17/19 3:07 AM, Juergen Gross wrote: On 09.05.19 19:25, Ankur Arora wrote: Mechanical changes, now most of these calls take xenhost_t * as parameter. Co-developed-by: Joao Martins Signed-off-by: Ankur Arora --- drivers/xen/cpu_hotplug.c | 14 ++--- drivers/xen/gntalloc.c | 13 drivers/xen/gntdev.c | 16 +++ drivers/xen/manage.c | 37 ++- drivers/xen/platform-pci.c | 12 +++- drivers/xen/sys-hypervisor.c | 12 drivers/xen/xen-balloon.c | 10 +++--- drivers/xen/xenfs/xenstored.c | 7 --- 8 files changed, 73 insertions(+), 48 deletions(-) diff --git a/drivers/xen/cpu_hotplug.c b/drivers/xen/cpu_hotplug.c index afeb94446d34..4a05bc028956 100644 --- a/drivers/xen/cpu_hotplug.c +++ b/drivers/xen/cpu_hotplug.c @@ -31,13 +31,13 @@ static void disable_hotplug_cpu(int cpu) unlock_device_hotplug(); } -static int vcpu_online(unsigned int cpu) +static int vcpu_online(xenhost_t *xh, unsigned int cpu) Do we really need xenhost for cpu on/offlinig? I was in two minds about this. We only need it for the xenbus interfaces which could very well have been just xh_default. However, the xenhost is part of the xenbus_watch state, so I thought it is easier to percolate that down instead of adding xh_default all over the place. diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index 9a69d955dd5c..1655d0a039fd 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -227,14 +227,14 @@ static void shutdown_handler(struct xenbus_watch *watch, return; again: - err = xenbus_transaction_start(xh_default, &xbt); + err = xenbus_transaction_start(watch->xh, &xbt); if (err) return; - str = (char *)xenbus_read(xh_default, xbt, "control", "shutdown", NULL); + str = (char *)xenbus_read(watch->xh, xbt, "control", "shutdown", NULL); /* Ignore read errors and empty reads. */ if (XENBUS_IS_ERR_READ(str)) { - xenbus_transaction_end(xh_default, xbt, 1); + xenbus_transaction_end(watch->xh, xbt, 1); return; } @@ -245,9 +245,9 @@ static void shutdown_handler(struct xenbus_watch *watch, /* Only acknowledge commands which we are prepared to handle. 
*/ if (idx < ARRAY_SIZE(shutdown_handlers)) - xenbus_write(xh_default, xbt, "control", "shutdown", ""); + xenbus_write(watch->xh, xbt, "control", "shutdown", ""); - err = xenbus_transaction_end(xh_default, xbt, 0); + err = xenbus_transaction_end(watch->xh, xbt, 0); if (err == -EAGAIN) { kfree(str); goto again; @@ -272,10 +272,10 @@ static void sysrq_handler(struct xenbus_watch *watch, const char *path, int err; again: - err = xenbus_transaction_start(xh_default, &xbt); + err = xenbus_transaction_start(watch->xh, &xbt); if (err) return; - err = xenbus_scanf(xh_default, xbt, "control", "sysrq", "%c", &sysrq_key); + err = xenbus_scanf(watch->xh, xbt, "control", "sysrq", "%c", &sysrq_key); if (err < 0) { /* * The Xenstore watch fires directly after registering it and @@ -287,21 +287,21 @@ static void sysrq_handler(struct xenbus_watch *watch, const char *path, if (err != -ENOENT && err != -ERANGE) pr_err("Error %d reading sysrq code in control/sysrq\n", err); - xenbus_transaction_end(xh_default, xbt, 1); + xenbus_transaction_end(watch->xh, xbt, 1); return; } if (sysrq_key != '\0') { - err = xenbus_printf(xh_default, xbt, "control", "sysrq", "%c", '\0'); + err = xenbus_printf(watch->xh, xbt, "control", "sysrq", "%c", '\0'); if (err) { pr_err("%s: Error %d writing sysrq in control/sysrq\n", __func__, err); - xenbus_transaction_end(xh_default, xbt, 1); + xenbus_transaction_end(watch->xh, xbt, 1); return; } } - err = xenbus_transaction_end(xh_default, xbt, 0); + err = xenbus_transaction_end(watch->xh, xbt, 0); if (err == -EAGAIN) goto again; @@ -324,14 +324,14 @@ static struct notifier_block xen_reboot_nb = { .notifier_call = poweroff_nb, }; -static int setup_shutdown_watcher(void) +static int setup_shutdown_watcher(xenhost_t *xh) I think shutdown is purely local, too. Yes, I introduced xenhost for the same reason as above. I agree that either of these cases (and similar others) have no use for the concept of xenhost. Do you think it makes sense for these to pass NULL instead and the underlying interface would just assume xh_default. Ankur Juergen
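If the preference is to not thread a xenhost through these purely local paths, the "pass NULL and assume xh_default" idea could be a tiny helper in the interface layer; a sketch (the helper name is an assumption):

/* Sketch: callers that don't care pass NULL and get the default xenhost. */
static inline xenhost_t *xh_resolve(xenhost_t *xh)
{
	return xh ?: xh_default;
}

Each xh-taking entry point (xenbus_transaction_start(), xenbus_scanf(), and so on) would call this first, so watch-independent users such as the shutdown/sysrq handlers could simply pass NULL.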
Re: [RFC PATCH 16/16] xen/grant-table: host_addr fixup in mapping on xenhost_r0
On 6/17/19 3:55 AM, Juergen Gross wrote: On 09.05.19 19:25, Ankur Arora wrote: Xenhost type xenhost_r0 does not support standard GNTTABOP_map_grant_ref semantics (map a gref onto a specified host_addr). That's because since the hypervisor is local (same address space as the caller of GNTTABOP_map_grant_ref), there is no external entity that could map an arbitrary page underneath an arbitrary address. To handle this, the GNTTABOP_map_grant_ref hypercall on xenhost_r0 treats the host_addr as an OUT parameter instead of IN and expects the gnttab_map_refs() and similar to fixup any state that caches the value of host_addr from before the hypercall. Accordingly gnttab_map_refs() now adds two parameters, a fixup function and a pointer to cached maps to fixup: int gnttab_map_refs(xenhost_t *xh, struct gnttab_map_grant_ref *map_ops, struct gnttab_map_grant_ref *kmap_ops, - struct page **pages, unsigned int count) + struct page **pages, gnttab_map_fixup_t map_fixup_fn, + void **map_fixup[], unsigned int count) The reason we use a fixup function and not an additional mapping op in the xenhost_t is because, depending on the caller, what we are fixing might be different: blkback, netback for instance cache host_addr in via a struct page *, while __xenbus_map_ring() caches a phys_addr. This patch fixes up xen-blkback and xen-gntdev drivers. TODO: - also rewrite gnttab_batch_map() and __xenbus_map_ring(). - modify xen-netback, scsiback, pciback etc Co-developed-by: Joao Martins Signed-off-by: Ankur Arora Without seeing the __xenbus_map_ring() modification it is impossible to do a proper review of this patch. Will do in v2. Ankur Juergen
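To make the fixup shape more concrete, a rough sketch for the blkback-style case that caches a struct page * is below. The quoted text does not show the gnttab_map_fixup_t prototype, so the types here are assumptions.

/* Sketch only: on xenhost_r0, host_addr is an OUT value after the
 * hypercall, so refresh whatever the caller cached before the call. */
typedef void (*gnttab_map_fixup_t)(struct gnttab_map_grant_ref *map,
				   void **cached);

static void blkback_map_fixup(struct gnttab_map_grant_ref *map, void **cached)
{
	/* blkback caches a struct page *; replace it with the page now
	 * backing map->host_addr. */
	*cached = virt_to_page((void *)(unsigned long)map->host_addr);
}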
Re: [RFC PATCH 14/16] xen/blk: gnttab, evtchn, xenbus API changes
On 6/17/19 3:14 AM, Juergen Gross wrote: On 09.05.19 19:25, Ankur Arora wrote: For the most part, we now pass xenhost_t * as a parameter. Co-developed-by: Joao Martins Signed-off-by: Ankur Arora I don't see how this can be a patch on its own. Yes, the reason this was separate was that given this was an RFC, I didn't want to pollute the logic page with lots of mechanical changes. The only way to be able to use a patch for each driver would be to keep the original grant-, event- and xenbus-interfaces and add the new ones taking xenhost * with a new name. The original interfaces could then use xenhost_default and you can switch them to the new interfaces one by one. The last patch could then remove the old interfaces when there is no user left. Yes, this makes sense. Ankur Juergen
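Juergen's migration path translates into something like the sketch below: keep the old signature as a thin wrapper over a new xh-aware variant until every caller is converted, then drop the wrapper. The _xh suffix is an assumption.

/* New interface, takes an explicit xenhost. */
int gnttab_grant_foreign_access_xh(xenhost_t *xh, domid_t domid,
				   unsigned long frame, int readonly);

/* Old interface, kept during the transition; always uses the default. */
static inline int gnttab_grant_foreign_access(domid_t domid,
					      unsigned long frame, int readonly)
{
	return gnttab_grant_foreign_access_xh(xh_default, domid, frame, readonly);
}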
Re: [RFC PATCH 01/16] x86/xen: add xenhost_t interface
On 2019-06-07 8:04 a.m., Juergen Gross wrote: On 09.05.19 19:25, Ankur Arora wrote: Add xenhost_t which will serve as an abstraction over Xen interfaces. It co-exists with the PV/HVM/PVH abstractions (x86_init, hypervisor_x86, pv_ops etc) and is meant to capture mechanisms for communication with Xen so we could have different types of underlying Xen: regular, local, and nested. Also add xenhost_register() and stub registration in the various guest types. Signed-off-by: Ankur Arora --- arch/x86/xen/Makefile | 1 + arch/x86/xen/enlighten_hvm.c | 13 + arch/x86/xen/enlighten_pv.c | 16 ++ arch/x86/xen/enlighten_pvh.c | 12 + arch/x86/xen/xenhost.c | 75 include/xen/xen.h | 3 ++ include/xen/xenhost.h | 95 7 files changed, 215 insertions(+) create mode 100644 arch/x86/xen/xenhost.c create mode 100644 include/xen/xenhost.h diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 084de77a109e..564b4dddbc15 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -18,6 +18,7 @@ obj-y += mmu.o obj-y += time.o obj-y += grant-table.o obj-y += suspend.o +obj-y += xenhost.o obj-$(CONFIG_XEN_PVHVM) += enlighten_hvm.o obj-$(CONFIG_XEN_PVHVM) += mmu_hvm.o diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index 0e75642d42a3..100452f4f44c 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -5,6 +5,7 @@ #include #include +#include #include #include #include @@ -82,6 +83,12 @@ static void __init xen_hvm_init_mem_mapping(void) xen_vcpu_info_reset(0); } +xenhost_ops_t xh_hvm_ops = { +}; + +xenhost_ops_t xh_hvm_nested_ops = { +}; + static void __init init_hvm_pv_info(void) { int major, minor; @@ -179,6 +186,12 @@ static void __init xen_hvm_guest_init(void) { if (xen_pv_domain()) return; + /* + * We need only xenhost_r1 for HVM guests since they cannot be + * driver domain (?) or dom0. I think even HVM guests could (in theory) be driver domains. + */ + if (!xen_pvh_domain()) + xenhost_register(xenhost_r1, &xh_hvm_ops); init_hvm_pv_info(); diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index c54a493e139a..bb6e811c1525 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -36,6 +36,7 @@ #include #include +#include #include #include #include @@ -1188,6 +1189,12 @@ static void __init xen_dom0_set_legacy_features(void) x86_platform.legacy.rtc = 1; } +xenhost_ops_t xh_pv_ops = { +}; + +xenhost_ops_t xh_pv_nested_ops = { +}; + /* First C function to be called on Xen boot */ asmlinkage __visible void __init xen_start_kernel(void) { @@ -1198,6 +1205,15 @@ asmlinkage __visible void __init xen_start_kernel(void) if (!xen_start_info) return; + xenhost_register(xenhost_r1, &xh_pv_ops); + + /* + * Detect in some implementation defined manner whether this is + * nested or not. + */ + if (xen_driver_domain() && xen_nested()) + xenhost_register(xenhost_r2, &xh_pv_nested_ops); I don't think a driver domain other than dom0 "knows" this in the beginning. It will need to register xenhost_r2 Right. No point in needlessly registrating as xenhost_r2 without needing to handle any xenhost_r2 devices. in case it learns about a pv device from L0 hypervisor. What's the mechanism you are thinking of, for this? I'm guessing this PV device notification could arrive at an arbitrary point in time after the system has booted. The earlier reason for my assumption that the driver-domain would "know" this at boot, was because it seemed to me that we would need to setup hypercall/shared_info/vcpu_info. 
Given that we don't need cpuid/hypercall/shared_info, the remaining few look like they could be made dynamically callable with a bit of refactoring: - vcpu_info: the registration logic (xen_vcpu_setup() and friends) seems straight-forwardly adaptable to be called dynamically for xenhost_r2. Places where we touch the vcpu_info bits (xen_irq_ops) also seem fine. - evtchn: xenhost_r2 should only need interdomain evtchns, so should be easy to defer to until we get a xenhost_r2 device. - grant-table/xenbus: the xenhost_r2 logic (in the current patchset) expects to be inited at core_initcall and postcore_initcall respectively. Again, doesn't + xen_domain_type = XEN_PV_DOMAIN; xen_start_flags = xen_start_info->flags; diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c index 35b7599d2d0b..826c296d27a3 100644 --- a/arch/x86/xen/enlighten_pvh.c +++ b/arch/x86/xen/enlighten_pvh.c @@ -8,6 +8,7 @@ #include #include +#include #include #incl
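As an illustration of the "register xenhost_r2 lazily" direction, a sketch might look like the following; apart from xenhost_register(), xenhost_r2 and xh_pv_nested_ops (which come from the series), every name here is hypothetical.

/* Hypothetical sketch: defer nested-xenhost bring-up until the L1 guest
 * actually learns about an L0 PV device. */
static bool xh_r2_ready;

void xen_nested_device_announced(void)
{
	if (xh_r2_ready)
		return;

	xenhost_register(xenhost_r2, &xh_pv_nested_ops);

	/* the pieces discussed above, made dynamically callable */
	xenhost_r2_setup_vcpu_info();	/* hypothetical */
	xenhost_r2_init_grant_table();	/* hypothetical */
	xenhost_r2_init_xenbus();	/* hypothetical */

	xh_r2_ready = true;
}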
Re: [RFC PATCH 06/16] x86/xen: add shared_info support to xenhost_t
On 2019-06-07 8:08 a.m., Juergen Gross wrote: On 09.05.19 19:25, Ankur Arora wrote: HYPERVISOR_shared_info is used for irq/evtchn communication between the guest and the host. Abstract out the setup/reset in xenhost_t such that nested configurations can use both xenhosts simultaneously. I have mixed feelings about this patch. Most of the shared_info stuff we don't need for the nested case. In the end only the event channels might be interesting, but we obviously want them not for all vcpus of the L1 hypervisor, but for those of the current guest. Agreed about the mixed feelings part. shared_info does feel far too heavy to drag along just for the event-channel state. Infact, on thinking a bit more, a better abstraction for nested event-channels would have been as an extension to the primary xenhost's event-channel bits. (The nested upcalls also go via the primary xenhost in patch-8.) Ankur So I think just drop that patch for now. We can dig it out later in case > nesting wants it again. Juergen
Re: [RFC PATCH 00/16] xenhost support
On 2019-06-07 7:51 a.m., Juergen Gross wrote: On 09.05.19 19:25, Ankur Arora wrote: Hi all, This is an RFC for xenhost support, outlined here by Juergen here: https://lkml.org/lkml/2019/4/8/67. First: thanks for all the effort you've put into this series! The high level idea is to provide an abstraction of the Xen communication interface, as a xenhost_t. xenhost_t expose ops for communication between the guest and Xen (hypercall, cpuid, shared_info/vcpu_info, evtchn, grant-table and on top of those, xenbus, ballooning), and these can differ based on the kind of underlying Xen: regular, local, and nested. I'm not sure we need to abstract away hypercalls and cpuid. I believe in case of nested Xen all contacts to the L0 hypervisor should be done via the L1 hypervisor. So we might need to issue some kind of passthrough Yes, that does make sense. This also allows the L1 hypervisor to control which hypercalls can be nested. As for cpuid, what about nested feature discovery such as in gnttab_need_v2()? (Though for this particular case, the hypercall should be fine.) hypercall when e.g. granting a page to L0 dom0, but this should be handled via the grant abstraction (events should be similar). So IMO we should drop patches 2-5. For 3-5, I'd like to prune them to provide a limited hypercall registration ability -- this is meant to be used for the xenhost_r0/xenhost_local case. Ankur (Since this abstraction is largely about guest -- xenhost communication, no ops are needed for timer, clock, sched, memory (MMU, P2M), VCPU mgmt. etc.) Xenhost use-cases: Regular-Xen: the standard Xen interface presented to a guest, specifically for comunication between Lx-guest and Lx-Xen. Local-Xen: a Xen like interface which runs in the same address space as the guest (dom0). This, can act as the default xenhost. The major ways it differs from a regular Xen interface is in presenting a different hypercall interface (call instead of a syscall/vmcall), and in an inability to do grant-mappings: since local-Xen exists in the same address space as Xen, there's no way for it to cheaply change the physical page that a GFN maps to (assuming no P2M tables.) Nested-Xen: this channel is to Xen, one level removed: from L1-guest to L0-Xen. The use case is that we want L0-dom0-backends to talk to L1-dom0-frontend drivers which can then present PV devices which can in-turn be used by the L1-dom0-backend drivers as raw underlying devices. The interfaces themselves, broadly remain similar. Note: L0-Xen, L1-Xen represent Xen running at that nesting level and L0-guest, L1-guest represent guests that are children of Xen at that nesting level. Lx, represents any level. Patches 1-7, "x86/xen: add xenhost_t interface" "x86/xen: cpuid support in xenhost_t" "x86/xen: make hypercall_page generic" "x86/xen: hypercall support for xenhost_t" "x86/xen: add feature support in xenhost_t" "x86/xen: add shared_info support to xenhost_t" "x86/xen: make vcpu_info part of xenhost_t" abstract out interfaces that setup hypercalls/cpuid/shared_info/vcpu_info etc. Patch 8, "x86/xen: irq/upcall handling with multiple xenhosts" sets up the upcall and pv_irq ops based on vcpu_info. Patch 9, "xen/evtchn: support evtchn in xenhost_t" adds xenhost based evtchn support for evtchn_2l. Patches 10 and 16, "xen/balloon: support ballooning in xenhost_t" and "xen/grant-table: host_addr fixup in mapping on xenhost_r0" implement support from GNTTABOP_map_grant_ref for xenhosts of type xenhost_r0 (xenhost local.) 
Patch 12, "xen/xenbus: support xenbus frontend/backend with xenhost_t" makes xenbus so that both its frontend and backend can be bootstrapped separately via separate xenhosts. Remaining patches, 11, 13, 14, 15: "xen/grant-table: make grant-table xenhost aware" "drivers/xen: gnttab, evtchn, xenbus API changes" "xen/blk: gnttab, evtchn, xenbus API changes" "xen/net: gnttab, evtchn, xenbus API changes" are mostly mechanical changes for APIs that now take xenhost_t * as parameter. The code itself is RFC quality, and is mostly meant to get feedback before proceeding further. Also note that the FIFO logic and some Xen drivers (input, pciback, scsi etc) are mostly unchanged, so will not build. Please take a look. Juergen
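For readers new to the series, the abstraction the cover letter describes has roughly the shape sketched below. It is heavily abbreviated: the real xenhost_t in patch 1 carries more state, and the ops listed are only examples.

/* Abbreviated sketch; not the literal patch. */
enum xenhost_type {
	xenhost_r0,	/* local: "Xen" in the same address space as dom0 */
	xenhost_r1,	/* regular: Lx-guest to Lx-Xen */
	xenhost_r2,	/* nested: L1-guest to L0-Xen */
};

typedef struct xenhost {
	enum xenhost_type type;
	const struct xenhost_ops *ops;	/* hypercall, evtchn, gnttab, xenbus, ... */
} xenhost_t;

void xenhost_register(enum xenhost_type type, struct xenhost_ops *ops);
void __xenhost_unregister(enum xenhost_type type);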
Re: [Xen-devel] [RFC PATCH 00/16] xenhost support
On 2019-06-07 9:21 a.m., Juergen Gross wrote: On 07.06.19 17:22, Joao Martins wrote: On 6/7/19 3:51 PM, Juergen Gross wrote: On 09.05.19 19:25, Ankur Arora wrote: Hi all, This is an RFC for xenhost support, outlined here by Juergen here: https://lkml.org/lkml/2019/4/8/67. First: thanks for all the effort you've put into this series! The high level idea is to provide an abstraction of the Xen communication interface, as a xenhost_t. xenhost_t expose ops for communication between the guest and Xen (hypercall, cpuid, shared_info/vcpu_info, evtchn, grant-table and on top of those, xenbus, ballooning), and these can differ based on the kind of underlying Xen: regular, local, and nested. I'm not sure we need to abstract away hypercalls and cpuid. I believe in case of nested Xen all contacts to the L0 hypervisor should be done via the L1 hypervisor. So we might need to issue some kind of passthrough hypercall when e.g. granting a page to L0 dom0, but this should be handled via the grant abstraction (events should be similar). Just to be clear: By "kind of passthrough hypercall" you mean (e.g. for every access/modify of grant table frames) you would proxy hypercall to L0 Xen via L1 Xen? It might be possible to spare some hypercalls by directly writing to grant frames mapped into L1 dom0, but in general you are right. Wouldn't we still need map/unmap_grant_ref? AFAICS, both the xenhost_direct and the xenhost_indirect cases should be very similar (apart from the need to proxy in the indirect case.) Ankur Juergen ___ Xen-devel mailing list xen-de...@lists.xenproject.org https://lists.xenproject.org/mailman/listinfo/xen-devel
Re: cputime takes cstate into consideration
On 2019-06-26 12:23 p.m., Thomas Gleixner wrote: On Wed, 26 Jun 2019, Raslan, KarimAllah wrote: On Wed, 2019-06-26 at 10:54 -0400, Konrad Rzeszutek Wilk wrote: There were some ideas that Ankur (CC-ed) mentioned to me of using the perf counters (in the host) to sample the guest and construct a better accounting idea of what the guest does. That way the dashboard from the host would not show 100% CPU utilization. You can either use the UNHALTED cycles perf-counter or you can use MPERF/APERF MSRs for that. (sorry I got distracted and forgot to send the patch) Sure, but then you conflict with the other people who fight tooth and nail over every single performance counter. How about using Intel PT PwrEvt extensions? This should allow us to precisely track idle residency via just MWAIT and TSC packets. Should be pretty cheap too. It's post Cascade Lake though. Ankur Thanks, tglx
Re: cputime takes cstate into consideration
On 7/9/19 5:38 AM, Peter Zijlstra wrote: On Mon, Jul 08, 2019 at 07:00:08PM -0700, Ankur Arora wrote: On 2019-06-26 12:23 p.m., Thomas Gleixner wrote: On Wed, 26 Jun 2019, Raslan, KarimAllah wrote: On Wed, 2019-06-26 at 10:54 -0400, Konrad Rzeszutek Wilk wrote: There were some ideas that Ankur (CC-ed) mentioned to me of using the perf counters (in the host) to sample the guest and construct a better accounting idea of what the guest does. That way the dashboard from the host would not show 100% CPU utilization. You can either use the UNHALTED cycles perf-counter or you can use MPERF/APERF MSRs for that. (sorry I got distracted and forgot to send the patch) Sure, but then you conflict with the other people who fight tooth and nail over every single performance counter. How about using Intel PT PwrEvt extensions? This should allow us to precisely track idle residency via just MWAIT and TSC packets. Should be pretty cheap too. It's post Cascade Lake though. That would fully claim PT just for this stupid accounting thing and be completely Intel specific. Just stop this madness already. I see the point about just accruing guest time (in mwait or not) as guest CPU time. But, to take this madness a little further, I'm not sure I see why it fully claims PT. AFAICS, we should be able to enable PwrEvt and whatever else simultaneously. Ankur
[PATCH 1/2 v2] xen/acpi: Replace hard coded "ACPI0007"
Replace hard coded "ACPI0007" with ACPI_PROCESSOR_DEVICE_HID Reviewed-by: Konrad Rzeszutek Wilk Signed-off-by: Ankur Arora --- drivers/xen/xen-acpi-processor.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/xen/xen-acpi-processor.c b/drivers/xen/xen-acpi-processor.c index 4ce10bc..fac0d7b 100644 --- a/drivers/xen/xen-acpi-processor.c +++ b/drivers/xen/xen-acpi-processor.c @@ -408,7 +408,7 @@ static int check_acpi_ids(struct acpi_processor *pr_backup) acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT, ACPI_UINT32_MAX, read_acpi_id, NULL, NULL, NULL); - acpi_get_devices("ACPI0007", read_acpi_id, NULL, NULL); + acpi_get_devices(ACPI_PROCESSOR_DEVICE_HID, read_acpi_id, NULL, NULL); upload: if (!bitmap_equal(acpi_id_present, acpi_ids_done, nr_acpi_bits)) { -- 2.7.4
[PATCH 0/2 v2] xen/acpi: upload PM state from init-domain to Xen
This patch series re-enables the upload of PM data from initial-domain to Xen. This was broken in commit cd979883b9ede90643e019f33cb317933eb867b4. The upload now happens post-resume in workqueue context. From the POV of Xen, the PM upload might be delayed a little but should be fine -- Xen falls-back on more limited P and C states. Tested C-state upload via mwait_idle=0. Changes in v2: - rebased to 4.11.0-rc2 - addressed comments from Boris Ostrovsky Ankur Arora (2): xen/acpi: Replace hard coded "ACPI0007" xen/acpi: upload PM state from init-domain to Xen drivers/xen/xen-acpi-processor.c | 36 +++- 1 file changed, 27 insertions(+), 9 deletions(-) -- 2.7.4
[PATCH 2/2 v2] xen/acpi: upload PM state from init-domain to Xen
This was broken in commit cd979883b9ede90643e019f33cb317933eb867b4. do_suspend (from xen/manage.c) and thus xen_resume_notifier never get called on the initial-domain at resume (it is if running as guest.) The rationale for the breaking change was that upload_pm_data() potentially does blocking work in syscore_resume(). This patch addresses the original issue by scheduling upload_pm_data() to execute in workqueue context. Changes in v2: - rebased to 4.11.0-rc2 - addressed comments from Boris Ostrovsky Cc: Stanislaw Gruszka Cc: sta...@vger.kernel.org Based-on-patch-by: Konrad Wilk Reviewed-by: Konrad Rzeszutek Wilk Signed-off-by: Ankur Arora --- drivers/xen/xen-acpi-processor.c | 34 ++ 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/drivers/xen/xen-acpi-processor.c b/drivers/xen/xen-acpi-processor.c index fac0d7b..23e391d 100644 --- a/drivers/xen/xen-acpi-processor.c +++ b/drivers/xen/xen-acpi-processor.c @@ -27,10 +27,10 @@ #include #include #include +#include #include #include #include -#include #include #include @@ -466,15 +466,33 @@ static int xen_upload_processor_pm_data(void) return rc; } -static int xen_acpi_processor_resume(struct notifier_block *nb, -unsigned long action, void *data) +static void xen_acpi_processor_resume_worker(struct work_struct *dummy) { + int rc; + bitmap_zero(acpi_ids_done, nr_acpi_bits); - return xen_upload_processor_pm_data(); + + rc = xen_upload_processor_pm_data(); + if (rc != 0) + pr_info("ACPI data upload failed, error = %d\n", rc); +} + +static void xen_acpi_processor_resume(void) +{ + static DECLARE_WORK(wq, xen_acpi_processor_resume_worker); + + /* +* xen_upload_processor_pm_data() calls non-atomic code. +* However, the context for xen_acpi_processor_resume is syscore +* with only the boot CPU online and in an atomic context. +* +* So defer the upload for some point safer. +*/ + schedule_work(&wq); } -struct notifier_block xen_acpi_processor_resume_nb = { - .notifier_call = xen_acpi_processor_resume, +static struct syscore_ops xap_syscore_ops = { + .resume = xen_acpi_processor_resume, }; static int __init xen_acpi_processor_init(void) @@ -527,7 +545,7 @@ static int __init xen_acpi_processor_init(void) if (rc) goto err_unregister; - xen_resume_notifier_register(&xen_acpi_processor_resume_nb); + register_syscore_ops(&xap_syscore_ops); return 0; err_unregister: @@ -544,7 +562,7 @@ static void __exit xen_acpi_processor_exit(void) { int i; - xen_resume_notifier_unregister(&xen_acpi_processor_resume_nb); + unregister_syscore_ops(&xap_syscore_ops); kfree(acpi_ids_done); kfree(acpi_id_present); kfree(acpi_id_cst_present); -- 2.7.4
Re: [PATCH 2/2 v2] xen/acpi: upload PM state from init-domain to Xen
On 2017-03-22 02:05 AM, Stanislaw Gruszka wrote: On Tue, Mar 21, 2017 at 03:43:38PM -0700, Ankur Arora wrote: This was broken in commit cd979883b9ede90643e019f33cb317933eb867b4. do_suspend (from xen/manage.c) and thus xen_resume_notifier never get called on the initial-domain at resume (it is if running as guest.) The rationale for the breaking change was that upload_pm_data() potentially does blocking work in syscore_resume(). This patch addresses the original issue by scheduling upload_pm_data() to execute in workqueue context. It is ok to do upload_pm_data() with delay i.e. after some other resume actions are done and possibly xen-acpi-processor is in running state ? The state uploaded is ACPI P and C state from struct acpi_processor which AFAICS is stable once inited so a delay would not lead to invalid state. The only concern would be the ACPI pCPU hotplug logic in acpi_processor_add() which could add a new entry in per_cpu(processors) but that also looks okay because either we get a NULL or we get a pointer to an inited structure. As for the hypervisor -- that falls back to more limited state after resume (because some of this state is thrown away at suspend) and so uses that until it gets the uploaded PM state from the initial-domain. Ankur Stanislaw
[PATCH 0/8] Use uncached writes while clearing gigantic pages
This series adds clear_page_nt(), a non-temporal MOV (MOVNTI) based clear_page(). The immediate use case is to speedup creation of large (~2TB) guests VMs. Memory for these guests is allocated via huge/gigantic pages which are faulted in early. The intent behind using non-temporal writes is to minimize allocation of unnecessary cachelines. This helps in minimizing cache pollution, and potentially also speeds up zeroing of large extents. That said there are, uncached writes are not always great, as can be seen in these 'perf bench mem memset' numbers comparing clear_page_erms() and clear_page_nt(): Intel Broadwellx: x86-64-stosb (5 runs) x86-64-movnt (5 runs) speedup --- --- --- sizeBW ( pstdev) BW ( pstdev) 16MB 17.35 GB/s ( +- 9.27%)11.83 GB/s ( +- 0.19%) -31.81% 128MB 5.31 GB/s ( +- 0.13%)11.72 GB/s ( +- 0.44%)+121.84% AMD Rome: x86-64-stosq (5 runs) x86-64-movnt (5 runs) speedup --- --- --- sizeBW ( pstdev) BW ( pstdev) 16MB 15.39 GB/s ( +- 9.14%)14.56 GB/s ( +-19.43%) -5.39% 128MB 11.04 GB/s ( +- 4.87%)14.49 GB/s ( +-13.22%)+31.25% Intel Skylakex: x86-64-stosb (5 runs) x86-64-movnt (5 runs) speedup --- ------ sizeBW ( pstdev) BW ( pstdev) 16MB 20.38 GB/s ( +- 2.58%) 6.25 GB/s ( +- 0.41%) -69.28% 128MB 6.52 GB/s ( +- 0.14%) 6.31 GB/s ( +- 0.47%)-3.22% (All of the machines in these tests had a minimum of 25MB L3 cache per socket.) There are two performance issues: - uncached writes typically perform better only for region sizes sizes around or larger than ~LLC-size. - MOVNTI does not always perform well on all microarchitectures. We handle the first issue by only using clear_page_nt() for GB pages. That leaves out page zeroing for 2MB pages, which is a size that's large enough that uncached writes might have meaningful cache benefits but at the same time is small enough that uncached writes would end up being slower. We can handle a subset of the 2MB case -- mmaps with MAP_POPULATE -- by means of a uncached-or-cached hint decided based on a threshold size. This would apply to maps backed by any page-size. This case is not handled in this series -- I wanted to sanity check the high level approach before attempting that. Handle the second issue by adding a synthetic cpu-feature, X86_FEATURE_NT_GOOD which is only enabled for architectures where MOVNTI performs well. (Relatedly, I thought I had independently decided to use ALTERNATIVES to deal with this, but more likely I had just internalized it from this discussion: https://lore.kernel.org/linux-mm/20200316101856.gh11...@dhcp22.suse.cz/#t) Accordingly this series enables X86_FEATURE_NT_GOOD for Intel Broadwellx and AMD Rome. (In my testing, the performance was also good for some pre-production models but this series leaves them out.) Please review. 
Thanks Ankur Ankur Arora (8): x86/cpuid: add X86_FEATURE_NT_GOOD x86/asm: add memset_movnti() perf bench: add memset_movnti() x86/asm: add clear_page_nt() x86/clear_page: add clear_page_uncached() mm, clear_huge_page: use clear_page_uncached() for gigantic pages x86/cpu/intel: enable X86_FEATURE_NT_GOOD on Intel Broadwellx x86/cpu/amd: enable X86_FEATURE_NT_GOOD on AMD Zen arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/include/asm/page.h | 6 +++ arch/x86/include/asm/page_32.h | 9 arch/x86/include/asm/page_64.h | 15 ++ arch/x86/kernel/cpu/amd.c| 3 ++ arch/x86/kernel/cpu/intel.c | 2 + arch/x86/lib/clear_page_64.S | 26 +++ arch/x86/lib/memset_64.S | 68 include/asm-generic/page.h | 3 ++ include/linux/highmem.h | 10 mm/memory.c | 3 +- tools/arch/x86/lib/memset_64.S | 68 tools/perf/bench/mem-memset-x86-64-asm-def.h | 6 ++- 13 files changed, 158 insertions(+), 62 deletions(-) -- 2.9.3
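The MAP_POPULATE case called out above is left unhandled in this series; purely as an illustration, the size-threshold hint could be as simple as the sketch below (the threshold choice and the helper name are assumptions).

/* Illustration only -- not part of this series. Prefer uncached clearing
 * when the populated extent is around LLC size or larger. */
static bool populate_wants_uncached(unsigned long nr_pages)
{
	/* x86_cache_size is in KB */
	unsigned long llc_bytes = boot_cpu_data.x86_cache_size * 1024UL;

	return (nr_pages << PAGE_SHIFT) >= llc_bytes;
}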
[PATCH 2/8] x86/asm: add memset_movnti()
Add a MOVNTI based implementation of memset(). memset_orig() and memset_movnti() only differ in the opcode used in the inner loop, so move the memset_orig() logic into a macro, which gets expanded into memset_movq() and memset_movnti(). Signed-off-by: Ankur Arora --- arch/x86/lib/memset_64.S | 68 +++- 1 file changed, 38 insertions(+), 30 deletions(-) diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S index 9ff15ee404a4..79703cc04b6a 100644 --- a/arch/x86/lib/memset_64.S +++ b/arch/x86/lib/memset_64.S @@ -27,7 +27,7 @@ SYM_FUNC_START(__memset) * * Otherwise, use original memset function. */ - ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \ + ALTERNATIVE_2 "jmp memset_movq", "", X86_FEATURE_REP_GOOD, \ "jmp memset_erms", X86_FEATURE_ERMS movq %rdi,%r9 @@ -68,7 +68,8 @@ SYM_FUNC_START_LOCAL(memset_erms) ret SYM_FUNC_END(memset_erms) -SYM_FUNC_START_LOCAL(memset_orig) +.macro MEMSET_MOV OP fence +SYM_FUNC_START_LOCAL(memset_\OP) movq %rdi,%r10 /* expand byte value */ @@ -79,64 +80,71 @@ SYM_FUNC_START_LOCAL(memset_orig) /* align dst */ movl %edi,%r9d andl $7,%r9d - jnz .Lbad_alignment -.Lafter_bad_alignment: + jnz .Lbad_alignment_\@ +.Lafter_bad_alignment_\@: movq %rdx,%rcx shrq $6,%rcx - jz .Lhandle_tail + jz .Lhandle_tail_\@ .p2align 4 -.Lloop_64: +.Lloop_64_\@: decq %rcx - movq %rax,(%rdi) - movq %rax,8(%rdi) - movq %rax,16(%rdi) - movq %rax,24(%rdi) - movq %rax,32(%rdi) - movq %rax,40(%rdi) - movq %rax,48(%rdi) - movq %rax,56(%rdi) + \OP %rax,(%rdi) + \OP %rax,8(%rdi) + \OP %rax,16(%rdi) + \OP %rax,24(%rdi) + \OP %rax,32(%rdi) + \OP %rax,40(%rdi) + \OP %rax,48(%rdi) + \OP %rax,56(%rdi) leaq 64(%rdi),%rdi - jnz.Lloop_64 + jnz.Lloop_64_\@ /* Handle tail in loops. The loops should be faster than hard to predict jump tables. */ .p2align 4 -.Lhandle_tail: +.Lhandle_tail_\@: movl%edx,%ecx andl$63&(~7),%ecx - jz .Lhandle_7 + jz .Lhandle_7_\@ shrl$3,%ecx .p2align 4 -.Lloop_8: +.Lloop_8_\@: decl %ecx - movq %rax,(%rdi) + \OP %rax,(%rdi) leaq 8(%rdi),%rdi - jnz.Lloop_8 + jnz.Lloop_8_\@ -.Lhandle_7: +.Lhandle_7_\@: andl$7,%edx - jz .Lende + jz .Lende_\@ .p2align 4 -.Lloop_1: +.Lloop_1_\@: decl%edx movb%al,(%rdi) leaq1(%rdi),%rdi - jnz .Lloop_1 + jnz .Lloop_1_\@ -.Lende: +.Lende_\@: + .if \fence + sfence + .endif movq%r10,%rax ret -.Lbad_alignment: +.Lbad_alignment_\@: cmpq $7,%rdx - jbe .Lhandle_7 + jbe .Lhandle_7_\@ movq %rax,(%rdi)/* unaligned store */ movq $8,%r8 subq %r9,%r8 addq %r8,%rdi subq %r8,%rdx - jmp .Lafter_bad_alignment -.Lfinal: -SYM_FUNC_END(memset_orig) + jmp .Lafter_bad_alignment_\@ +.Lfinal_\@: +SYM_FUNC_END(memset_\OP) +.endm + +MEMSET_MOV OP=movq fence=0 +MEMSET_MOV OP=movnti fence=1 -- 2.9.3
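For readers who prefer C to assembly, the MOVNTI variant generated above is equivalent to the intrinsics sketch below. Illustration only, not part of the patch; it assumes an 8-byte-aligned destination and a length that is a multiple of 8.

/* Userspace illustration of MEMSET_MOV OP=movnti fence=1. */
#include <emmintrin.h>
#include <stddef.h>
#include <stdint.h>

static void memset_movnt(void *dst, uint64_t pattern, size_t len)
{
	long long *p = dst;
	size_t i;

	for (i = 0; i < len / 8; i++)
		_mm_stream_si64(p + i, (long long)pattern);	/* MOVNTI */

	_mm_sfence();	/* order the weakly-ordered NT stores */
}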
[PATCH 5/8] x86/clear_page: add clear_page_uncached()
Define clear_page_uncached() as an alternative_call() to clear_page_nt() if the CPU sets X86_FEATURE_NT_GOOD and fallback to clear_page() if it doesn't. Similarly define clear_page_uncached_flush() which provides an SFENCE if the CPU sets X86_FEATURE_NT_GOOD. Also, add the glue interface clear_user_highpage_uncached(). Signed-off-by: Ankur Arora --- arch/x86/include/asm/page.h| 6 ++ arch/x86/include/asm/page_32.h | 9 + arch/x86/include/asm/page_64.h | 14 ++ include/asm-generic/page.h | 3 +++ include/linux/highmem.h| 10 ++ 5 files changed, 42 insertions(+) diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h index 7555b48803a8..ca0aa379ac7f 100644 --- a/arch/x86/include/asm/page.h +++ b/arch/x86/include/asm/page.h @@ -28,6 +28,12 @@ static inline void clear_user_page(void *page, unsigned long vaddr, clear_page(page); } +static inline void clear_user_page_uncached(void *page, unsigned long vaddr, + struct page *pg) +{ + clear_page_uncached(page); +} + static inline void copy_user_page(void *to, void *from, unsigned long vaddr, struct page *topage) { diff --git a/arch/x86/include/asm/page_32.h b/arch/x86/include/asm/page_32.h index 94dbd51df58f..7a03a274a9a4 100644 --- a/arch/x86/include/asm/page_32.h +++ b/arch/x86/include/asm/page_32.h @@ -39,6 +39,15 @@ static inline void clear_page(void *page) memset(page, 0, PAGE_SIZE); } +static inline void clear_page_uncached(void *page) +{ + clear_page(page); +} + +static inline void clear_page_uncached_flush(void) +{ +} + static inline void copy_page(void *to, void *from) { memcpy(to, from, PAGE_SIZE); diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index bde3c2785ec4..5897075e77dd 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -55,6 +55,20 @@ static inline void clear_page(void *page) : "cc", "memory", "rax", "rcx"); } +static inline void clear_page_uncached(void *page) +{ + alternative_call(clear_page, +clear_page_nt, X86_FEATURE_NT_GOOD, +"=D" (page), +"0" (page) +: "cc", "memory", "rax", "rcx"); +} + +static inline void clear_page_uncached_flush(void) +{ + alternative("", "sfence", X86_FEATURE_NT_GOOD); +} + void copy_page(void *to, void *from); #endif /* !__ASSEMBLY__ */ diff --git a/include/asm-generic/page.h b/include/asm-generic/page.h index fe801f01625e..60235a0cf24a 100644 --- a/include/asm-generic/page.h +++ b/include/asm-generic/page.h @@ -26,6 +26,9 @@ #ifndef __ASSEMBLY__ #define clear_page(page) memset((page), 0, PAGE_SIZE) +#define clear_page_uncached(page) clear_page(page) +#define clear_page_uncached_flush()do { } while (0) + #define copy_page(to,from) memcpy((to), (from), PAGE_SIZE) #define clear_user_page(page, vaddr, pg) clear_page(page) diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 14e6202ce47f..f842593e2474 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -232,6 +232,16 @@ static inline void clear_user_highpage(struct page *page, unsigned long vaddr) } #endif +#ifndef clear_user_highpage_uncached +static inline void clear_user_highpage_uncached(struct page *page, unsigned long vaddr) +{ + void *addr = kmap_atomic(page); + + clear_user_page_uncached(addr, vaddr, page); + kunmap_atomic(addr); +} +#endif + #ifndef __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE /** * __alloc_zeroed_user_highpage - Allocate a zeroed HIGHMEM page for a VMA with caller-specified movable GFP flags -- 2.9.3
[PATCH 3/8] perf bench: add memset_movnti()
Clone memset_movnti() from arch/x86/lib/memset_64.S. perf bench mem memset on -f x86-64-movnt on Intel Broadwellx, Skylakex and AMD Rome: Intel Broadwellx: $ for i in 2 8 32 128 512; do perf bench mem memset -f x86-64-movnt -s ${i}MB done # Output pruned. # Running 'mem/memset' benchmark: # function 'x86-64-movnt' (movnt-based memset() in arch/x86/lib/memset_64.S) # Copying 2MB bytes ... 11.837121 GB/sec # Copying 8MB bytes ... 11.783560 GB/sec # Copying 32MB bytes ... 11.868591 GB/sec # Copying 128MB bytes ... 11.865211 GB/sec # Copying 512MB bytes ... 11.864085 GB/sec Intel Skylakex: $ for i in 2 8 32 128 512; do perf bench mem memset -f x86-64-movnt -s ${i}MB done # Running 'mem/memset' benchmark: # function 'x86-64-movnt' (movnt-based memset() in arch/x86/lib/memset_64.S) # Copying 2MB bytes ... 6.361971 GB/sec # Copying 8MB bytes ... 6.300403 GB/sec # Copying 32MB bytes ... 6.288992 GB/sec # Copying 128MB bytes ... 6.328793 GB/sec # Copying 512MB bytes ... 6.324471 GB/sec AMD Rome: $ for i in 2 8 32 128 512; do perf bench mem memset -f x86-64-movnt -s ${i}MB done # Running 'mem/memset' benchmark: # function 'x86-64-movnt' (movnt-based memset() in arch/x86/lib/memset_64.S) # Copying 2MB bytes ... 10.993199 GB/sec # Copying 8MB bytes ... 14.221784 GB/sec # Copying 32MB bytes ... 14.293337 GB/sec # Copying 128MB bytes ... 15.238947 GB/sec # Copying 512MB bytes ... 16.476093 GB/sec Signed-off-by: Ankur Arora --- tools/arch/x86/lib/memset_64.S | 68 tools/perf/bench/mem-memset-x86-64-asm-def.h | 6 ++- 2 files changed, 43 insertions(+), 31 deletions(-) diff --git a/tools/arch/x86/lib/memset_64.S b/tools/arch/x86/lib/memset_64.S index fd5d25a474b7..bfbf6d06f81e 100644 --- a/tools/arch/x86/lib/memset_64.S +++ b/tools/arch/x86/lib/memset_64.S @@ -26,7 +26,7 @@ SYM_FUNC_START(__memset) * * Otherwise, use original memset function. */ - ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \ + ALTERNATIVE_2 "jmp memset_movq", "", X86_FEATURE_REP_GOOD, \ "jmp memset_erms", X86_FEATURE_ERMS movq %rdi,%r9 @@ -65,7 +65,8 @@ SYM_FUNC_START(memset_erms) ret SYM_FUNC_END(memset_erms) -SYM_FUNC_START(memset_orig) +.macro MEMSET_MOV OP fence +SYM_FUNC_START(memset_\OP) movq %rdi,%r10 /* expand byte value */ @@ -76,64 +77,71 @@ SYM_FUNC_START(memset_orig) /* align dst */ movl %edi,%r9d andl $7,%r9d - jnz .Lbad_alignment -.Lafter_bad_alignment: + jnz .Lbad_alignment_\@ +.Lafter_bad_alignment_\@: movq %rdx,%rcx shrq $6,%rcx - jz .Lhandle_tail + jz .Lhandle_tail_\@ .p2align 4 -.Lloop_64: +.Lloop_64_\@: decq %rcx - movq %rax,(%rdi) - movq %rax,8(%rdi) - movq %rax,16(%rdi) - movq %rax,24(%rdi) - movq %rax,32(%rdi) - movq %rax,40(%rdi) - movq %rax,48(%rdi) - movq %rax,56(%rdi) + \OP %rax,(%rdi) + \OP %rax,8(%rdi) + \OP %rax,16(%rdi) + \OP %rax,24(%rdi) + \OP %rax,32(%rdi) + \OP %rax,40(%rdi) + \OP %rax,48(%rdi) + \OP %rax,56(%rdi) leaq 64(%rdi),%rdi - jnz.Lloop_64 + jnz.Lloop_64_\@ /* Handle tail in loops. The loops should be faster than hard to predict jump tables. 
*/ .p2align 4 -.Lhandle_tail: +.Lhandle_tail_\@: movl%edx,%ecx andl$63&(~7),%ecx - jz .Lhandle_7 + jz .Lhandle_7_\@ shrl$3,%ecx .p2align 4 -.Lloop_8: +.Lloop_8_\@: decl %ecx - movq %rax,(%rdi) + \OP %rax,(%rdi) leaq 8(%rdi),%rdi - jnz.Lloop_8 + jnz.Lloop_8_\@ -.Lhandle_7: +.Lhandle_7_\@: andl$7,%edx - jz .Lende + jz .Lende_\@ .p2align 4 -.Lloop_1: +.Lloop_1_\@: decl%edx movb%al,(%rdi) leaq1(%rdi),%rdi - jnz .Lloop_1 + jnz .Lloop_1_\@ -.Lende: +.Lende_\@: + .if \fence + sfence + .endif movq%r10,%rax ret -.Lbad_alignment: +.Lbad_alignment_\@: cmpq $7,%rdx - jbe .Lhandle_7 + jbe .Lhandle_7_\@ movq %rax,(%rdi)/* unaligned store */ movq $8,%r8 subq %r9,%r8 addq %r8,%rdi subq %r8,%rdx - jmp .Lafter_bad_alignment -.Lfinal: -SYM_FUNC_END(memset_orig) + jmp .Lafter_bad_alignment_\@ +.Lfinal_\@: +SYM_FUNC_END(memset_\OP) +.endm + +MEMSET_MOV OP=movq fence=0 +MEMSET_MOV OP=movnti fence
[PATCH 4/8] x86/asm: add clear_page_nt()
( +- 1.06% ) (23.52%) 1,966 LLC-load-misses #1.97% of all LL-cache accesses ( +- 6.17% ) (23.52%) 129 LLC-stores#0.038 K/sec ( +- 27.85% ) (11.75%) 7 LLC-store-misses #0.002 K/sec ( +- 47.82% ) (11.75%) 3.37465 +- 0.00474 seconds time elapsed ( +- 0.14% ) The L1-dcache-load-misses (L1D.REPLACEMENT) count is much lower just like the previous two cases. No performance improvement for Skylakex though. Signed-off-by: Ankur Arora --- arch/x86/include/asm/page_64.h | 1 + arch/x86/lib/clear_page_64.S | 26 ++ 2 files changed, 27 insertions(+) diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index 939b1cff4a7b..bde3c2785ec4 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -43,6 +43,7 @@ extern unsigned long __phys_addr_symbol(unsigned long); void clear_page_orig(void *page); void clear_page_rep(void *page); void clear_page_erms(void *page); +void clear_page_nt(void *page); static inline void clear_page(void *page) { diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S index c4c7dd115953..f16bb753b236 100644 --- a/arch/x86/lib/clear_page_64.S +++ b/arch/x86/lib/clear_page_64.S @@ -50,3 +50,29 @@ SYM_FUNC_START(clear_page_erms) ret SYM_FUNC_END(clear_page_erms) EXPORT_SYMBOL_GPL(clear_page_erms) + +/* + * Zero a page. + * %rdi - page + * + * Caller needs to issue a fence at the end. + */ +SYM_FUNC_START(clear_page_nt) + xorl%eax,%eax + movl$4096,%ecx + + .p2align 4 +.Lstart: +movnti %rax, 0x00(%rdi) +movnti %rax, 0x08(%rdi) +movnti %rax, 0x10(%rdi) +movnti %rax, 0x18(%rdi) +movnti %rax, 0x20(%rdi) +movnti %rax, 0x28(%rdi) +movnti %rax, 0x30(%rdi) +movnti %rax, 0x38(%rdi) +addq$0x40, %rdi +subl$0x40, %ecx +ja .Lstart + ret +SYM_FUNC_END(clear_page_nt) -- 2.9.3
[PATCH 7/8] x86/cpu/intel: enable X86_FEATURE_NT_GOOD on Intel Broadwellx
468,643 bus-cycles# 99.276 M/sec ( +- 0.01% ) (23.53%) 717,228 L1-dcache-load-misses #0.20% of all L1-dcache accesses ( +- 3.77% ) (23.53%) 351,999,535 L1-dcache-loads # 32.403 M/sec ( +- 0.04% ) (23.53%) 75,988 LLC-loads #0.007 M/sec ( +- 4.20% ) (23.53%) 24,503 LLC-load-misses # 32.25% of all LL-cache accesses ( +- 53.30% ) (23.53%) 57,283 LLC-stores#0.005 M/sec ( +- 2.15% ) (11.76%) 19,738 LLC-store-misses #0.002 M/sec ( +- 46.55% ) (11.76%) 351,836,498 dTLB-loads# 32.388 M/sec ( +- 0.04% ) (17.65%) 1,171 dTLB-load-misses #0.00% of all dTLB cache accesses ( +- 42.68% ) (23.53%) 17,385,579,725 dTLB-stores # 1600.392 M/sec ( +- 0.00% ) (23.53%) 200 dTLB-store-misses #0.018 K/sec ( +- 10.63% ) (23.53%) 10.863678 +- 0.000804 seconds time elapsed ( +- 0.01% ) L1-dcache-load-misses (L1D.REPLACEMENT) is substantially lower which suggests that, as expected, we aren't doing write-allocate or RFO. Note that the IPC and instruction counts etc are quite different, but that's just an artifact of switching from a single 'REP; STOSB' per PAGE_SIZE region to a MOVNTI loop. The page-clearing BW is substantially higher (~100% or more), so enable X86_FEATURE_NT_GOOD for Intel Broadwellx. Signed-off-by: Ankur Arora --- arch/x86/kernel/cpu/intel.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 59a1e3ce3f14..161028c1dee0 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -662,6 +662,8 @@ static void init_intel(struct cpuinfo_x86 *c) c->x86_cache_alignment = c->x86_clflush_size * 2; if (c->x86 == 6) set_cpu_cap(c, X86_FEATURE_REP_GOOD); + if (c->x86 == 6 && c->x86_model == INTEL_FAM6_BROADWELL_X) + set_cpu_cap(c, X86_FEATURE_NT_GOOD); #else /* * Names for the Pentium II/Celeron processors -- 2.9.3
[PATCH 1/8] x86/cpuid: add X86_FEATURE_NT_GOOD
Enabled on microarchitectures with performant non-temporal MOV (MOVNTI) instruction. Signed-off-by: Ankur Arora --- arch/x86/include/asm/cpufeatures.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 7b0afd5e6c57..8bae38240346 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -289,6 +289,7 @@ #define X86_FEATURE_FENCE_SWAPGS_KERNEL(11*32+ 5) /* "" LFENCE in kernel entry SWAPGS path */ #define X86_FEATURE_SPLIT_LOCK_DETECT (11*32+ 6) /* #AC for split lock */ #define X86_FEATURE_PER_THREAD_MBA (11*32+ 7) /* "" Per-thread Memory Bandwidth Allocation */ +#define X86_FEATURE_NT_GOOD(11*32+ 8) /* Non-temporal instructions perform well */ /* Intel-defined CPU features, CPUID level 0x0007:1 (EAX), word 12 */ #define X86_FEATURE_AVX512_BF16(12*32+ 5) /* AVX512 BFLOAT16 instructions */ -- 2.9.3
[PATCH 8/8] x86/cpu/amd: enable X86_FEATURE_NT_GOOD on AMD Zen
substantially lower which suggests we aren't doing write-allocate or RFO. The L1-dcache-prefetches are also substantially lower. Note that the IPC and instruction counts etc are quite different, but that's just an artifact of switching from a single 'REP; STOSQ' per PAGE_SIZE region to a MOVNTI loop. The page-clearing BW shows a ~40% improvement. Additionally, a quick 'perf bench memset' comparison on AMD Naples (AMD EPYC 7551) shows similar performance gains. So, enable X86_FEATURE_NT_GOOD for AMD Zen. Signed-off-by: Ankur Arora --- arch/x86/kernel/cpu/amd.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index dcc3d943c68f..c57eb6c28aa1 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -918,6 +918,9 @@ static void init_amd_zn(struct cpuinfo_x86 *c) { set_cpu_cap(c, X86_FEATURE_ZEN); + if (c->x86 == 0x17) + set_cpu_cap(c, X86_FEATURE_NT_GOOD); + #ifdef CONFIG_NUMA node_reclaim_distance = 32; #endif -- 2.9.3
[PATCH 6/8] mm, clear_huge_page: use clear_page_uncached() for gigantic pages
Uncached writes are suitable for circumstances where the region written to is not expected to be read again soon, or the region written to is large enough that there's no expectation that we will find the writes in the cache. Accordingly switch to using clear_page_uncached() for gigantic pages. Signed-off-by: Ankur Arora --- mm/memory.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index eeae590e526a..4d2c58f83ab1 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5092,7 +5092,7 @@ static void clear_gigantic_page(struct page *page, for (i = 0; i < pages_per_huge_page; i++, p = mem_map_next(p, page, i)) { cond_resched(); - clear_user_highpage(p, addr + i * PAGE_SIZE); + clear_user_highpage_uncached(p, addr + i * PAGE_SIZE); } } @@ -5111,6 +5111,7 @@ void clear_huge_page(struct page *page, if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { clear_gigantic_page(page, addr, pages_per_huge_page); + clear_page_uncached_flush(); return; } -- 2.9.3
Re: [PATCH 6/8] mm, clear_huge_page: use clear_page_uncached() for gigantic pages
On 2020-10-14 8:28 a.m., Ingo Molnar wrote: * Ankur Arora wrote: Uncached writes are suitable for circumstances where the region written to is not expected to be read again soon, or the region written to is large enough that there's no expectation that we will find the writes in the cache. Accordingly switch to using clear_page_uncached() for gigantic pages. Signed-off-by: Ankur Arora --- mm/memory.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index eeae590e526a..4d2c58f83ab1 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5092,7 +5092,7 @@ static void clear_gigantic_page(struct page *page, for (i = 0; i < pages_per_huge_page; i++, p = mem_map_next(p, page, i)) { cond_resched(); - clear_user_highpage(p, addr + i * PAGE_SIZE); + clear_user_highpage_uncached(p, addr + i * PAGE_SIZE); } } So this does the clearing in 4K chunks, and your measurements suggest that short memory clearing is not as efficient, right? I did not measure that separately (though I should), but the performance numbers around that were somewhat puzzling. For MOVNTI, the performance via perf bench (single call to memset_movnti()) is pretty close (within margin of error) to what we see with the page-fault workload (4K chunks in clear_page_nt().) With 'REP;STOS' though, there's degradation (~30% Broadwell, ~5% Rome) between perf bench (single call to memset_erms()) compared to the page-fault workload (4K chunks in clear_page_erms()). In the second case, we are executing a lot more 'REP;STOS' loops while the number of instructions in the first case is pretty much the same, so maybe that's what accounts for it. But I checked and we are not frontend bound. Maybe there are high setup costs for 'REP;STOS' on Broadwell? It does advertise X86_FEATURE_ERMS though... I'm wondering whether it would make sense to do 2MB chunked clearing on 64-bit CPUs, instead of 512x 4k clearing? Both 2MB and GB pages are continuous in memory, so accessible to these instructions in a single narrow loop. Yeah, I think it makes sense to do and should be quite straight-forward as well. I'll try that out. I suspect it might help the X86_FEATURE_NT_BAD models more but there's no reason why for it to hurt anywhere. Ankur Thanks, Ingo
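In case it helps the follow-up, a minimal sketch of the 2MB-chunked variant being discussed, assuming a hypothetical clear_pages_uncached() that clears an extent of pages rather than making 512 separate 4K calls (and ignoring the mem_map_next() discontiguity handling of the original for brevity):

/* Sketch only; clear_pages_uncached() is hypothetical. */
static void clear_gigantic_page_chunked(struct page *page, unsigned long addr,
					unsigned int pages_per_huge_page)
{
	unsigned int i;
	const unsigned int step = HPAGE_PMD_NR;	/* 2MB worth of base pages */

	for (i = 0; i < pages_per_huge_page; i += step) {
		cond_resched();
		clear_pages_uncached(page_address(page + i), step);
	}
	clear_page_uncached_flush();
}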
Re: [PATCH 7/8] x86/cpu/intel: enable X86_FEATURE_NT_GOOD on Intel Broadwellx
On 2020-10-14 8:31 a.m., Ingo Molnar wrote: * Ankur Arora wrote:

System:    Oracle X6-2
CPU:       2 nodes * 10 cores/node * 2 threads/core Intel Xeon E5-2630 v4 (Broadwellx, 6:79:1)
Memory:    256 GB evenly split between nodes
Microcode: 0xb2e
scaling_governor: performance
L3 size:   25MB
intel_pstate/no_turbo: 1

Performance comparison of 'perf bench mem memset -l 1' for x86-64-stosb (X86_FEATURE_ERMS) and x86-64-movnt (X86_FEATURE_NT_GOOD):

          x86-64-stosb (5 runs)      x86-64-movnt (5 runs)      speedup
          ---------------------      ---------------------      -------
  size    BW    (  pstdev)           BW    (  pstdev)
  16MB    17.35 GB/s ( +- 9.27%)     11.83 GB/s ( +- 0.19%)     -31.81%
 128MB     5.31 GB/s ( +- 0.13%)     11.72 GB/s ( +- 0.44%)    +121.84%
1024MB     5.42 GB/s ( +- 0.13%)     11.78 GB/s ( +- 0.03%)    +117.34%
4096MB     5.41 GB/s ( +- 0.41%)     11.76 GB/s ( +- 0.07%)    +117.37%

+	if (c->x86 == 6 && c->x86_model == INTEL_FAM6_BROADWELL_X)
+		set_cpu_cap(c, X86_FEATURE_NT_GOOD);

So while I agree with how you've done careful measurements to isolate bad microarchitectures where non-temporal stores are slow, I do think this approach of opt-in doesn't scale and is hard to maintain. Instead I'd suggest enabling this by default everywhere, and creating a X86_FEATURE_NT_BAD quirk table for the bad microarchitectures. Okay, some kind of quirk table is a great idea. Also means that there's a single place for keeping this rather than it being scattered all over in the code. That also simplifies my handling of features like X86_FEATURE_CLZERO. I was concerned that if you squint a bit, it seems to be an alias to X86_FEATURE_NT_GOOD and that seemed ugly. This means that with new microarchitectures we'd get automatic enablement, and hopefully chip testing would identify cases where performance isn't as good. Makes sense to me. A first class citizen, as it were... Thanks for reviewing btw. Ankur I.e. the 'trust but verify' method. Thanks, Ingo
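A quirk-table version of the opt-out Ingo describes could be shaped roughly as below. The table contents are illustrative only (Skylake-X is used as an example because it is reported elsewhere in this thread as "not good"); neither this table nor an X86_FEATURE_NT_BAD flag exists in the posted series:

/*
 * Sketch of the opt-out approach: assume non-temporal clearing is
 * good unless the microarchitecture appears in a known-bad table.
 */
#include <asm/cpu_device_id.h>
#include <asm/intel-family.h>

static const struct x86_cpu_id nt_bad_cpus[] = {
	X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, NULL),	/* example entry */
	{}
};

static void init_nt_clearing(struct cpuinfo_x86 *c)
{
	if (!x86_match_cpu(nt_bad_cpus))
		set_cpu_cap(c, X86_FEATURE_NT_GOOD);
}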
Re: [PATCH 5/8] x86/clear_page: add clear_page_uncached()
On 2020-10-14 8:45 a.m., Andy Lutomirski wrote: On Wed, Oct 14, 2020 at 1:33 AM Ankur Arora wrote: Define clear_page_uncached() as an alternative_call() to clear_page_nt() if the CPU sets X86_FEATURE_NT_GOOD and fallback to clear_page() if it doesn't. Similarly define clear_page_uncached_flush() which provides an SFENCE if the CPU sets X86_FEATURE_NT_GOOD. As long as you keep "NT" or "MOVNTI" in the names and keep functions in arch/x86, I think it's reasonable to expect that callers understand that MOVNTI has bizarre memory ordering rules. But once you give something a generic name like "clear_page_uncached" and stick it in generic code, I think the semantics should be more obvious. How about: clear_page_uncached_unordered() or clear_page_uncached_incoherent() and flush_after_clear_page_uncached() After all, a naive reader might expect "uncached" to imply "caches are off and this is coherent with everything". And the results of getting this wrong will be subtle and possibly hard-to-reproduce corruption. Yeah, these are a lot more obvious. Thanks. Will fix. Ankur --Andy
Re: [PATCH 4/8] x86/asm: add clear_page_nt()
On 2020-10-14 12:56 p.m., Borislav Petkov wrote: On Wed, Oct 14, 2020 at 01:32:55AM -0700, Ankur Arora wrote: This can potentially improve page-clearing bandwidth (see below for performance numbers for two microarchitectures where it helps and one where it doesn't) and can help indirectly by consuming less cache resources. Any performance benefits are expected for extents larger than LLC-sized or more -- when we are DRAM-BW constrained rather than cache-BW constrained. "potentially", "expected", I don't like those formulations. That's fair. The reason for those weasel words is mostly because it is microarchitecture specific. For example on Intel where I did compare across generations: I see good performance on Broadwellx, not good on Skylakex and then good again on some pre-production CPUs. Do you have some actual benchmark data where this shows any improvement and not microbenchmarks only, to warrant the additional complexity? Yes, guest creation under QEMU (pinned guests) shows similar improvements. I've posted performance numbers in patches 7, 8 with a simple page-fault test derived from that. I can add numbers from QEMU as well. Thanks, Ankur
Re: [PATCH 5/8] x86/clear_page: add clear_page_uncached()
On 2020-10-14 2:07 p.m., Andy Lutomirski wrote: On Oct 14, 2020, at 12:58 PM, Borislav Petkov wrote: On Wed, Oct 14, 2020 at 08:45:37AM -0700, Andy Lutomirski wrote: On Wed, Oct 14, 2020 at 1:33 AM Ankur Arora wrote: Define clear_page_uncached() as an alternative_call() to clear_page_nt() if the CPU sets X86_FEATURE_NT_GOOD and fallback to clear_page() if it doesn't. Similarly define clear_page_uncached_flush() which provides an SFENCE if the CPU sets X86_FEATURE_NT_GOOD. As long as you keep "NT" or "MOVNTI" in the names and keep functions in arch/x86, I think it's reasonable to expect that callers understand that MOVNTI has bizarre memory ordering rules. But once you give something a generic name like "clear_page_uncached" and stick it in generic code, I think the semantics should be more obvious. Why does it have to be a separate call? Why isn't it behind the clear_page() alternative machinery so that the proper function is selected at boot? IOW, why does a user of clear_page functionality need to know at all about an "uncached" variant? I assume it’s for a little optimization of clearing more than one page per SFENCE. In any event, based on the benchmark data upthread, we only want to do NT clears when they’re rather large, so this shouldn’t be just an alternative. I assume this is because a page or two will fit in cache and, for most uses that allocate zeroed pages, we prefer cache-hot pages. When clearing 1G, on the other hand, cache-hot is impossible and we prefer the improved bandwidth and less cache trashing of NT clears. Also, if we did extend clear_page() to take the page-size as parameter we still might not have enough information (ex. a 4K or a 2MB page that clear_page() sees could be part of a GUP of a much larger extent) to decide whether to go uncached or not. Perhaps SFENCE is so fast that this is a silly optimization, though, and we don’t lose anything measurable by SFENCEing once per page. Alas, no. I tried that and dropped about 15% performance on Rome. Thanks Ankur
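The per-extent fence Andy alludes to, and which Ankur measured as ~15% faster on Rome than fencing per page, amounts to the sketch below. The helper names follow patch 5/8 of the series; the loop itself is illustrative:

/*
 * Sketch: issue the non-temporal clears for the whole extent and
 * order them with a single SFENCE at the end, instead of fencing
 * after every page.
 */
static void clear_pages_uncached(struct page *page, unsigned int nr_pages)
{
	unsigned int i;

	for (i = 0; i < nr_pages; i++)
		clear_page_uncached(page_address(page + i));

	clear_page_uncached_flush();	/* one SFENCE for the whole extent */
}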
Re: [PATCH 5/8] x86/clear_page: add clear_page_uncached()
On 2020-10-14 2:12 p.m., Borislav Petkov wrote: On Wed, Oct 14, 2020 at 02:07:30PM -0700, Andy Lutomirski wrote: I assume it’s for a little optimization of clearing more than one page per SFENCE. In any event, based on the benchmark data upthread, we only want to do NT clears when they’re rather large, so this shouldn’t be just an alternative. I assume this is because a page or two will fit in cache and, for most uses that allocate zeroed pages, we prefer cache-hot pages. When clearing 1G, on the other hand, cache-hot is impossible and we prefer the improved bandwidth and less cache trashing of NT clears. Yeah, use case makes sense but people won't know what to use. At the time I was experimenting with this crap, I remember Linus saying that that selection should be made based on the size of the area cleared, so users should not have to know the difference. I don't disagree but I think the selection of cached/uncached route should be made where we have enough context available to be able to choose to do this. This could be for example, done in mm_populate() or gup where if say the extent is larger than LLC-size, it takes the uncached path. Which perhaps is the only sane use case I see for this. Perhaps SFENCE is so fast that this is a silly optimization, though, and we don’t lose anything measurable by SFENCEing once per page. Yes, I'd like to see real use cases showing improvement from this, not just microbenchmarks. Sure will add. Thanks Ankur
Re: [PATCH 5/8] x86/clear_page: add clear_page_uncached()
On 2020-10-15 3:35 a.m., Borislav Petkov wrote: On Wed, Oct 14, 2020 at 08:37:44PM -0700, Ankur Arora wrote: I don't disagree but I think the selection of cached/uncached route should be made where we have enough context available to be able to choose to do this. This could be for example, done in mm_populate() or gup where if say the extent is larger than LLC-size, it takes the uncached path. Are there examples where we don't know the size? The case I was thinking of was that clear_huge_page() or faultin_page() would know the size to a page unit, while the higher level function would know the whole extent and could optimize differently based on that. Thanks Ankur
Re: [PATCH 5/8] x86/clear_page: add clear_page_uncached()
On 2020-10-15 3:40 a.m., Borislav Petkov wrote: On Wed, Oct 14, 2020 at 08:21:57PM -0700, Ankur Arora wrote: Also, if we did extend clear_page() to take the page-size as parameter we still might not have enough information (ex. a 4K or a 2MB page that clear_page() sees could be part of a GUP of a much larger extent) to decide whether to go uncached or not. clear_page* assumes 4K. All of the lowlevel asm variants do. So adding the size there won't bring you a whole lot. So you'd need to devise this whole thing differently. Perhaps have a clear_pages() helper which decides based on size what to do: uncached clearing or the clear_page() as is now in a loop. I think that'll work well for GB pages, where the clear_pages() helper has enough information to make a decision. But, unless I'm missing something, I'm not sure how that would work for say, a 1GB mm_populate() using 4K pages. The clear_page() (or clear_pages()) in that case would only see the 4K size. But let me think about this more (and look at the callsites as you suggest.) Looking at the callsites would give you a better idea I'd say. Thanks, yeah that's a good idea. Let me go do that. Ankur Thx.
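One way the clear_pages() helper Boris suggests could look is sketched below. This is only a sketch: it assumes boot_cpu_data.x86_cache_size (the LLC size in KB) is a reasonable stand-in for the threshold, which is exactly the open question in this thread:

/*
 * Sketch: pick the cached or uncached clearing path based on the
 * size of the extent being cleared relative to the LLC.
 */
static void clear_pages(struct page *page, unsigned int nr_pages)
{
	unsigned long bytes = (unsigned long)nr_pages * PAGE_SIZE;
	unsigned int i;

	if (bytes > boot_cpu_data.x86_cache_size * 1024UL) {
		/* Larger than the LLC: don't bother filling the cache. */
		for (i = 0; i < nr_pages; i++)
			clear_page_uncached(page_address(page + i));
		clear_page_uncached_flush();
	} else {
		/* Small extent: prefer cache-hot pages. */
		for (i = 0; i < nr_pages; i++)
			clear_page(page_address(page + i));
	}
}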
Re: [PATCH] sched: introduce configurable delay before entering idle
On 5/14/19 6:50 AM, Marcelo Tosatti wrote: On Mon, May 13, 2019 at 05:20:37PM +0800, Wanpeng Li wrote: On Wed, 8 May 2019 at 02:57, Marcelo Tosatti wrote: Certain workloads perform poorly on KVM compared to baremetal due to baremetal's ability to perform mwait on NEED_RESCHED bit of task flags (therefore skipping the IPI). KVM supports expose mwait to the guest, if it can solve this? Regards, Wanpeng Li Unfortunately mwait in guest is not feasible (uncompatible with multiple guests). Checking whether a paravirt solution is possible. Hi Marcelo, I was also looking at making MWAIT available to guests in a safe manner: whether through emulation or a PV-MWAIT. My (unsolicited) thoughts follow. We basically want to handle this sequence: monitor(monitor_address); if (*monitor_address == base_value) mwaitx(max_delay); Emulation seems problematic because, AFAICS this would happen: guest hypervisor = monitor(monitor_address); vmexit ===>monitor(monitor_address) if (*monitor_address == base_value) mwait(); vmexit> mwait() There's a context switch back to the guest in this sequence which seems problematic. Both the AMD and Intel specs list system calls and far calls as events which would lead to the MWAIT being woken up: "Voluntary transitions due to fast system call and far calls (occurring prior to issuing MWAIT but after setting the monitor)". We could do this instead: guest hypervisor = monitor(monitor_address); vmexit ===>cache monitor_address if (*monitor_address == base_value) mwait(); vmexit> monitor(monitor_address) mwait() But, this would miss the "if (*monitor_address == base_value)" check in the host which is problematic if *monitor_address changed simultaneously when monitor was executed. (Similar problem if we cache both the monitor_address and *monitor_address.) So, AFAICS, the only thing that would work is the guest offloading the whole PV-MWAIT operation. AFAICS, that could be a paravirt operation which needs three parameters: (monitor_address, base_value, max_delay.) This would allow the guest to offload this whole operation to the host: monitor(monitor_address); if (*monitor_address == base_value) mwaitx(max_delay); I'm guessing you are thinking on similar lines? High level semantics: If the CPU doesn't have any runnable threads, then we actually do this version of PV-MWAIT -- arming a timer if necessary so we only sleep until the time-slice expires or the MWAIT max_delay does. If the CPU has any runnable threads then this could still finish its time-quanta or we could just do a schedule-out. So the semantics guaranteed to the host would be that PV-MWAIT returns after >= max_delay OR with the *monitor_address changed. Ankur
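To make the proposed offload concrete, a guest-side sketch follows. Only the (monitor_address, base_value, max_delay) triple comes from the proposal above; the hypercall name and number and the kvm_hypercall3() plumbing are assumptions, since no such interface exists:

#include <linux/kvm_para.h>

#define KVM_HC_PV_MWAIT	13	/* hypothetical hypercall number */

/*
 * Sketch: hand the whole monitor/compare/mwait sequence to the host
 * as a single paravirt call. The host side would monitor the address,
 * re-check *monitor_address == base_value, then mwait/sleep for at
 * most max_delay (or until the vCPU's time-slice expires).
 */
static inline void pv_mwait(unsigned long *monitor_address,
			    unsigned long base_value,
			    unsigned long max_delay)
{
	kvm_hypercall3(KVM_HC_PV_MWAIT, (unsigned long)monitor_address,
		       base_value, max_delay);
}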
Re: [PATCH] sched: introduce configurable delay before entering idle
On 2019-05-15 6:07 p.m., Wanpeng Li wrote: On Thu, 16 May 2019 at 02:42, Ankur Arora wrote: On 5/14/19 6:50 AM, Marcelo Tosatti wrote: On Mon, May 13, 2019 at 05:20:37PM +0800, Wanpeng Li wrote: On Wed, 8 May 2019 at 02:57, Marcelo Tosatti wrote: Certain workloads perform poorly on KVM compared to baremetal due to baremetal's ability to perform mwait on NEED_RESCHED bit of task flags (therefore skipping the IPI). KVM supports expose mwait to the guest, if it can solve this? Regards, Wanpeng Li Unfortunately mwait in guest is not feasible (uncompatible with multiple guests). Checking whether a paravirt solution is possible. Hi Marcelo, I was also looking at making MWAIT available to guests in a safe manner: whether through emulation or a PV-MWAIT. My (unsolicited) thoughts MWAIT emulation is not simple, here is a research https://www.contrib.andrew.cmu.edu/~somlo/OSXKVM/mwait.html Agreed. I had outlined my attempt to do that below and come to the conclusion that we would need a PV-MWAIT. Ankur Regards, Wanpeng Li follow. We basically want to handle this sequence: monitor(monitor_address); if (*monitor_address == base_value) mwaitx(max_delay); Emulation seems problematic because, AFAICS this would happen: guest hypervisor = monitor(monitor_address); vmexit ===>monitor(monitor_address) if (*monitor_address == base_value) mwait(); vmexit> mwait() There's a context switch back to the guest in this sequence which seems problematic. Both the AMD and Intel specs list system calls and far calls as events which would lead to the MWAIT being woken up: "Voluntary transitions due to fast system call and far calls (occurring prior to issuing MWAIT but after setting the monitor)". We could do this instead: guest hypervisor = monitor(monitor_address); vmexit ===>cache monitor_address if (*monitor_address == base_value) mwait(); vmexit> monitor(monitor_address) mwait() But, this would miss the "if (*monitor_address == base_value)" check in the host which is problematic if *monitor_address changed simultaneously when monitor was executed. (Similar problem if we cache both the monitor_address and *monitor_address.) So, AFAICS, the only thing that would work is the guest offloading the whole PV-MWAIT operation. AFAICS, that could be a paravirt operation which needs three parameters: (monitor_address, base_value, max_delay.) This would allow the guest to offload this whole operation to the host: monitor(monitor_address); if (*monitor_address == base_value) mwaitx(max_delay); I'm guessing you are thinking on similar lines? High level semantics: If the CPU doesn't have any runnable threads, then we actually do this version of PV-MWAIT -- arming a timer if necessary so we only sleep until the time-slice expires or the MWAIT max_delay does. If the CPU has any runnable threads then this could still finish its time-quanta or we could just do a schedule-out. So the semantics guaranteed to the host would be that PV-MWAIT returns after >= max_delay OR with the *monitor_address changed. Ankur
Re: [PATCH] sched: introduce configurable delay before entering idle
On 2019-05-15 1:43 p.m., Marcelo Tosatti wrote: On Wed, May 15, 2019 at 11:42:56AM -0700, Ankur Arora wrote: On 5/14/19 6:50 AM, Marcelo Tosatti wrote: On Mon, May 13, 2019 at 05:20:37PM +0800, Wanpeng Li wrote: On Wed, 8 May 2019 at 02:57, Marcelo Tosatti wrote: Certain workloads perform poorly on KVM compared to baremetal due to baremetal's ability to perform mwait on NEED_RESCHED bit of task flags (therefore skipping the IPI). KVM supports expose mwait to the guest, if it can solve this? Regards, Wanpeng Li Unfortunately mwait in guest is not feasible (uncompatible with multiple guests). Checking whether a paravirt solution is possible. Hi Ankur, Hi Marcelo, I was also looking at making MWAIT available to guests in a safe manner: whether through emulation or a PV-MWAIT. My (unsolicited) thoughts What use-case are you interested in? Currently Oracle does not make MWAIT available to guests in cloud environments. My interest is 1) allow guests to avoid the IPI and 2) allow the waiting to be in deeper C-states so that other cores could get the benefit of turbo-boost etc. We basically want to handle this sequence: monitor(monitor_address); if (*monitor_address == base_value) mwaitx(max_delay); Emulation seems problematic because, AFAICS this would happen: guest hypervisor = monitor(monitor_address); vmexit ===>monitor(monitor_address) if (*monitor_address == base_value) mwait(); vmexit> mwait() There's a context switch back to the guest in this sequence which seems problematic. Both the AMD and Intel specs list system calls and far calls as events which would lead to the MWAIT being woken up: "Voluntary transitions due to fast system call and far calls (occurring prior to issuing MWAIT but after setting the monitor)". We could do this instead: guest hypervisor = monitor(monitor_address); vmexit ===>cache monitor_address if (*monitor_address == base_value) mwait(); vmexit> monitor(monitor_address) mwait() But, this would miss the "if (*monitor_address == base_value)" check in the host which is problematic if *monitor_address changed simultaneously when monitor was executed. (Similar problem if we cache both the monitor_address and *monitor_address.) So, AFAICS, the only thing that would work is the guest offloading the whole PV-MWAIT operation. AFAICS, that could be a paravirt operation which needs three parameters: (monitor_address, base_value, max_delay.) This would allow the guest to offload this whole operation to the host: monitor(monitor_address); if (*monitor_address == base_value) mwaitx(max_delay); I'm guessing you are thinking on similar lines? Sort of: only trying to avoid the IPI to wake a remote vCPU. Problem is that MWAIT works only on a contiguous range of bits in memory (512 bits max on current CPUs). So if you execute mwait on the host on behalf of the guest, the region of memory monitored must include both host and guest bits. Yeah, an MWAITv would have come pretty handy here ;). My idea of PV-MWAIT didn't include waiting on behalf of the host. I was thinking of waiting in the host but exclusively on behalf of the guest, until the guest is woken up or when it's time-quanta expires. Waiting on behalf of both the guest and the host would clearly be better. If we can do mwait for both the guest and host (say they share a 512 bit region), then the host will need some protection from the guest. Maybe the waking guest-thread could just do a hypercall to wake up the remote vCPU? Or maybe it could poke the monitored region, but that is handled as a special page-fault? 
The hypercall-to-wake would also allow us to move guest-threads across CPUs. That said, I'm not sure how expensive either of these would be. Assuming host/guest can share a monitored region safely, the host's idle could monitor some region other than its &thread_info->flags. Maybe we could setup a mwait notifier with a percpu waiting area which could be registered by idle, guests etc. Though on second thoughts, if the remote thread will do a hypercall/page-fault then the handling could just as easily be: mark the guest's remote thread runnable and set the resched bit. High level semantics: If the CPU doesn't have any runnable threads, then we actually do this version of PV-MWAIT -- arming a timer if necessary so we only sleep until the time-slice expires or the MWAIT max_delay does. That would kill the sched_wake_idle_without_ipi optimization for the host. Ye
Re: [Xen-devel] [RFC PATCH 04/16] x86/xen: hypercall support for xenhost_t
On 2019-06-12 2:15 p.m., Andrew Cooper wrote: On 09/05/2019 18:25, Ankur Arora wrote: Allow for different hypercall implementations for different xenhost types. Nested xenhost, which has two underlying xenhosts, can use both simultaneously. The hypercall macros (HYPERVISOR_*) implicitly use the default xenhost. A new macro (hypervisor_*) takes xenhost_t * as a parameter and does the right thing. TODO: - Multicalls for now assume the default xenhost - xen_hypercall_* symbols are only generated for the default xenhost. Signed-off-by: Ankur Arora Again, what is the hypervisor nesting and/or guest layout here? Two hypervisors, L0 and L1, and the guest is a child of the L1 hypervisor but could have PV devices attached to both L0 and L1 hypervisors. I can't think of any case where a single piece of software can legitimately have two hypercall pages, because if it has one working one, it is by definition a guest, and therefore not privileged enough to use the outer one. Depending on which hypercall page is used, the hypercall would (eventually) land in the corresponding hypervisor. Juergen elsewhere pointed out proxying hypercalls is a better approach, so I'm not really considering this any more but, given this layout, and assuming that the hypercall pages could be encoded differently, would it still not work? Ankur ~Andrew
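As a rough illustration of the HYPERVISOR_*/hypervisor_* split the patch describes, a sketch follows. The xenhost_t layout and the __xenhost_hypercall() helper are assumptions for illustration, not the actual RFC code:

/*
 * Sketch: explicit-xenhost hypercall wrappers. __xenhost_hypercall()
 * is a hypothetical helper that issues the hypercall through the
 * given xenhost's hypercall page.
 */
typedef struct xenhost xenhost_t;

extern xenhost_t *xh_default;	/* the default xenhost */

long __xenhost_hypercall(xenhost_t *xh, unsigned int op,
			 unsigned long a1, unsigned long a2);

#define hypervisor_event_channel_op(xh, cmd, arg)			\
	__xenhost_hypercall(xh, __HYPERVISOR_event_channel_op,		\
			    (unsigned long)(cmd), (unsigned long)(arg))

/* Existing callers keep the implicit default xenhost. */
#define HYPERVISOR_event_channel_op(cmd, arg)				\
	hypervisor_event_channel_op(xh_default, cmd, arg)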
Re: [RFC PATCH 09/16] xen/evtchn: support evtchn in xenhost_t
On 2019-06-14 5:04 a.m., Juergen Gross wrote: On 09.05.19 19:25, Ankur Arora wrote: Largely mechanical patch that adds a new param, xenhost_t *, to the evtchn interfaces. The evtchn port, instead of being domain unique, is now scoped to xenhost_t. As part of upcall handling we now look at all the xenhosts and, for evtchn_2l, the xenhost's shared_info and vcpu_info. Other than this, event handling is largely unchanged. Note that the IPI, timer, VIRQ, FUNCTION, PMU etc. vectors remain attached to xh_default. Only interdomain evtchns are allowable as xh_remote. I'd do only the interface changes for now (including evtchn FIFO). Looking at this patch again, it seems to me that it would be best to limit the interface change (to take the xenhost_t * parameter) only to bind_interdomain_*. That also happily limits the change to the drivers/ subtree. The main difference will be how to call the hypervisor for sending an event (either direct or via a passthrough-hypercall). Yeah, though, this would depend on how the evtchns are mapped (if it's the L1-Xen which is responsible for mapping the evtchn on behalf of the L0-Xen, then notify_remote_via_evtchn() could just stay the same.) Still, I'll add a send interface (perhaps just an inline function) to the xenhost interface for this. Ankur Juergen
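Restricting the churn to bind_interdomain_* could be structured roughly as below. The _xh naming and the wrapper preserving the current signature are assumptions about how it would be done, not code from the RFC:

/*
 * Sketch: only the interdomain binding grows an explicit xenhost
 * parameter; existing callers keep today's signature via a wrapper
 * that passes xh_default.
 */
int bind_interdomain_evtchn_to_irqhandler_xh(xenhost_t *xh,
					     unsigned int remote_domain,
					     unsigned int remote_port,
					     irq_handler_t handler,
					     unsigned long irqflags,
					     const char *devname,
					     void *dev_id);

static inline int
bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain,
				      unsigned int remote_port,
				      irq_handler_t handler,
				      unsigned long irqflags,
				      const char *devname, void *dev_id)
{
	return bind_interdomain_evtchn_to_irqhandler_xh(xh_default,
							remote_domain,
							remote_port, handler,
							irqflags, devname,
							dev_id);
}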
Re: [RFC PATCH 07/16] x86/xen: make vcpu_info part of xenhost_t
On 2019-06-14 4:53 a.m., Juergen Gross wrote: On 09.05.19 19:25, Ankur Arora wrote: Abstract out xen_vcpu_id probing via (*probe_vcpu_id)(). Once that is available, the vcpu_info registration happens via the VCPUOP hypercall. Note that for the nested case, there are two vcpu_ids, and two vcpu_info areas, one each for the default xenhost and the remote xenhost. The vcpu_info is used via pv_irq_ops, and evtchn signaling. The other VCPUOP hypercalls are used for management (and scheduling) which is expected to be done purely in the default hypervisor. However, scheduling of the L1-guest does imply L0-Xen-vcpu_info switching, which might mean that the remote hypervisor needs some visibility into related events/hypercalls in the default hypervisor. Another candidate for dropping due to layering violation, I guess. Yeah, a more narrowly tailored interface, where perhaps the L1-Xen maps events for L0-Xen, makes sense. Also, just realized that given that L0-Xen has no control over scheduling of L1-Xen's guests (some of which it might want to send events to), it makes sense for L1-Xen to have some state for guest evtchns which pertain to L0-Xen. Ankur Juergen
Re: [Xen-devel] [PATCH 0/5] xen/pvh*: Support > 32 VCPUs at restore
On 2017-06-08 03:53 PM, Konrad Rzeszutek Wilk wrote: On Thu, Jun 08, 2017 at 10:28:15AM +0200, Juergen Gross wrote: On 03/06/17 02:05, Ankur Arora wrote: This patch series fixes a bunch of issues in the xen_vcpu setup logic. Simplify xen_vcpu related code: code refactoring in advance of the rest of the patch series. Support > 32 VCPUs at restore: unify all vcpu restore logic in xen_vcpu_restore() and support > 32 VCPUs for PVH*. Remove vcpu info placement from restore (!SMP): some pv_ops are marked RO after init so lets not redo xen_setup_vcpu_info_placement at restore. Handle xen_vcpu_setup() failure in hotplug: handle vcpu_info registration failures by propagating them from the cpuhp-prepare callback back up to the cpuhp logic. Handle xen_vcpu_setup() failure at boot: pull CPUs (> MAX_VIRT_CPUS) down if we fall back to xen_have_vcpu_info_placement = 0. Tested with various combinations of PV/PVHv2/PVHVM save/restore and cpu-hotadd-hotremove. Also tested by simulating failure in VCPUOP_register_vcpu_info. Please review. Just a question regarding the sequence of tags (Reviewed-by: and Signed-off-by:) in the patches: It seems a little bit odd to have the Reviewed-by: tag before the S-o-b: tag. This suggests the review was done before you wrote the patches, which is hard to believe. :-) Heh :). As Konrad surmises, I was unsure of the order and manually ordered them to comport with Linux style. (Now that I see arch/x86/xen/, I see that Xen puts them in time-order.) Happy to reorder in case of V2. Ankur That is how the Linux orders the tags, just do 'git log' and you will see that pattern >> So please reorder the tags in future patches to be in their logical sequence. While Xen orders it in the other order (SoB first, then Reviewed-by). I can fix this up in this series in case there is no need for V2. Juergen Ankur Arora (5): xen/vcpu: Simplify xen_vcpu related code xen/pvh*: Support > 32 VCPUs at domain restore xen/pv: Fix OOPS on restore for a PV, !SMP domain xen/vcpu: Handle xen_vcpu_setup() failure in hotplug xen/vcpu: Handle xen_vcpu_setup() failure at boot arch/x86/xen/enlighten.c | 154 +++ arch/x86/xen/enlighten_hvm.c | 33 -- arch/x86/xen/enlighten_pv.c | 87 +++- arch/x86/xen/smp.c | 31 + arch/x86/xen/smp.h | 2 + arch/x86/xen/smp_hvm.c | 14 +++- arch/x86/xen/smp_pv.c| 6 +- arch/x86/xen/suspend_hvm.c | 11 +--- arch/x86/xen/xen-ops.h | 3 +- include/xen/xen-ops.h| 2 + 10 files changed, 218 insertions(+), 125 deletions(-) ___ Xen-devel mailing list xen-de...@lists.xen.org https://lists.xen.org/xen-devel
[PATCH 1/5] xen/vcpu: Simplify xen_vcpu related code
Largely mechanical changes to aid unification of xen_vcpu_restore() logic for PV, PVH and PVHVM. xen_vcpu_setup(): the only change in logic is that clamp_max_cpus() is now handled inside the "if (!xen_have_vcpu_info_placement)" block. xen_vcpu_restore(): code movement from enlighten_pv.c to enlighten.c. xen_vcpu_info_reset(): pulls together all the code where xen_vcpu is set to default. Reviewed-by: Boris Ostrovsky Signed-off-by: Ankur Arora --- arch/x86/xen/enlighten.c | 101 +++ arch/x86/xen/enlighten_hvm.c | 6 +-- arch/x86/xen/enlighten_pv.c | 47 +--- arch/x86/xen/smp_hvm.c | 3 +- arch/x86/xen/xen-ops.h | 1 + 5 files changed, 89 insertions(+), 69 deletions(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index a5ffcbb20cc0..96b745e3f56c 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -106,6 +106,35 @@ int xen_cpuhp_setup(int (*cpu_up_prepare_cb)(unsigned int), return rc >= 0 ? 0 : rc; } +/* + * On restore, set the vcpu placement up again. + * If it fails, then we're in a bad state, since + * we can't back out from using it... + */ +void xen_vcpu_restore(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + bool other_cpu = (cpu != smp_processor_id()); + bool is_up = HYPERVISOR_vcpu_op(VCPUOP_is_up, xen_vcpu_nr(cpu), + NULL); + + if (other_cpu && is_up && + HYPERVISOR_vcpu_op(VCPUOP_down, xen_vcpu_nr(cpu), NULL)) + BUG(); + + xen_setup_runstate_info(cpu); + + if (xen_have_vcpu_info_placement) + xen_vcpu_setup(cpu); + + if (other_cpu && is_up && + HYPERVISOR_vcpu_op(VCPUOP_up, xen_vcpu_nr(cpu), NULL)) + BUG(); + } +} + static void clamp_max_cpus(void) { #ifdef CONFIG_SMP @@ -114,6 +143,17 @@ static void clamp_max_cpus(void) #endif } +void xen_vcpu_info_reset(int cpu) +{ + if (xen_vcpu_nr(cpu) < MAX_VIRT_CPUS) { + per_cpu(xen_vcpu, cpu) = + &HYPERVISOR_shared_info->vcpu_info[xen_vcpu_nr(cpu)]; + } else { + /* Set to NULL so that if somebody accesses it we get an OOPS */ + per_cpu(xen_vcpu, cpu) = NULL; + } +} + void xen_vcpu_setup(int cpu) { struct vcpu_register_vcpu_info info; @@ -137,40 +177,45 @@ void xen_vcpu_setup(int cpu) if (per_cpu(xen_vcpu, cpu) == &per_cpu(xen_vcpu_info, cpu)) return; } - if (xen_vcpu_nr(cpu) < MAX_VIRT_CPUS) - per_cpu(xen_vcpu, cpu) = - &HYPERVISOR_shared_info->vcpu_info[xen_vcpu_nr(cpu)]; + + xen_vcpu_info_reset(cpu); + + if (xen_have_vcpu_info_placement) { + vcpup = &per_cpu(xen_vcpu_info, cpu); + info.mfn = arbitrary_virt_to_mfn(vcpup); + info.offset = offset_in_page(vcpup); + + /* +* Check to see if the hypervisor will put the vcpu_info +* structure where we want it, which allows direct access via +* a percpu-variable. +* N.B. This hypercall can _only_ be called once per CPU. +* Subsequent calls will error out with -EINVAL. This is due to +* the fact that hypervisor has no unregister variant and this +* hypercall does not allow to over-write info.mfn and +* info.offset. +*/ + err = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, +xen_vcpu_nr(cpu), &info); + + if (err) { + pr_warn_once("register_vcpu_info failed: cpu=%d err=%d\n", +cpu, err); + xen_have_vcpu_info_placement = 0; + } else { + /* +* This cpu is using the registered vcpu info, even if +* later ones fail to. 
+*/ + per_cpu(xen_vcpu, cpu) = vcpup; + } + } if (!xen_have_vcpu_info_placement) { if (cpu >= MAX_VIRT_CPUS) clamp_max_cpus(); return; } - - vcpup = &per_cpu(xen_vcpu_info, cpu); - info.mfn = arbitrary_virt_to_mfn(vcpup); - info.offset = offset_in_page(vcpup); - - /* Check to see if the hypervisor will put the vcpu_info - structure where we want it, which allows direct access via - a percpu-variable. - N.B. This hypercall can _only_ be called once per CPU. Subsequent - calls will error out with -EINV
[PATCH 2/5] xen/pvh*: Support > 32 VCPUs at domain restore
When Xen restores a PVHVM or PVH guest, its shared_info only holds up to 32 CPUs. The hypercall VCPUOP_register_vcpu_info allows us to setup per-page areas for VCPUs. This means we can boot PVH* guests with more than 32 VCPUs. During restore the per-cpu structure is allocated freshly by the hypervisor (vcpu_info_mfn is set to INVALID_MFN) so that the newly restored guest can make a VCPUOP_register_vcpu_info hypercall. However, we end up triggering this condition in Xen: /* Run this command on yourself or on other offline VCPUS. */ if ( (v != current) && !test_bit(_VPF_down, &v->pause_flags) ) which means we are unable to setup the per-cpu VCPU structures for running VCPUS. The Linux PV code paths makes this work by iterating over cpu_possible in xen_vcpu_restore() with: 1) is target CPU up (VCPUOP_is_up hypercall?) 2) if yes, then VCPUOP_down to pause it 3) VCPUOP_register_vcpu_info 4) if it was down, then VCPUOP_up to bring it back up With Xen commit 192df6f9122d ("xen/x86: allow HVM guests to use hypercalls to bring up vCPUs") this is available for non-PV guests. As such first check if VCPUOP_is_up is actually possible before trying this dance. As most of this dance code is done already in xen_vcpu_restore() let's make it callable on PV, PVH and PVHVM. Based-on-patch-by: Konrad Wilk Reviewed-by: Boris Ostrovsky Signed-off-by: Ankur Arora --- arch/x86/xen/enlighten.c | 45 +++- arch/x86/xen/enlighten_hvm.c | 20 +++- arch/x86/xen/smp_hvm.c | 10 ++ arch/x86/xen/suspend_hvm.c | 11 +++ include/xen/xen-ops.h| 2 ++ 5 files changed, 54 insertions(+), 34 deletions(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 96b745e3f56c..276cc21619ec 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -106,6 +106,21 @@ int xen_cpuhp_setup(int (*cpu_up_prepare_cb)(unsigned int), return rc >= 0 ? 0 : rc; } +static void xen_vcpu_setup_restore(int cpu) +{ + /* Any per_cpu(xen_vcpu) is stale, so reset it */ + xen_vcpu_info_reset(cpu); + + /* +* For PVH and PVHVM, setup online VCPUs only. The rest will +* be handled by hotplug. +*/ + if (xen_pv_domain() || + (xen_hvm_domain() && cpu_online(cpu))) { + xen_vcpu_setup(cpu); + } +} + /* * On restore, set the vcpu placement up again. * If it fails, then we're in a bad state, since @@ -117,17 +132,23 @@ void xen_vcpu_restore(void) for_each_possible_cpu(cpu) { bool other_cpu = (cpu != smp_processor_id()); - bool is_up = HYPERVISOR_vcpu_op(VCPUOP_is_up, xen_vcpu_nr(cpu), - NULL); + bool is_up; + + if (xen_vcpu_nr(cpu) == XEN_VCPU_ID_INVALID) + continue; + + /* Only Xen 4.5 and higher support this. */ + is_up = HYPERVISOR_vcpu_op(VCPUOP_is_up, + xen_vcpu_nr(cpu), NULL) > 0; if (other_cpu && is_up && HYPERVISOR_vcpu_op(VCPUOP_down, xen_vcpu_nr(cpu), NULL)) BUG(); - xen_setup_runstate_info(cpu); + if (xen_pv_domain() || xen_feature(XENFEAT_hvm_safe_pvclock)) + xen_setup_runstate_info(cpu); - if (xen_have_vcpu_info_placement) - xen_vcpu_setup(cpu); + xen_vcpu_setup_restore(cpu); if (other_cpu && is_up && HYPERVISOR_vcpu_op(VCPUOP_up, xen_vcpu_nr(cpu), NULL)) @@ -163,11 +184,11 @@ void xen_vcpu_setup(int cpu) BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); /* -* This path is called twice on PVHVM - first during bootup via -* smp_init -> xen_hvm_cpu_notify, and then if the VCPU is being -* hotplugged: cpu_up -> xen_hvm_cpu_notify. -* As we can only do the VCPUOP_register_vcpu_info once lets -* not over-write its result. 
+* This path is called on PVHVM at bootup (xen_hvm_smp_prepare_boot_cpu) +* and at restore (xen_vcpu_restore). Also called for hotplugged +* VCPUs (cpu_init -> xen_hvm_cpu_prepare_hvm). +* However, the hypercall can only be done once (see below) so if a VCPU +* is offlined and comes back online then let's not redo the hypercall. * * For PV it is called during restore (xen_vcpu_restore) and bootup * (xen_setup_vcpu_info_placement). The hotplug mechanism does not @@ -178,8 +199,6 @@ void xen_vcpu_setup(int cpu) return; } - xen_vcpu_info_reset(cpu); - if (xen_have_vcpu_info_placement) { vcpup = &per_cpu(xen_vcpu_info, cpu);
[PATCH 4/5] xen/vcpu: Handle xen_vcpu_setup() failure in hotplug
The hypercall VCPUOP_register_vcpu_info can fail. This failure is handled by making per_cpu(xen_vcpu, cpu) point to its shared_info slot and those without one (cpu >= MAX_VIRT_CPUS) be NULL. For PVH/PVHVM, this is not enough, because we also need to pull these VCPUs out of circulation. Fix for PVH/PVHVM: on registration failure in the cpuhp prepare callback (xen_cpu_up_prepare_hvm()), return an error to the cpuhp state-machine so it can fail the CPU init. Fix for PV: the registration happens before smp_init(), so, in the failure case we clamp setup_max_cpus and limit the number of VCPUs that smp_init() will bring-up to MAX_VIRT_CPUS. This is functionally correct but it makes the code a bit simpler if we get rid of this explicit clamping: for VCPUs that don't have valid xen_vcpu, fail the CPU init in the cpuhp prepare callback (xen_cpu_up_prepare_pv()). Reviewed-by: Boris Ostrovsky Signed-off-by: Ankur Arora --- arch/x86/xen/enlighten.c | 46 +--- arch/x86/xen/enlighten_hvm.c | 9 + arch/x86/xen/enlighten_pv.c | 14 +- arch/x86/xen/xen-ops.h | 2 +- 4 files changed, 45 insertions(+), 26 deletions(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 276cc21619ec..0e7ef69e8531 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -106,8 +106,10 @@ int xen_cpuhp_setup(int (*cpu_up_prepare_cb)(unsigned int), return rc >= 0 ? 0 : rc; } -static void xen_vcpu_setup_restore(int cpu) +static int xen_vcpu_setup_restore(int cpu) { + int rc = 0; + /* Any per_cpu(xen_vcpu) is stale, so reset it */ xen_vcpu_info_reset(cpu); @@ -117,8 +119,10 @@ static void xen_vcpu_setup_restore(int cpu) */ if (xen_pv_domain() || (xen_hvm_domain() && cpu_online(cpu))) { - xen_vcpu_setup(cpu); + rc = xen_vcpu_setup(cpu); } + + return rc; } /* @@ -128,7 +132,7 @@ static void xen_vcpu_setup_restore(int cpu) */ void xen_vcpu_restore(void) { - int cpu; + int cpu, rc; for_each_possible_cpu(cpu) { bool other_cpu = (cpu != smp_processor_id()); @@ -148,22 +152,25 @@ void xen_vcpu_restore(void) if (xen_pv_domain() || xen_feature(XENFEAT_hvm_safe_pvclock)) xen_setup_runstate_info(cpu); - xen_vcpu_setup_restore(cpu); - - if (other_cpu && is_up && + rc = xen_vcpu_setup_restore(cpu); + if (rc) + pr_emerg_once("vcpu restore failed for cpu=%d err=%d. " + "System will hang.\n", cpu, rc); + /* +* In case xen_vcpu_setup_restore() fails, do not bring up the +* VCPU. This helps us avoid the resulting OOPS when the VCPU +* accesses pvclock_vcpu_time via xen_vcpu (which is NULL.) +* Note that this does not improve the situation much -- now the +* VM hangs instead of OOPSing -- with the VCPUs that did not +* fail, spinning in stop_machine(), waiting for the failed +* VCPUs to come up. 
+*/ + if (other_cpu && is_up && (rc == 0) && HYPERVISOR_vcpu_op(VCPUOP_up, xen_vcpu_nr(cpu), NULL)) BUG(); } } -static void clamp_max_cpus(void) -{ -#ifdef CONFIG_SMP - if (setup_max_cpus > MAX_VIRT_CPUS) - setup_max_cpus = MAX_VIRT_CPUS; -#endif -} - void xen_vcpu_info_reset(int cpu) { if (xen_vcpu_nr(cpu) < MAX_VIRT_CPUS) { @@ -175,7 +182,7 @@ void xen_vcpu_info_reset(int cpu) } } -void xen_vcpu_setup(int cpu) +int xen_vcpu_setup(int cpu) { struct vcpu_register_vcpu_info info; int err; @@ -196,7 +203,7 @@ void xen_vcpu_setup(int cpu) */ if (xen_hvm_domain()) { if (per_cpu(xen_vcpu, cpu) == &per_cpu(xen_vcpu_info, cpu)) - return; + return 0; } if (xen_have_vcpu_info_placement) { @@ -230,11 +237,10 @@ void xen_vcpu_setup(int cpu) } } - if (!xen_have_vcpu_info_placement) { - if (cpu >= MAX_VIRT_CPUS) - clamp_max_cpus(); + if (!xen_have_vcpu_info_placement) xen_vcpu_info_reset(cpu); - } + + return ((per_cpu(xen_vcpu, cpu) == NULL) ? -ENODEV : 0); } void xen_reboot(int reason) diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index ba1afadb2512..13b5fa1a211c 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -89,7 +89,7 @@ static void xen_hvm_crash_shutdown(struct pt_regs *regs) static int xen_cpu_up_prepare_hvm(unsigned int cpu) { - int rc;
[PATCH 5/5] xen/vcpu: Handle xen_vcpu_setup() failure at boot
On PVH, PVHVM, at failure in the VCPUOP_register_vcpu_info hypercall we limit the number of cpus to to MAX_VIRT_CPUS. However, if this failure had occurred for a cpu beyond MAX_VIRT_CPUS, we continue to function with > MAX_VIRT_CPUS. This leads to problems at the next save/restore cycle when there are > MAX_VIRT_CPUS threads going into stop_machine() but coming back up there's valid state for only the first MAX_VIRT_CPUS. This patch pulls the excess CPUs down via cpu_down(). Reviewed-by: Boris Ostrovsky Signed-off-by: Ankur Arora --- arch/x86/xen/smp.c | 31 +++ arch/x86/xen/smp.h | 2 ++ arch/x86/xen/smp_hvm.c | 1 + arch/x86/xen/smp_pv.c | 6 +- 4 files changed, 35 insertions(+), 5 deletions(-) diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 82ac611f2fc1..e7f02eb73727 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -114,6 +115,36 @@ int xen_smp_intr_init(unsigned int cpu) return rc; } +void __init xen_smp_cpus_done(unsigned int max_cpus) +{ + int cpu, rc, count = 0; + + if (xen_hvm_domain()) + native_smp_cpus_done(max_cpus); + + if (xen_have_vcpu_info_placement) + return; + + for_each_online_cpu(cpu) { + if (xen_vcpu_nr(cpu) < MAX_VIRT_CPUS) + continue; + + rc = cpu_down(cpu); + + if (rc == 0) { + /* +* Reset vcpu_info so this cpu cannot be onlined again. +*/ + xen_vcpu_info_reset(cpu); + count++; + } else { + pr_warn("%s: failed to bring CPU %d down, error %d\n", + __func__, cpu, rc); + } + } + WARN(count, "%s: brought %d CPUs offline\n", __func__, count); +} + void xen_smp_send_reschedule(int cpu) { xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR); diff --git a/arch/x86/xen/smp.h b/arch/x86/xen/smp.h index 8ebb6acca64a..87d3c76cba37 100644 --- a/arch/x86/xen/smp.h +++ b/arch/x86/xen/smp.h @@ -14,6 +14,8 @@ extern void xen_smp_intr_free(unsigned int cpu); int xen_smp_intr_init_pv(unsigned int cpu); void xen_smp_intr_free_pv(unsigned int cpu); +void xen_smp_cpus_done(unsigned int max_cpus); + void xen_smp_send_reschedule(int cpu); void xen_smp_send_call_function_ipi(const struct cpumask *mask); void xen_smp_send_call_function_single_ipi(int cpu); diff --git a/arch/x86/xen/smp_hvm.c b/arch/x86/xen/smp_hvm.c index 6c8a805819ff..fd60abedf658 100644 --- a/arch/x86/xen/smp_hvm.c +++ b/arch/x86/xen/smp_hvm.c @@ -71,4 +71,5 @@ void __init xen_hvm_smp_init(void) smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi; smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi; smp_ops.smp_prepare_boot_cpu = xen_hvm_smp_prepare_boot_cpu; + smp_ops.smp_cpus_done = xen_smp_cpus_done; } diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c index aae32535f4ec..1ea598e5f030 100644 --- a/arch/x86/xen/smp_pv.c +++ b/arch/x86/xen/smp_pv.c @@ -371,10 +371,6 @@ static int xen_pv_cpu_up(unsigned int cpu, struct task_struct *idle) return 0; } -static void xen_pv_smp_cpus_done(unsigned int max_cpus) -{ -} - #ifdef CONFIG_HOTPLUG_CPU static int xen_pv_cpu_disable(void) { @@ -469,7 +465,7 @@ static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id) static const struct smp_ops xen_smp_ops __initconst = { .smp_prepare_boot_cpu = xen_pv_smp_prepare_boot_cpu, .smp_prepare_cpus = xen_pv_smp_prepare_cpus, - .smp_cpus_done = xen_pv_smp_cpus_done, + .smp_cpus_done = xen_smp_cpus_done, .cpu_up = xen_pv_cpu_up, .cpu_die = xen_pv_cpu_die, -- 2.7.4
[PATCH 0/5] xen/pvh*: Support > 32 VCPUs at restore
This patch series fixes a bunch of issues in the xen_vcpu setup logic. Simplify xen_vcpu related code: code refactoring in advance of the rest of the patch series. Support > 32 VCPUs at restore: unify all vcpu restore logic in xen_vcpu_restore() and support > 32 VCPUs for PVH*. Remove vcpu info placement from restore (!SMP): some pv_ops are marked RO after init so lets not redo xen_setup_vcpu_info_placement at restore. Handle xen_vcpu_setup() failure in hotplug: handle vcpu_info registration failures by propagating them from the cpuhp-prepare callback back up to the cpuhp logic. Handle xen_vcpu_setup() failure at boot: pull CPUs (> MAX_VIRT_CPUS) down if we fall back to xen_have_vcpu_info_placement = 0. Tested with various combinations of PV/PVHv2/PVHVM save/restore and cpu-hotadd-hotremove. Also tested by simulating failure in VCPUOP_register_vcpu_info. Please review. Ankur Arora (5): xen/vcpu: Simplify xen_vcpu related code xen/pvh*: Support > 32 VCPUs at domain restore xen/pv: Fix OOPS on restore for a PV, !SMP domain xen/vcpu: Handle xen_vcpu_setup() failure in hotplug xen/vcpu: Handle xen_vcpu_setup() failure at boot arch/x86/xen/enlighten.c | 154 +++ arch/x86/xen/enlighten_hvm.c | 33 -- arch/x86/xen/enlighten_pv.c | 87 +++- arch/x86/xen/smp.c | 31 + arch/x86/xen/smp.h | 2 + arch/x86/xen/smp_hvm.c | 14 +++- arch/x86/xen/smp_pv.c| 6 +- arch/x86/xen/suspend_hvm.c | 11 +--- arch/x86/xen/xen-ops.h | 3 +- include/xen/xen-ops.h| 2 + 10 files changed, 218 insertions(+), 125 deletions(-) -- 2.7.4
[PATCH 3/5] xen/pv: Fix OOPS on restore for a PV, !SMP domain
If CONFIG_SMP is disabled, xen_setup_vcpu_info_placement() is called from xen_setup_shared_info(). This is fine as far as boot goes, but it means that we also call it in the restore path. This results in an OOPS because we assign to pv_mmu_ops.read_cr2 which is __ro_after_init. Also, though less problematically, this means we call xen_vcpu_setup() twice at restore -- once from the vcpu info placement call and the second time from xen_vcpu_restore(). Fix by calling xen_setup_vcpu_info_placement() at boot only. Reviewed-by: Boris Ostrovsky Signed-off-by: Ankur Arora --- arch/x86/xen/enlighten_pv.c | 26 +++--- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index f51e48299692..29cad193db53 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -938,23 +938,27 @@ void xen_setup_shared_info(void) HYPERVISOR_shared_info = (struct shared_info *)__va(xen_start_info->shared_info); -#ifndef CONFIG_SMP - /* In UP this is as good a place as any to set up shared info */ - xen_setup_vcpu_info_placement(); -#endif - xen_setup_mfn_list_list(); - /* -* Now that shared info is set up we can start using routines that -* point to pvclock area. -*/ - if (system_state == SYSTEM_BOOTING) + if (system_state == SYSTEM_BOOTING) { +#ifndef CONFIG_SMP + /* +* In UP this is as good a place as any to set up shared info. +* Limit this to boot only, at restore vcpu setup is done via +* xen_vcpu_restore(). +*/ + xen_setup_vcpu_info_placement(); +#endif + /* +* Now that shared info is set up we can start using routines +* that point to pvclock area. +*/ xen_init_time_ops(); + } } /* This is called once we have the cpu_possible_mask */ -void xen_setup_vcpu_info_placement(void) +void __ref xen_setup_vcpu_info_placement(void) { int cpu; -- 2.7.4