Thanks for the review, Marc! Sorry for the late reply; some of your
questions took me some time to research and think through.
On Fri, May 16, 2025 at 8:20 AM Marc Zyngier <m...@kernel.org> wrote:
>
> On Mon, 05 May 2025 17:14:07 +0100,
> Jiaqi Yan <jiaqi...@google.com> wrote:
> >
> > When APEI fails to handle a stage2 abort that is synchronous external
> > abort (SEA),
>
> nit: "a stage-2 synchronous external abort".

Will fix in V2.

> > today KVM directly injects an async SError to the VCPU
> > then resumes it, which usually results in unpleasant guest kernel panic.
>
> Which is a perfectly legal thing to do, and not a violation of the
> architecture.

Absolutely. This commit is just an alternative (and hopefully better)
option to directly injecting an SError.

> > One major situation of guest SEA is when vCPU consumes recoverable
> > uncorrected memory error (UER). Although SError and guest kernel panic
> > effectively stops the propagation of corrupted memory, there is still
> > room to recover from memory UER in a more graceful manner.
>
> "there is room to recover from an UER..."

Will reword in V2.

> > Alternatively KVM can redirect the synchronous SEA event to VMM to
> > - Reduce blast radius if possible. VMM can inject a SEA to VCPU via
> >   KVM's existing KVM_SET_VCPU_EVENTS API. If the memory poison
> >   consumption or fault is not from guest kernel, blast radius can be
> >   limited to the triggering thread in guest userspace, so VM can
> >   keep running.
> > - VMM can protect from future memory poison consumption by unmapping
> >   the page from stage-2 with KVM userfault [1]. VMM can also
> >   track SEA events that VM customer cares about, restart VM when
> >   certain number of distinct poison events happened, provide
> >   observability to customers [2].
> >
> > Introduce following userspace-visible features to make VMM handle SEA:
> > - KVM_CAP_ARM_SEA_TO_USER. As the alternative fallback behavior
> >   when host APEI fails to claim a SEA, userspace can opt in this new
> >   capability to let KVM exit to userspace during synchronous abort.
> > - KVM_EXIT_ARM_SEA. A new exit reason is introduced for this, and
> >   KVM fills kvm_run.arm_sea with as much as possible information about
> >   the SEA, including
> >   - ESR_EL2.
> >   - If faulting guest virtual and physical addresses are available.
> >   - Faulting guest virtual address if available.
> >   - Faulting guest physical address if available.
> >
> > [1] https://lpc.events/event/18/contributions/1757/attachments/1442/3073/LPC_%20KVM%20Userfault.pdf
> > [2] https://cloud.google.com/solutions/sap/docs/manage-host-errors
>
> I really don't think we need these link in a commit message (they are
> likely to vanish, specially the second one). Either the information is
> pertinent and it needs to be added to the commit message, or removed
> altogether. I don't think the SAP thing makes much sense as is.

Will reword in V2 with links removed.
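By the way, to make the VMM-side flow from the commit message a bit
more concrete, below is a rough sketch of the userspace handler I have
in mind. It is illustrative only and not part of this patch: it assumes
uapi headers with this series applied (KVM_EXIT_ARM_SEA and the arm_sea
fields), and it simply reflects the abort back into the guest using the
existing KVM_SET_VCPU_EVENTS ext_dabt injection. The VMM is assumed to
have opted in beforehand via KVM_ENABLE_CAP(KVM_CAP_ARM_SEA_TO_USER) on
the VM fd.

#include <sys/ioctl.h>
#include <linux/kvm.h>

/*
 * Sketch: called by the VMM's run loop when run->exit_reason is
 * KVM_EXIT_ARM_SEA. The GPA_VALID branch is where a VMM could track
 * or unmap the poisoned page; the rest injects an external data abort
 * back into the vCPU via the existing KVM_SET_VCPU_EVENTS API.
 */
static int handle_arm_sea(int vcpu_fd, struct kvm_run *run)
{
        struct kvm_vcpu_events events = { 0 };

        if (run->arm_sea.flags & KVM_EXIT_ARM_SEA_FLAG_GPA_VALID) {
                /* e.g. log run->arm_sea.gpa, unmap it from stage-2, ... */
        }

        /* Data-abort case only; instruction aborts would need more care. */
        events.exception.ext_dabt_pending = 1;
        return ioctl(vcpu_fd, KVM_SET_VCPU_EVENTS, &events);
}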
> >
> > Signed-off-by: Jiaqi Yan <jiaqi...@google.com>
> > ---
> >  arch/arm64/include/asm/kvm_emulate.h | 12 +++++++
> >  arch/arm64/include/asm/kvm_host.h    |  8 +++++
> >  arch/arm64/include/asm/kvm_ras.h     | 21 ++++-------
> >  arch/arm64/kvm/Makefile              |  3 +-
> >  arch/arm64/kvm/arm.c                 |  5 +++
> >  arch/arm64/kvm/kvm_ras.c             | 54 ++++++++++++++++++++++++++++
> >  arch/arm64/kvm/mmu.c                 | 12 ++-----
> >  include/uapi/linux/kvm.h             | 11 ++++++
> >  8 files changed, 101 insertions(+), 25 deletions(-)
> >  create mode 100644 arch/arm64/kvm/kvm_ras.c
> >
> > diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
> > index bd020fc28aa9c..a9de30478a088 100644
> > --- a/arch/arm64/include/asm/kvm_emulate.h
> > +++ b/arch/arm64/include/asm/kvm_emulate.h
> > @@ -429,6 +429,18 @@ static __always_inline bool kvm_vcpu_abt_issea(const struct kvm_vcpu *vcpu)
> >          }
> >  }
> >
> > +/* Return true if FAR holds valid faulting guest virtual address. */
> > +static inline bool kvm_vcpu_sea_far_valid(const struct kvm_vcpu *vcpu)
> > +{
> > +        return !(kvm_vcpu_get_esr(vcpu) & ESR_ELx_FnV);
> > +}
> > +
> > +/* Return true if HPFAR_EL2 holds valid faulting guest physical address. */
> > +static inline bool kvm_vcpu_sea_ipa_valid(const struct kvm_vcpu *vcpu)
> > +{
> > +        return vcpu->arch.fault.hpfar_el2 & HPFAR_EL2_NS;
> > +}
> > +
> >  static __always_inline int kvm_vcpu_sys_get_rt(struct kvm_vcpu *vcpu)
> >  {
> >          u64 esr = kvm_vcpu_get_esr(vcpu);
> > diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> > index 73b7762b0e7d1..e0129f9799f80 100644
> > --- a/arch/arm64/include/asm/kvm_host.h
> > +++ b/arch/arm64/include/asm/kvm_host.h
> > @@ -342,6 +342,14 @@ struct kvm_arch {
> >  #define KVM_ARCH_FLAG_GUEST_HAS_SVE        9
> >          /* MIDR_EL1, REVIDR_EL1, and AIDR_EL1 are writable from userspace */
> >  #define KVM_ARCH_FLAG_WRITABLE_IMP_ID_REGS 10
> > +        /*
> > +         * When APEI failed to claim stage-2 synchronous external abort
> > +         * (SEA) return to userspace with fault information. Userspace
> > +         * can opt in this feature if KVM_CAP_ARM_SEA_TO_USER is
> > +         * supported. Userspace is encouraged to handle this VM exit
> > +         * by injecting a SEA to VCPU before resume the VCPU.
> > +         */
> > +#define KVM_ARCH_FLAG_RETURN_SEA_TO_USER   11
> >          unsigned long flags;
> >
> >          /* VM-wide vCPU feature set */
> > diff --git a/arch/arm64/include/asm/kvm_ras.h b/arch/arm64/include/asm/kvm_ras.h
> > index 9398ade632aaf..a2fd91af8f97e 100644
> > --- a/arch/arm64/include/asm/kvm_ras.h
> > +++ b/arch/arm64/include/asm/kvm_ras.h
> > @@ -4,22 +4,15 @@
> >  #ifndef __ARM64_KVM_RAS_H__
> >  #define __ARM64_KVM_RAS_H__
> >
> > -#include <linux/acpi.h>
> > -#include <linux/errno.h>
> > -#include <linux/types.h>
> > -
> > -#include <asm/acpi.h>
> > +#include <linux/kvm_host.h>
> >
> >  /*
> > - * Was this synchronous external abort a RAS notification?
> > - * Returns '0' for errors handled by some RAS subsystem, or -ENOENT.
> > + * Handle stage2 synchronous external abort (SEA) in the following order:
> > + * 1. Delegate to APEI/GHES and if they can claim SEA, resume guest.
> > + * 2. If userspace opt-ed in KVM_CAP_ARM_SEA_TO_USER, exit to userspace
> > + *    with details about the SEA.
> > + * 3. Otherwise, inject async SError into the VCPU and resume guest.
> >   */
> > -static inline int kvm_handle_guest_sea(void)
> > -{
> > -        /* apei_claim_sea(NULL) expects to mask interrupts itself */
> > -        lockdep_assert_irqs_enabled();
> > -
> > -        return apei_claim_sea(NULL);
> > -}
> > +int kvm_handle_guest_sea(struct kvm_vcpu *vcpu);
> >
> >  #endif /* __ARM64_KVM_RAS_H__ */
> > diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
> > index 209bc76263f10..785d568411e88 100644
> > --- a/arch/arm64/kvm/Makefile
> > +++ b/arch/arm64/kvm/Makefile
> > @@ -23,7 +23,8 @@ kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o \
> >           vgic/vgic-v3.o vgic/vgic-v4.o \
> >           vgic/vgic-mmio.o vgic/vgic-mmio-v2.o \
> >           vgic/vgic-mmio-v3.o vgic/vgic-kvm-device.o \
> > -         vgic/vgic-its.o vgic/vgic-debug.o vgic/vgic-v3-nested.o
> > +         vgic/vgic-its.o vgic/vgic-debug.o vgic/vgic-v3-nested.o \
> > +         kvm_ras.o
> >
> >  kvm-$(CONFIG_HW_PERF_EVENTS) += pmu-emul.o pmu.o
> >  kvm-$(CONFIG_ARM64_PTR_AUTH) += pauth.o
> > diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
> > index 19ca57def6292..47544945fba45 100644
> > --- a/arch/arm64/kvm/arm.c
> > +++ b/arch/arm64/kvm/arm.c
> > @@ -133,6 +133,10 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
> >                  }
> >                  mutex_unlock(&kvm->lock);
> >                  break;
> > +        case KVM_CAP_ARM_SEA_TO_USER:
> > +                r = 0;
> > +                set_bit(KVM_ARCH_FLAG_RETURN_SEA_TO_USER, &kvm->arch.flags);
> > +                break;
> >          default:
> >                  break;
> >          }
> > @@ -322,6 +326,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
> >          case KVM_CAP_IRQFD_RESAMPLE:
> >          case KVM_CAP_COUNTER_OFFSET:
> >          case KVM_CAP_ARM_WRITABLE_IMP_ID_REGS:
> > +        case KVM_CAP_ARM_SEA_TO_USER:
> >                  r = 1;
> >                  break;
> >          case KVM_CAP_SET_GUEST_DEBUG2:
> > diff --git a/arch/arm64/kvm/kvm_ras.c b/arch/arm64/kvm/kvm_ras.c
> > new file mode 100644
> > index 0000000000000..83f2731c95d77
> > --- /dev/null
> > +++ b/arch/arm64/kvm/kvm_ras.c
>
> I don't think we need a new file for so little code. Please leave it
> with the MMU code for now.

Agreed, will do in V2.

> > @@ -0,0 +1,54 @@
> > +// SPDX-License-Identifier: GPL-2.0-only
> > +
> > +#include <linux/acpi.h>
> > +#include <linux/types.h>
> > +#include <asm/acpi.h>
> > +#include <asm/kvm_emulate.h>
> > +#include <asm/kvm_ras.h>
> > +#include <asm/system_misc.h>
> > +
> > +/*
> > + * Was this synchronous external abort a RAS notification?
> > + * Returns 0 for errors handled by some RAS subsystem, or -ENOENT.
> > + */
> > +static int kvm_delegate_guest_sea(void)
> > +{
> > +        /* apei_claim_sea(NULL) expects to mask interrupts itself. */
> > +        lockdep_assert_irqs_enabled();
> > +        return apei_claim_sea(NULL);
> > +}
> > +
> > +int kvm_handle_guest_sea(struct kvm_vcpu *vcpu)
> > +{
> > +        struct kvm_run *run = vcpu->run;
> > +        bool exit = test_bit(KVM_ARCH_FLAG_RETURN_SEA_TO_USER,
> > +                             &vcpu->kvm->arch.flags);
> > +
> > +        /* For RAS the host kernel may handle this abort. */
> > +        if (kvm_delegate_guest_sea() == 0)
> > +                return 1;
> > +
> > +        if (!exit) {
>
> Move the condition here, since nothing else evaluates "exit".

Will do in V2.

> > +                /* Fallback behavior prior to KVM_EXIT_ARM_SEA. */
> > +                kvm_inject_vabt(vcpu);
> > +                return 1;
> > +        }
> > +
> > +        run->exit_reason = KVM_EXIT_ARM_SEA;
> > +        run->arm_sea.esr = kvm_vcpu_get_esr(vcpu);
>
> Do we *always* want to report this without any sanitisation? ESR_EL2
> contains a lot of things that we may want to hide.

You are right, we should sanitize ESR_EL2. The general rule I followed
is to keep only the SEA-relevant bits that are useful for userspace and
relevant to guest memory.
Below is a rundown of the SEA bits.

Hide the following bits:
- HDBSSF: hardware dirty state is not guest memory.
- TnD, TagAccess, AssuredOnly, Overlay, DirtyBit: they are for
  permission faults, not SEA.
- GCS: not guest memory.
- Xs: it is for translation/access flag/permission faults.
- Summing up the bits above, ISS2 is entirely hidden.
- ISV and ISS[23:14]: the situation for ISV is a little complicated.
  ISV is 1 mostly for a Translation fault, Access flag fault, or
  Permission fault. When FEAT_RAS is implemented, ISV=0; when FEAT_RAS
  is not implemented, ISV may (implementation defined) be 1 for S2PTW,
  which is not worth returning to userspace (see my reply below about
  S1PTW and S2PTW).
- VNCR: VNCR_EL2 is not guest memory.

Report the following bits because they tell userspace useful info:
- EC: tells if the abort was on an instruction or data access.
- IL: useful if userspace decides to retire the instruction.
- FSC: tells if the abort was on a translation table walk.
- SET, i.e. [12:11]: tells if the abort is recoverable, uncontainable,
  or restartable.
- S1PTW: userspace can tell the guest kernel its stage-1 has a problem.
- FnV: userspace should avoid writing FAR_EL1 if FnV=1.
- CM and WnR: make the ESR "authentic" in general.

I try to be thorough, but please point out anything I missed. I'll
append a rough sketch of the sanitization mask I have in mind at the
end of this mail.

>
> For example, what do we do on a SEA generated by a S1PTW? I don't
> think it makes any sense to report it as such, if only because
> userspace knows nothing of stage-2.

For S1PTW=1, i.e. the memory backing the guest's stage-1 tables is
poisoned or broken, I think returning to the VMM and letting the VMM
inject a SEA into the VM is better virtualization (it reflects the
corresponding bare-metal behavior) than injecting an SError. However,
if memory allocated to stage-2 is poisoned or broken, which will
eventually cause an exception at EL2 and crash the host, returning to
the VMM doesn't seem to be beneficial. I can add a check for S2PTW and,
if so, just keep the old behavior of injecting an SError.

> > +        run->arm_sea.flags = 0ULL;
> > +        run->arm_sea.gva = 0ULL;
> > +        run->arm_sea.gpa = 0ULL;
> > +
> > +        if (kvm_vcpu_sea_far_valid(vcpu)) {
> > +                run->arm_sea.flags |= KVM_EXIT_ARM_SEA_FLAG_GVA_VALID;
> > +                run->arm_sea.gva = kvm_vcpu_get_hfar(vcpu);
> > +        }
>
> Under which circumstances is the guest VA of interest to userspace? I
> can imagine *something*, but it'd be good to document it.

The guest VA/PA are interesting to the VMM if the VMM wants to emulate
the SEA by itself and write the FAR_EL1 + HPFAR_EL1 registers. Will add
a comment around here.

> > +
> > +        if (kvm_vcpu_sea_ipa_valid(vcpu)) {
> > +                run->arm_sea.flags |= KVM_EXIT_ARM_SEA_FLAG_GPA_VALID;
> > +                run->arm_sea.gpa = kvm_vcpu_get_fault_ipa(vcpu);
> > +        }
> > +
> > +        return 0;
> > +}
> > diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> > index 754f2fe0cc673..a605ee56fa150 100644
> > --- a/arch/arm64/kvm/mmu.c
> > +++ b/arch/arm64/kvm/mmu.c
> > @@ -1795,16 +1795,8 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
> >          int ret, idx;
> >
> >          /* Synchronous External Abort? */
> > -        if (kvm_vcpu_abt_issea(vcpu)) {
> > -                /*
> > -                 * For RAS the host kernel may handle this abort.
> > -                 * There is no need to pass the error into the guest.
> > -                 */
> > -                if (kvm_handle_guest_sea())
> > -                        kvm_inject_vabt(vcpu);
> > -
> > -                return 1;
> > -        }
> > +        if (kvm_vcpu_abt_issea(vcpu))
> > +                return kvm_handle_guest_sea(vcpu);
> >
> >          esr = kvm_vcpu_get_esr(vcpu);
> >
> > diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> > index b6ae8ad8934b5..79dc4676ff74b 100644
> > --- a/include/uapi/linux/kvm.h
> > +++ b/include/uapi/linux/kvm.h
> > @@ -178,6 +178,7 @@ struct kvm_xen_exit {
> >  #define KVM_EXIT_NOTIFY           37
> >  #define KVM_EXIT_LOONGARCH_IOCSR  38
> >  #define KVM_EXIT_MEMORY_FAULT     39
> > +#define KVM_EXIT_ARM_SEA          40
> >
> >  /* For KVM_EXIT_INTERNAL_ERROR */
> >  /* Emulate instruction failed. */
> > @@ -446,6 +447,15 @@ struct kvm_run {
> >                          __u64 gpa;
> >                          __u64 size;
> >                  } memory_fault;
> > +                /* KVM_EXIT_ARM_SEA */
> > +                struct {
> > +                        __u64 esr;
> > +#define KVM_EXIT_ARM_SEA_FLAG_GVA_VALID        (1ULL << 0)
> > +#define KVM_EXIT_ARM_SEA_FLAG_GPA_VALID        (1ULL << 1)
> > +                        __u64 flags;
> > +                        __u64 gva;
> > +                        __u64 gpa;
> > +                } arm_sea;
> >                  /* Fix the size of the union. */
> >                  char padding[256];
> >          };
> > @@ -930,6 +940,7 @@ struct kvm_enable_cap {
> >  #define KVM_CAP_X86_APIC_BUS_CYCLES_NS 237
> >  #define KVM_CAP_X86_GUEST_MODE 238
> >  #define KVM_CAP_ARM_WRITABLE_IMP_ID_REGS 239
> > +#define KVM_CAP_ARM_SEA_TO_USER 240
> >
> >  struct kvm_irq_routing_irqchip {
> >          __u32 irqchip;
>
> Thanks,
>
>         M.
>
> --
> Without deviation from the norm, progress is not possible.
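P.S. As mentioned above, here is a rough sketch of the ESR_EL2
sanitization I plan to fold into V2: keep only EC, IL, FSC, SET, FnV,
S1PTW, CM and WnR, and clear everything else (ISV, ISS[23:14], VNCR,
and all of ISS2). This is for discussion only and not part of this
patch; the helper name is made up, and it assumes the usual ESR_ELx_*
masks from <asm/esr.h>.

/* Sketch: mask ESR_EL2 down to the SEA-relevant bits listed above. */
static u64 kvm_sanitize_sea_esr(const struct kvm_vcpu *vcpu)
{
        const u64 keep = ESR_ELx_EC_MASK | ESR_ELx_IL | ESR_ELx_FSC |
                         ESR_ELx_SET_MASK | ESR_ELx_FnV | ESR_ELx_S1PTW |
                         ESR_ELx_CM | ESR_ELx_WNR;

        /* Everything outside "keep", including ISS2 in [63:32], is cleared. */
        return kvm_vcpu_get_esr(vcpu) & keep;
}

kvm_handle_guest_sea() would then do "run->arm_sea.esr =
kvm_sanitize_sea_esr(vcpu);" instead of reporting the raw ESR_EL2.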