On Wed, Dec 16, 2015 at 11:38:37AM +0530, Aravinda Prasad wrote: > Memory error such as bit flips that cannot be corrected > by hardware are passed on to the kernel for handling. > If the memory address in error belongs to guest then > guest kernel is responsible for taking suitable action. > Patch [1] enhances KVM to exit guest with exit reason > set to KVM_EXIT_NMI in such cases. > > This patch handles KVM_EXIT_NMI exit. If the guest OS > has registered the machine check handling routine by > calling "ibm,nmi-register", then the handler builds > the error log and invokes the registered handler else > invokes the handler at 0x200. > > [1] http://marc.info/?l=kvm&m=145024538523497 > > Signed-off-by: Aravinda Prasad <aravi...@linux.vnet.ibm.com> > --- > cpus.c | 5 +++ > hw/ppc/spapr.c | 6 +++ > include/qemu/main-loop.h | 8 ++++ > target-ppc/kvm.c | 86 > ++++++++++++++++++++++++++++++++++++++++++++++ > target-ppc/kvm_ppc.h | 81 +++++++++++++++++++++++++++++++++++++++++++ > 5 files changed, 186 insertions(+) > > diff --git a/cpus.c b/cpus.c > index dddd056..7b7dd0f 100644 > --- a/cpus.c > +++ b/cpus.c > @@ -1154,6 +1154,11 @@ void qemu_mutex_unlock_iothread(void) > qemu_mutex_unlock(&qemu_global_mutex); > } > > +void qemu_cond_wait_iothread(QemuCond *cond) > +{ > + qemu_cond_wait(cond, &qemu_global_mutex); > +} > +
Even though it's trivial, this generic change should probably go in a separate patch from the sPAPR-specific pieces. Speaking of which, I think it's sufficiently trivial that you could just inline it in the header. > static int all_vcpus_paused(void) > { > CPUState *cpu; > diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c > index 05926a3..501dd70 100644 > --- a/hw/ppc/spapr.c > +++ b/hw/ppc/spapr.c > @@ -1556,6 +1556,12 @@ static void ppc_spapr_init(MachineState *machine) > exit(1); > } > spapr->rtas_size = get_image_size(filename); > + > + assert(spapr->rtas_size < RTAS_ERRLOG_OFFSET); > + > + /* Resize blob to accommodate error log. */ > + spapr->rtas_size = RTAS_ERRLOG_OFFSET + sizeof(struct RtasMCELog); > + > spapr->rtas_blob = g_malloc(spapr->rtas_size); > if (load_image_size(filename, spapr->rtas_blob, spapr->rtas_size) < 0) { > error_report("Could not load LPAR rtas '%s'", filename); > diff --git a/include/qemu/main-loop.h b/include/qemu/main-loop.h > index 9976909..c4d4446 100644 > --- a/include/qemu/main-loop.h > +++ b/include/qemu/main-loop.h > @@ -263,6 +263,14 @@ void qemu_mutex_lock_iothread(void); > */ > void qemu_mutex_unlock_iothread(void); > > +/** > + * qemu_cond_wait_iothread: Wait on condition for the main loop mutex > + * > + * This function atomically releases the main loop mutex and causes > + * the calling thread to block on the condition. 
> + */ > +void qemu_cond_wait_iothread(QemuCond *cond); > + > /* internal interfaces */ > > void qemu_fd_register(int fd); > diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c > index 110436d..2bbb46d 100644 > --- a/target-ppc/kvm.c > +++ b/target-ppc/kvm.c > @@ -1665,6 +1665,11 @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run > *run) > ret = 0; > break; > > + case KVM_EXIT_NMI: > + DPRINTF("handle NMI exception\n"); > + ret = kvm_handle_nmi(cpu); > + break; > + > default: > fprintf(stderr, "KVM: unknown exit reason %d\n", run->exit_reason); > ret = -1; > @@ -2484,3 +2489,84 @@ int kvm_arch_msi_data_to_gsi(uint32_t data) > { > return data & 0xffff; > } > + > +int kvm_handle_nmi(PowerPCCPU *cpu) > +{ > + struct RtasMCELog mc_log; > + CPUPPCState *env = &cpu->env; You go from cpu to env here.. > + sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_machine()); > + PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cpu); > + target_ulong msr = 0; > + > + cpu_synchronize_state(CPU(ppc_env_get_cpu(env))); Then back again awkwardly, although you still have the cpu variable. > + > + /* > + * Properly set bits in MSR before we invoke the handler. > + * SRR0/1, DAR and DSISR are properly set by KVM > + */ > + if (!(*pcc->interrupts_big_endian)(cpu)) { > + msr |= (1ULL << MSR_LE); > + } > + > + if (env->msr && (1ULL << MSR_SF)) { > + msr |= (1ULL << MSR_SF); > + } > + > + msr |= (1ULL << MSR_ME); Based on earlier discussions, sounds like assert(msr & (1ULL << MSR_ME)) would actually be correct here. 
> + env->msr = msr; > + > + if (!spapr->guest_machine_check_addr) { > + /* > + * If OS has not registered with "ibm,nmi-register" > + * jump to 0x200 > + */ > + env->nip = 0x200; > + return 0; > + } > + > + while (spapr->mc_in_progress) { > + /* > + * Check whether the same CPU got machine check error > + * while still handling the mc error (i.e., before > + * that CPU called "ibm,nmi-interlock" > + */ > + if (spapr->mc_cpu == cpu->cpu_dt_id) { > + qemu_system_guest_panicked(); > + } > + qemu_cond_wait_iothread(&spapr->mc_delivery_cond); > + } > + spapr->mc_in_progress = true; > + spapr->mc_cpu = cpu->cpu_dt_id; > + > + /* Set error log fields */ > + mc_log.r3 = env->gpr[3]; > + mc_log.err_log.byte0 = 0; > + mc_log.err_log.byte1 = > + (RTAS_SEVERITY_ERROR_SYNC << RTAS_ELOG_SEVERITY_SHIFT); > + mc_log.err_log.byte1 |= > + (RTAS_DISP_NOT_RECOVERED << RTAS_ELOG_DISPOSITION_SHIFT); > + mc_log.err_log.byte2 = > + (RTAS_INITIATOR_MEMORY << RTAS_ELOG_INITIATOR_SHIFT); > + mc_log.err_log.byte2 |= RTAS_TARGET_MEMORY; > + > + if (env->spr[SPR_DSISR] & P7_DSISR_MC_UE) { > + mc_log.err_log.byte3 = RTAS_TYPE_ECC_UNCORR; > + } else { > + mc_log.err_log.byte3 = 0; > + } > + > + /* Handle all Host/Guest LE/BE combinations */ > + if (env->msr & (1ULL << MSR_LE)) { > + mc_log.r3 = cpu_to_le64(mc_log.r3); > + } else { > + mc_log.r3 = cpu_to_be64(mc_log.r3); > + } > + > + cpu_physical_memory_write(spapr->rtas_addr + RTAS_ERRLOG_OFFSET, > + &mc_log, sizeof(mc_log)); > + > + env->nip = spapr->guest_machine_check_addr; > + env->gpr[3] = spapr->rtas_addr + RTAS_ERRLOG_OFFSET; > + > + return 0; > +} > diff --git a/target-ppc/kvm_ppc.h b/target-ppc/kvm_ppc.h > index 5c1d334..ea3345b 100644 > --- a/target-ppc/kvm_ppc.h > +++ b/target-ppc/kvm_ppc.h > @@ -53,6 +53,87 @@ void kvmppc_hash64_free_pteg(uint64_t token); > void kvmppc_hash64_write_pte(CPUPPCState *env, target_ulong pte_index, > target_ulong pte0, target_ulong pte1); > bool kvmppc_has_cap_fixup_hcalls(void); > +int 
kvm_handle_nmi(PowerPCCPU *cpu); > + > +/* Offset from rtas-base where error log is placed */ > +#define RTAS_ERRLOG_OFFSET 0x200 > + > +#define RTAS_ELOG_SEVERITY_SHIFT 0x5 > +#define RTAS_ELOG_DISPOSITION_SHIFT 0x3 > +#define RTAS_ELOG_INITIATOR_SHIFT 0x4 > + > +/* > + * Only required RTAS event severity, disposition, initiator > + * target and type are copied from arch/powerpc/include/asm/rtas.h > + */ > + > +/* RTAS event severity */ > +#define RTAS_SEVERITY_ERROR_SYNC 0x3 > + > +/* RTAS event disposition */ > +#define RTAS_DISP_NOT_RECOVERED 0x2 > + > +/* RTAS event initiator */ > +#define RTAS_INITIATOR_MEMORY 0x4 > + > +/* RTAS event target */ > +#define RTAS_TARGET_MEMORY 0x4 > + > +/* RTAS event type */ > +#define RTAS_TYPE_ECC_UNCORR 0x09 > + > +/* > + * Currently KVM only passes on the uncorrected machine > + * check memory error to guest. Other machine check errors > + * such as SLB multi-hit and TLB multi-hit are recovered > + * in KVM and are not passed on to guest. > + * > + * DSISR Bit for uncorrected machine check error. Based > + * on arch/powerpc/include/asm/mce.h > + */ > +#define PPC_BIT(bit) (0x8000000000000000ULL >> bit) > +#define P7_DSISR_MC_UE (PPC_BIT(48)) /* P8 too */ > + > +/* Adopted from kernel source arch/powerpc/include/asm/rtas.h */ > +struct rtas_error_log { > + /* Byte 0 */ > + uint8_t byte0; /* Architectural version */ > + > + /* Byte 1 */ > + uint8_t byte1; > + /* XXXXXXXX > + * XXX 3: Severity level of error > + * XX 2: Degree of recovery > + * X 1: Extended log present? > + * XX 2: Reserved > + */ > + > + /* Byte 2 */ > + uint8_t byte2; > + /* XXXXXXXX > + * XXXX 4: Initiator of event > + * XXXX 4: Target of failed operation > + */ > + uint8_t byte3; /* General event or error*/ > + __be32 extended_log_length; /* length in bytes */ > + unsigned char buffer[1]; /* Start of extended log */ > + /* Variable length. 
*/ > +}; > + > +/* > + * Data format in RTAS-Blob > + * > + * This structure contains error information related to Machine > + * Check exception. This is filled up and copied to rtas-blob > + * upon machine check exception. The address of rtas-blob is > + * passed on to OS registered machine check notification > + * routines upon machine check exception > + */ > +struct RtasMCELog { > + target_ulong r3; > + struct rtas_error_log err_log; > +}; > + > > #else > > -- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson
signature.asc
Description: PGP signature