On Thursday 17 December 2015 09:30 AM, David Gibson wrote: > On Wed, Dec 16, 2015 at 11:38:37AM +0530, Aravinda Prasad wrote: >> Memory error such as bit flips that cannot be corrected >> by hardware are passed on to the kernel for handling. >> If the memory address in error belongs to guest then >> guest kernel is responsible for taking suitable action. >> Patch [1] enhances KVM to exit guest with exit reason >> set to KVM_EXIT_NMI in such cases. >> >> This patch handles KVM_EXIT_NMI exit. If the guest OS >> has registered the machine check handling routine by >> calling "ibm,nmi-register", then the handler builds >> the error log and invokes the registered handler else >> invokes the handler at 0x200. >> >> [1] http://marc.info/?l=kvm&m=145024538523497 >> >> Signed-off-by: Aravinda Prasad <aravi...@linux.vnet.ibm.com> >> --- >> cpus.c | 5 +++ >> hw/ppc/spapr.c | 6 +++ >> include/qemu/main-loop.h | 8 ++++ >> target-ppc/kvm.c | 86 >> ++++++++++++++++++++++++++++++++++++++++++++++ >> target-ppc/kvm_ppc.h | 81 +++++++++++++++++++++++++++++++++++++++++++ >> 5 files changed, 186 insertions(+) >> >> diff --git a/cpus.c b/cpus.c >> index dddd056..7b7dd0f 100644 >> --- a/cpus.c >> +++ b/cpus.c >> @@ -1154,6 +1154,11 @@ void qemu_mutex_unlock_iothread(void) >> qemu_mutex_unlock(&qemu_global_mutex); >> } >> >> +void qemu_cond_wait_iothread(QemuCond *cond) >> +{ >> + qemu_cond_wait(cond, &qemu_global_mutex); >> +} >> + > > Even though it's trivial, this generic change should probably go in a > separate patch from the papr specific pieces. > > Speaking of which, I think it's sufficiently trivial you could just > inline it in the header.
I originally had it in a separate patch, but later folded it into this one because it is trivial. I will move it back into a separate patch. > >> static int all_vcpus_paused(void) >> { >> CPUState *cpu; >> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c >> index 05926a3..501dd70 100644 >> --- a/hw/ppc/spapr.c >> +++ b/hw/ppc/spapr.c >> @@ -1556,6 +1556,12 @@ static void ppc_spapr_init(MachineState *machine) >> exit(1); >> } >> spapr->rtas_size = get_image_size(filename); >> + >> + assert(spapr->rtas_size < RTAS_ERRLOG_OFFSET); >> + >> + /* Resize blob to accommodate error log. */ >> + spapr->rtas_size = RTAS_ERRLOG_OFFSET + sizeof(struct RtasMCELog); >> + >> spapr->rtas_blob = g_malloc(spapr->rtas_size); >> if (load_image_size(filename, spapr->rtas_blob, spapr->rtas_size) < 0) { >> error_report("Could not load LPAR rtas '%s'", filename); >> diff --git a/include/qemu/main-loop.h b/include/qemu/main-loop.h >> index 9976909..c4d4446 100644 >> --- a/include/qemu/main-loop.h >> +++ b/include/qemu/main-loop.h >> @@ -263,6 +263,14 @@ void qemu_mutex_lock_iothread(void); >> */ >> void qemu_mutex_unlock_iothread(void); >> >> +/** >> + * qemu_cond_wait_iothread: Wait on condition for the main loop mutex >> + * >> + * This function atomically releases the main loop mutex and causes >> + * the calling thread to block on the condition. 
>> + */ >> +void qemu_cond_wait_iothread(QemuCond *cond); >> + >> /* internal interfaces */ >> >> void qemu_fd_register(int fd); >> diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c >> index 110436d..2bbb46d 100644 >> --- a/target-ppc/kvm.c >> +++ b/target-ppc/kvm.c >> @@ -1665,6 +1665,11 @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run >> *run) >> ret = 0; >> break; >> >> + case KVM_EXIT_NMI: >> + DPRINTF("handle NMI exception\n"); >> + ret = kvm_handle_nmi(cpu); >> + break; >> + >> default: >> fprintf(stderr, "KVM: unknown exit reason %d\n", run->exit_reason); >> ret = -1; >> @@ -2484,3 +2489,84 @@ int kvm_arch_msi_data_to_gsi(uint32_t data) >> { >> return data & 0xffff; >> } >> + >> +int kvm_handle_nmi(PowerPCCPU *cpu) >> +{ >> + struct RtasMCELog mc_log; >> + CPUPPCState *env = &cpu->env; > > You go from cpu to env here.. > >> + sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_machine()); >> + PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cpu); >> + target_ulong msr = 0; >> + >> + cpu_synchronize_state(CPU(ppc_env_get_cpu(env))); > > Then back again awkwardly, although you still have the cpu variable. ah.. I overlooked it. > >> + >> + /* >> + * Properly set bits in MSR before we invoke the handler. >> + * SRR0/1, DAR and DSISR are properly set by KVM >> + */ >> + if (!(*pcc->interrupts_big_endian)(cpu)) { >> + msr |= (1ULL << MSR_LE); >> + } >> + >> + if (env->msr && (1ULL << MSR_SF)) { >> + msr |= (1ULL << MSR_SF); >> + } >> + >> + msr |= (1ULL << MSR_ME); > > Based on earlier discussions, sounds like assert(msr & (1ULL << > MSR_ME)) would actually be correct here. Based on http://lists.nongnu.org/archive/html/qemu-ppc/2015-11/msg00306.html, I always set MSR_ME and don't assert if not set. Or am I missing anything here? 
Regards, Aravinda > >> + env->msr = msr; >> + >> + if (!spapr->guest_machine_check_addr) { >> + /* >> + * If OS has not registered with "ibm,nmi-register" >> + * jump to 0x200 >> + */ >> + env->nip = 0x200; >> + return 0; >> + } >> + >> + while (spapr->mc_in_progress) { >> + /* >> + * Check whether the same CPU got machine check error >> + * while still handling the mc error (i.e., before >> + * that CPU called "ibm,nmi-interlock" >> + */ >> + if (spapr->mc_cpu == cpu->cpu_dt_id) { >> + qemu_system_guest_panicked(); >> + } >> + qemu_cond_wait_iothread(&spapr->mc_delivery_cond); >> + } >> + spapr->mc_in_progress = true; >> + spapr->mc_cpu = cpu->cpu_dt_id; >> + >> + /* Set error log fields */ >> + mc_log.r3 = env->gpr[3]; >> + mc_log.err_log.byte0 = 0; >> + mc_log.err_log.byte1 = >> + (RTAS_SEVERITY_ERROR_SYNC << RTAS_ELOG_SEVERITY_SHIFT); >> + mc_log.err_log.byte1 |= >> + (RTAS_DISP_NOT_RECOVERED << RTAS_ELOG_DISPOSITION_SHIFT); >> + mc_log.err_log.byte2 = >> + (RTAS_INITIATOR_MEMORY << RTAS_ELOG_INITIATOR_SHIFT); >> + mc_log.err_log.byte2 |= RTAS_TARGET_MEMORY; >> + >> + if (env->spr[SPR_DSISR] & P7_DSISR_MC_UE) { >> + mc_log.err_log.byte3 = RTAS_TYPE_ECC_UNCORR; >> + } else { >> + mc_log.err_log.byte3 = 0; >> + } >> + >> + /* Handle all Host/Guest LE/BE combinations */ >> + if (env->msr & (1ULL << MSR_LE)) { >> + mc_log.r3 = cpu_to_le64(mc_log.r3); >> + } else { >> + mc_log.r3 = cpu_to_be64(mc_log.r3); >> + } >> + >> + cpu_physical_memory_write(spapr->rtas_addr + RTAS_ERRLOG_OFFSET, >> + &mc_log, sizeof(mc_log)); >> + >> + env->nip = spapr->guest_machine_check_addr; >> + env->gpr[3] = spapr->rtas_addr + RTAS_ERRLOG_OFFSET; >> + >> + return 0; >> +} >> diff --git a/target-ppc/kvm_ppc.h b/target-ppc/kvm_ppc.h >> index 5c1d334..ea3345b 100644 >> --- a/target-ppc/kvm_ppc.h >> +++ b/target-ppc/kvm_ppc.h >> @@ -53,6 +53,87 @@ void kvmppc_hash64_free_pteg(uint64_t token); >> void kvmppc_hash64_write_pte(CPUPPCState *env, target_ulong pte_index, >> target_ulong pte0, 
target_ulong pte1); >> bool kvmppc_has_cap_fixup_hcalls(void); >> +int kvm_handle_nmi(PowerPCCPU *cpu); >> + >> +/* Offset from rtas-base where error log is placed */ >> +#define RTAS_ERRLOG_OFFSET 0x200 >> + >> +#define RTAS_ELOG_SEVERITY_SHIFT 0x5 >> +#define RTAS_ELOG_DISPOSITION_SHIFT 0x3 >> +#define RTAS_ELOG_INITIATOR_SHIFT 0x4 >> + >> +/* >> + * Only required RTAS event severity, disposition, initiator >> + * target and type are copied from arch/powerpc/include/asm/rtas.h >> + */ >> + >> +/* RTAS event severity */ >> +#define RTAS_SEVERITY_ERROR_SYNC 0x3 >> + >> +/* RTAS event disposition */ >> +#define RTAS_DISP_NOT_RECOVERED 0x2 >> + >> +/* RTAS event initiator */ >> +#define RTAS_INITIATOR_MEMORY 0x4 >> + >> +/* RTAS event target */ >> +#define RTAS_TARGET_MEMORY 0x4 >> + >> +/* RTAS event type */ >> +#define RTAS_TYPE_ECC_UNCORR 0x09 >> + >> +/* >> + * Currently KVM only passes on the uncorrected machine >> + * check memory error to guest. Other machine check errors >> + * such as SLB multi-hit and TLB multi-hit are recovered >> + * in KVM and are not passed on to guest. >> + * >> + * DSISR Bit for uncorrected machine check error. Based >> + * on arch/powerpc/include/asm/mce.h >> + */ >> +#define PPC_BIT(bit) (0x8000000000000000ULL >> bit) >> +#define P7_DSISR_MC_UE (PPC_BIT(48)) /* P8 too */ >> + >> +/* Adopted from kernel source arch/powerpc/include/asm/rtas.h */ >> +struct rtas_error_log { >> + /* Byte 0 */ >> + uint8_t byte0; /* Architectural version */ >> + >> + /* Byte 1 */ >> + uint8_t byte1; >> + /* XXXXXXXX >> + * XXX 3: Severity level of error >> + * XX 2: Degree of recovery >> + * X 1: Extended log present? 
>> + * XX 2: Reserved >> + */ >> + >> + /* Byte 2 */ >> + uint8_t byte2; >> + /* XXXXXXXX >> + * XXXX 4: Initiator of event >> + * XXXX 4: Target of failed operation >> + */ >> + uint8_t byte3; /* General event or error*/ >> + __be32 extended_log_length; /* length in bytes */ >> + unsigned char buffer[1]; /* Start of extended log */ >> + /* Variable length. */ >> +}; >> + >> +/* >> + * Data format in RTAS-Blob >> + * >> + * This structure contains error information related to Machine >> + * Check exception. This is filled up and copied to rtas-blob >> + * upon machine check exception. The address of rtas-blob is >> + * passed on to OS registered machine check notification >> + * routines upon machine check exception >> + */ >> +struct RtasMCELog { >> + target_ulong r3; >> + struct rtas_error_log err_log; >> +}; >> + >> >> #else >> >> > -- Regards, Aravinda