On Thu, Oct 24, 2019 at 01:13:06PM +0530, Ganesh Goudar wrote: > From: Aravinda Prasad <arawind...@gmail.com> > > This patch includes migration support for machine check > handling. Especially this patch blocks VM migration > requests until the machine check error handling is > complete as these errors are specific to the source > hardware and is irrelevant on the target hardware. > > [Do not set FWNMI cap in post_load, now its done in .apply hook] > Signed-off-by: Ganesh Goudar <ganes...@linux.ibm.com> > Signed-off-by: Aravinda Prasad <arawind...@gmail.com> > --- > hw/ppc/spapr.c | 41 +++++++++++++++++++++++++++++++++++++++++ > hw/ppc/spapr_events.c | 16 +++++++++++++++- > hw/ppc/spapr_rtas.c | 2 ++ > include/hw/ppc/spapr.h | 2 ++ > 4 files changed, 60 insertions(+), 1 deletion(-) > > diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c > index 346ec5ba6c..e0d0f95ec0 100644 > --- a/hw/ppc/spapr.c > +++ b/hw/ppc/spapr.c > @@ -46,6 +46,7 @@ > #include "migration/qemu-file-types.h" > #include "migration/global_state.h" > #include "migration/register.h" > +#include "migration/blocker.h" > #include "mmu-hash64.h" > #include "mmu-book3s-v3.h" > #include "cpu-models.h" > @@ -1751,6 +1752,8 @@ static void spapr_machine_reset(MachineState *machine) > > /* Signal all vCPUs waiting on this condition */ > qemu_cond_broadcast(&spapr->mc_delivery_cond); > + > + migrate_del_blocker(spapr->fwnmi_migration_blocker); > } > > static void spapr_create_nvram(SpaprMachineState *spapr) > @@ -2041,6 +2044,43 @@ static const VMStateDescription vmstate_spapr_dtb = { > }, > }; > > +static bool spapr_fwnmi_needed(void *opaque) > +{ > + SpaprMachineState *spapr = (SpaprMachineState *)opaque; > + > + return spapr->guest_machine_check_addr != -1; > +} > + > +static int spapr_fwnmi_pre_save(void *opaque) > +{ > + SpaprMachineState *spapr = (SpaprMachineState *)opaque; > + > + /* > + * With -only-migratable QEMU option, we cannot block migration. > + * Hence check if machine check handling is in progress and print > + * a warning message. > + */
IIUC the logic below this could also occur in the case where the fwnmi event occurs after a migration has started, but before it completes, not just with -only-migratable. Is that correct? > + if (spapr->mc_status != -1) { > + warn_report("A machine check is being handled during migration. The" > + "handler may run and log hardware error on the destination"); > + } > + > + return 0; > +} > + > +static const VMStateDescription vmstate_spapr_machine_check = { > + .name = "spapr_machine_check", > + .version_id = 1, > + .minimum_version_id = 1, > + .needed = spapr_fwnmi_needed, > + .pre_save = spapr_fwnmi_pre_save, > + .fields = (VMStateField[]) { > + VMSTATE_UINT64(guest_machine_check_addr, SpaprMachineState), > + VMSTATE_INT32(mc_status, SpaprMachineState), > + VMSTATE_END_OF_LIST() > + }, > +}; > + > static const VMStateDescription vmstate_spapr = { > .name = "spapr", > .version_id = 3, > @@ -2075,6 +2115,7 @@ static const VMStateDescription vmstate_spapr = { > &vmstate_spapr_cap_large_decr, > &vmstate_spapr_cap_ccf_assist, > &vmstate_spapr_cap_fwnmi, > + &vmstate_spapr_machine_check, > NULL > } > }; > diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c > index db44e09154..30d9371c88 100644 > --- a/hw/ppc/spapr_events.c > +++ b/hw/ppc/spapr_events.c > @@ -43,6 +43,7 @@ > #include "qemu/main-loop.h" > #include "hw/ppc/spapr_ovec.h" > #include <libfdt.h> > +#include "migration/blocker.h" > > #define RTAS_LOG_VERSION_MASK 0xff000000 > #define RTAS_LOG_VERSION_6 0x06000000 > @@ -842,6 +843,8 @@ void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered) > { > SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine()); > CPUState *cs = CPU(cpu); > + int ret; > + Error *local_err = NULL; > > if (spapr->guest_machine_check_addr == -1) { > /* > @@ -871,8 +874,19 @@ void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered) > return; > } > } > - spapr->mc_status = cpu->vcpu_id; > > + ret = migrate_add_blocker(spapr->fwnmi_migration_blocker, &local_err); > + if (ret == -EBUSY) { > + /* > + * We don't want to abort so we let the migration to continue. > + * In a rare case, the machine check handler will run on the target. > + * Though this is not preferable, it is better than aborting > + * the migration or killing the VM. > + */ > + warn_report_err(local_err); I suspect the error message in local_err won't be particularly meaningful on its own. Perhaps you need to add a prefix to clarify that the problem is you've received a fwnmi after migration has commenced? > + } > + > + spapr->mc_status = cpu->vcpu_id; > spapr_mce_dispatch_elog(cpu, recovered); > } > > diff --git a/hw/ppc/spapr_rtas.c b/hw/ppc/spapr_rtas.c > index 0328b1f341..c78d96ee7e 100644 > --- a/hw/ppc/spapr_rtas.c > +++ b/hw/ppc/spapr_rtas.c > @@ -50,6 +50,7 @@ > #include "hw/ppc/fdt.h" > #include "target/ppc/mmu-hash64.h" > #include "target/ppc/mmu-book3s-v3.h" > +#include "migration/blocker.h" > > static void rtas_display_character(PowerPCCPU *cpu, SpaprMachineState *spapr, > uint32_t token, uint32_t nargs, > @@ -446,6 +447,7 @@ static void rtas_ibm_nmi_interlock(PowerPCCPU *cpu, > */ > spapr->mc_status = -1; > qemu_cond_signal(&spapr->mc_delivery_cond); > + migrate_del_blocker(spapr->fwnmi_migration_blocker); Oh... damn. I suggested using a static fwnmi_migration_blocker instance, but I just realized there's a problem with it. If we do receive multiple fwnmi events on different cpus at roughly the same time, this will break: I think we'll try to add the same migration blocker instance multiple times, which won't be good. Even if that doesn't do anything *too* bad, then we'll unblock the migration on the first interlock, rather than waiting for all pending fwnmi events to complete. > rtas_st(rets, 0, RTAS_OUT_SUCCESS); > } > > diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h > index 86f0fc8fdd..290abd6328 100644 > --- a/include/hw/ppc/spapr.h > +++ b/include/hw/ppc/spapr.h > @@ -215,6 +215,8 @@ struct SpaprMachineState { > > unsigned gpu_numa_id; > SpaprTpmProxy *tpm_proxy; > + > + Error *fwnmi_migration_blocker; > }; > > #define H_SUCCESS 0 -- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson
signature.asc
Description: PGP signature