On 03/31/2014 06:52 PM, Alexey Kardashevskiy wrote: > On Sat, Aug 3, 2013 at 2:30 AM, Vincenzo Maffione <v.maffi...@gmail.com>wrote: > >> This patch partially implements the e1000 interrupt mitigation mechanisms. >> Using a single QEMUTimer, it emulates the ITR register (which is the newer >> mitigation register, recommended by Intel) and approximately emulates >> RADV and TADV registers. TIDV and RDTR register functionalities are not >> emulated (RDTR is only used to validate RADV, according to the e1000 >> specs). >> > > > I have been debugging e1000 today and discovered that e1000 does not > survive migration.
Sorry, false alarm :) The problem is in ppc64-kvm and not e1000-specific (but somehow it is a lot easier to trigger by using e1000). > The systems are PPC64 (POWER7), machine names are vpl1, vpl2, the guest > name is aiktest0. > QEMU command line is: > /home/aik/qemu-system-ppc64 \ > -enable-kvm \ > -m 1024 \ > -machine pseries \ > -nographic \ > -vga none \ > -initrd 1.cpio \ > -kernel vml310 \ > -netdev > tap,id=id0,ifname=tap-testmig,script=ifup.sh,downscript=ifdown.sh \ > -device e1000,id=id1,netdev=id0,mitigation=off,mac=C0:41:49:4b:00:00 \ > -incoming tcp:vpl2:4000 \ > > What I do: > 1. Run a guest. > 2. Do dhclient > 3. Start "ping -f vpl2" in the guest. > 4. Start "ping -f aiktest0" on vpl1 (once) and vpl2 (twice, to create > enough load). > 5. Migrate. > > Normally after migration, s->mit_irq_level is zero at the exit from > e1000_post_load() and destination guest continues working fine. However > sometime it is (s->mit_irq_level == 1). If this is the case, the device > will stay dead and will not recover, periodically printing weird messages > and trying to reset the adapter. RDH becomes equal to RDT, e1000_has_rxbufs > returns false and that's it. > > Running both guests with -device e1000,...mitigation=off helps. > > Any idea how to fix this properly? Thanks. > > > >> >> RADV, TADV, TIDV and RDTR registers make up the older e1000 mitigation >> mechanism and would need a timer each to be completely emulated. However, >> a single timer has been used in order to reach a good compromise between >> emulation accuracy and simplicity/efficiency. >> >> The implemented mechanism can be enabled/disabled specifying the command >> line e1000-specific boolean parameter "mitigation", e.g. >> >> qemu-system-x86_64 -device e1000,mitigation=on,... ... >> >> For more information, see the Software developer's manual at >> http://download.intel.com/design/network/manuals/8254x_GBe_SDM.pdf. >> >> Interrupt mitigation boosts performance when the guest suffers from >> an high interrupt rate (i.e. receiving short UDP packets at high packet >> rate). For some numerical results see the following link >> http://info.iet.unipi.it/~luigi/papers/20130520-rizzo-vm.pdf >> >> Signed-off-by: Vincenzo Maffione <v.maffi...@gmail.com> >> --- >> Added pc-*-1.7 machines (default machine moved to pc-i440fx-1.7). >> >> hw/i386/pc_piix.c | 18 ++++++- >> hw/i386/pc_q35.c | 16 ++++++- >> hw/net/e1000.c | 131 >> +++++++++++++++++++++++++++++++++++++++++++++++++-- >> include/hw/i386/pc.h | 8 ++++ >> 4 files changed, 167 insertions(+), 6 deletions(-) >> >> diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c >> index ab25458..f5183d4 100644 >> --- a/hw/i386/pc_piix.c >> +++ b/hw/i386/pc_piix.c >> @@ -336,8 +336,8 @@ static void pc_xen_hvm_init(QEMUMachineInitArgs *args) >> } >> #endif >> >> -static QEMUMachine pc_i440fx_machine_v1_6 = { >> - .name = "pc-i440fx-1.6", >> +static QEMUMachine pc_i440fx_machine_v1_7 = { >> + .name = "pc-i440fx-1.7", >> .alias = "pc", >> .desc = "Standard PC (i440FX + PIIX, 1996)", >> .init = pc_init_pci, >> @@ -347,6 +347,19 @@ static QEMUMachine pc_i440fx_machine_v1_6 = { >> DEFAULT_MACHINE_OPTIONS, >> }; >> >> +static QEMUMachine pc_i440fx_machine_v1_6 = { >> + .name = "pc-i440fx-1.6", >> + .desc = "Standard PC (i440FX + PIIX, 1996)", >> + .init = pc_init_pci, >> + .hot_add_cpu = pc_hot_add_cpu, >> + .max_cpus = 255, >> + .compat_props = (GlobalProperty[]) { >> + PC_COMPAT_1_6, >> + { /* end of list */ } >> + }, >> + DEFAULT_MACHINE_OPTIONS, >> +}; >> + >> static QEMUMachine pc_i440fx_machine_v1_5 = { >> .name = "pc-i440fx-1.5", >> .desc = "Standard PC (i440FX + PIIX, 1996)", >> @@ -769,6 +782,7 @@ static QEMUMachine xenfv_machine = { >> >> static void pc_machine_init(void) >> { >> + qemu_register_machine(&pc_i440fx_machine_v1_7); >> qemu_register_machine(&pc_i440fx_machine_v1_6); >> qemu_register_machine(&pc_i440fx_machine_v1_5); >> qemu_register_machine(&pc_i440fx_machine_v1_4); >> diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c >> index 2f35d12..09b0a54 100644 >> --- a/hw/i386/pc_q35.c >> +++ b/hw/i386/pc_q35.c >> @@ -230,13 +230,26 @@ static void pc_q35_init_1_4(QEMUMachineInitArgs >> *args) >> pc_q35_init_1_5(args); >> } >> >> +static QEMUMachine pc_q35_machine_v1_7 = { >> + .name = "pc-q35-1.7", >> + .alias = "q35", >> + .desc = "Standard PC (Q35 + ICH9, 2009)", >> + .init = pc_q35_init, >> + .hot_add_cpu = pc_hot_add_cpu, >> + .max_cpus = 255, >> + DEFAULT_MACHINE_OPTIONS, >> +}; >> + >> static QEMUMachine pc_q35_machine_v1_6 = { >> .name = "pc-q35-1.6", >> - .alias = "q35", >> .desc = "Standard PC (Q35 + ICH9, 2009)", >> .init = pc_q35_init, >> .hot_add_cpu = pc_hot_add_cpu, >> .max_cpus = 255, >> + .compat_props = (GlobalProperty[]) { >> + PC_COMPAT_1_6, >> + { /* end of list */ } >> + }, >> DEFAULT_MACHINE_OPTIONS, >> }; >> >> @@ -267,6 +280,7 @@ static QEMUMachine pc_q35_machine_v1_4 = { >> >> static void pc_q35_machine_init(void) >> { >> + qemu_register_machine(&pc_q35_machine_v1_7); >> qemu_register_machine(&pc_q35_machine_v1_6); >> qemu_register_machine(&pc_q35_machine_v1_5); >> qemu_register_machine(&pc_q35_machine_v1_4); >> diff --git a/hw/net/e1000.c b/hw/net/e1000.c >> index fdb1f89..32014c2 100644 >> --- a/hw/net/e1000.c >> +++ b/hw/net/e1000.c >> @@ -135,9 +135,16 @@ typedef struct E1000State_st { >> >> QEMUTimer *autoneg_timer; >> >> + QEMUTimer *mit_timer; /* Mitigation timer. */ >> + bool mit_timer_on; /* Mitigation timer is running. */ >> + bool mit_irq_level; /* Tracks interrupt pin level. */ >> + uint32_t mit_ide; /* Tracks E1000_TXD_CMD_IDE bit. */ >> + >> /* Compatibility flags for migration to/from qemu 1.3.0 and older */ >> #define E1000_FLAG_AUTONEG_BIT 0 >> +#define E1000_FLAG_MIT_BIT 1 >> #define E1000_FLAG_AUTONEG (1 << E1000_FLAG_AUTONEG_BIT) >> +#define E1000_FLAG_MIT (1 << E1000_FLAG_MIT_BIT) >> uint32_t compat_flags; >> } E1000State; >> >> @@ -158,7 +165,8 @@ enum { >> defreg(TORH), defreg(TORL), defreg(TOTH), defreg(TOTL), >> defreg(TPR), defreg(TPT), defreg(TXDCTL), defreg(WUFC), >> defreg(RA), defreg(MTA), >> defreg(CRCERRS),defreg(VFTA), >> - defreg(VET), >> + defreg(VET), defreg(RDTR), defreg(RADV), defreg(TADV), >> + defreg(ITR), >> }; >> >> static void >> @@ -245,10 +253,21 @@ static const uint32_t mac_reg_init[] = { >> E1000_MANC_RMCP_EN, >> }; >> >> +/* Helper function, *curr == 0 means the value is not set */ >> +static inline void >> +mit_update_delay(uint32_t *curr, uint32_t value) >> +{ >> + if (value && (*curr == 0 || value < *curr)) { >> + *curr = value; >> + } >> +} >> + >> static void >> set_interrupt_cause(E1000State *s, int index, uint32_t val) >> { >> PCIDevice *d = PCI_DEVICE(s); >> + uint32_t pending_ints; >> + uint32_t mit_delay; >> >> if (val && (E1000_DEVID >= E1000_DEV_ID_82547EI_MOBILE)) { >> /* Only for 8257x */ >> @@ -266,7 +285,57 @@ set_interrupt_cause(E1000State *s, int index, >> uint32_t val) >> */ >> s->mac_reg[ICS] = val; >> >> - qemu_set_irq(d->irq[0], (s->mac_reg[IMS] & s->mac_reg[ICR]) != 0); >> + pending_ints = (s->mac_reg[IMS] & s->mac_reg[ICR]); >> + if (!s->mit_irq_level && pending_ints) { >> + /* >> + * Here we detect a potential raising edge. We postpone raising >> the >> + * interrupt line if we are inside the mitigation delay window >> + * (s->mit_timer_on == 1). >> + * We provide a partial implementation of interrupt mitigation, >> + * emulating only RADV, TADV and ITR (lower 16 bits, 1024ns units >> for >> + * RADV and TADV, 256ns units for ITR). RDTR is only used to >> enable >> + * RADV; relative timers based on TIDV and RDTR are not >> implemented. >> + */ >> + if (s->mit_timer_on) { >> + return; >> + } >> + if (s->compat_flags & E1000_FLAG_MIT) { >> + /* Compute the next mitigation delay according to pending >> + * interrupts and the current values of RADV (provided >> + * RDTR!=0), TADV and ITR. >> + * Then rearm the timer. >> + */ >> + mit_delay = 0; >> + if (s->mit_ide && >> + (pending_ints & (E1000_ICR_TXQE | E1000_ICR_TXDW))) { >> + mit_update_delay(&mit_delay, s->mac_reg[TADV] * 4); >> + } >> + if (s->mac_reg[RDTR] && (pending_ints & E1000_ICS_RXT0)) { >> + mit_update_delay(&mit_delay, s->mac_reg[RADV] * 4); >> + } >> + mit_update_delay(&mit_delay, s->mac_reg[ITR]); >> + >> + if (mit_delay) { >> + s->mit_timer_on = 1; >> + qemu_mod_timer(s->mit_timer, >> + qemu_get_clock_ns(vm_clock) + mit_delay * 256); >> + } >> + s->mit_ide = 0; >> + } >> + } >> + >> + s->mit_irq_level = (pending_ints != 0); >> + qemu_set_irq(d->irq[0], s->mit_irq_level); >> +} >> + >> +static void >> +e1000_mit_timer(void *opaque) >> +{ >> + E1000State *s = opaque; >> + >> + s->mit_timer_on = 0; >> + /* Call set_interrupt_cause to update the irq level (if necessary). */ >> + set_interrupt_cause(s, 0, s->mac_reg[ICR]); >> } >> >> static void >> @@ -307,6 +376,10 @@ static void e1000_reset(void *opaque) >> int i; >> >> qemu_del_timer(d->autoneg_timer); >> + qemu_del_timer(d->mit_timer); >> + d->mit_timer_on = 0; >> + d->mit_irq_level = 0; >> + d->mit_ide = 0; >> memset(d->phy_reg, 0, sizeof d->phy_reg); >> memmove(d->phy_reg, phy_reg_init, sizeof phy_reg_init); >> memset(d->mac_reg, 0, sizeof d->mac_reg); >> @@ -572,6 +645,7 @@ process_tx_desc(E1000State *s, struct e1000_tx_desc >> *dp) >> struct e1000_context_desc *xp = (struct e1000_context_desc *)dp; >> struct e1000_tx *tp = &s->tx; >> >> + s->mit_ide |= (txd_lower & E1000_TXD_CMD_IDE); >> if (dtype == E1000_TXD_CMD_DEXT) { // context descriptor >> op = le32_to_cpu(xp->cmd_and_length); >> tp->ipcss = xp->lower_setup.ip_fields.ipcss; >> @@ -1047,7 +1121,8 @@ static uint32_t (*macreg_readops[])(E1000State *, >> int) = { >> getreg(TORL), getreg(TOTL), getreg(IMS), getreg(TCTL), >> getreg(RDH), getreg(RDT), getreg(VET), getreg(ICS), >> getreg(TDBAL), getreg(TDBAH), getreg(RDBAH), getreg(RDBAL), >> - getreg(TDLEN), getreg(RDLEN), >> + getreg(TDLEN), getreg(RDLEN), getreg(RDTR), getreg(RADV), >> + getreg(TADV), getreg(ITR), >> >> [TOTH] = mac_read_clr8, [TORH] = mac_read_clr8, [GPRC] = >> mac_read_clr4, >> [GPTC] = mac_read_clr4, [TPR] = mac_read_clr4, [TPT] = >> mac_read_clr4, >> @@ -1069,6 +1144,8 @@ static void (*macreg_writeops[])(E1000State *, int, >> uint32_t) = { >> [TDH] = set_16bit, [RDH] = set_16bit, [RDT] = set_rdt, >> [IMC] = set_imc, [IMS] = set_ims, [ICR] = set_icr, >> [EECD] = set_eecd, [RCTL] = set_rx_control, [CTRL] = set_ctrl, >> + [RDTR] = set_16bit, [RADV] = set_16bit, [TADV] = set_16bit, >> + [ITR] = set_16bit, >> [RA ... RA+31] = &mac_writereg, >> [MTA ... MTA+127] = &mac_writereg, >> [VFTA ... VFTA+127] = &mac_writereg, >> @@ -1150,6 +1227,11 @@ static void e1000_pre_save(void *opaque) >> E1000State *s = opaque; >> NetClientState *nc = qemu_get_queue(s->nic); >> >> + /* If the mitigation timer is active, emulate a timeout now. */ >> + if (s->mit_timer_on) { >> + e1000_mit_timer(s); >> + } >> + >> if (!(s->compat_flags & E1000_FLAG_AUTONEG)) { >> return; >> } >> @@ -1171,6 +1253,14 @@ static int e1000_post_load(void *opaque, int >> version_id) >> E1000State *s = opaque; >> NetClientState *nc = qemu_get_queue(s->nic); >> >> + if (!(s->compat_flags & E1000_FLAG_MIT)) { >> + s->mac_reg[ITR] = s->mac_reg[RDTR] = s->mac_reg[RADV] = >> + s->mac_reg[TADV] = 0; >> + s->mit_irq_level = false; >> + } >> + s->mit_ide = 0; >> + s->mit_timer_on = false; >> + >> /* nc.link_down can't be migrated, so infer link_down according >> * to link status bit in mac_reg[STATUS]. >> * Alternatively, restart link negotiation if it was in progress. */ >> @@ -1190,6 +1280,28 @@ static int e1000_post_load(void *opaque, int >> version_id) >> return 0; >> } >> >> +static bool e1000_mit_state_needed(void *opaque) >> +{ >> + E1000State *s = opaque; >> + >> + return s->compat_flags & E1000_FLAG_MIT; >> +} >> + >> +static const VMStateDescription vmstate_e1000_mit_state = { >> + .name = "e1000/mit_state", >> + .version_id = 1, >> + .minimum_version_id = 1, >> + .minimum_version_id_old = 1, >> + .fields = (VMStateField[]) { >> + VMSTATE_UINT32(mac_reg[RDTR], E1000State), >> + VMSTATE_UINT32(mac_reg[RADV], E1000State), >> + VMSTATE_UINT32(mac_reg[TADV], E1000State), >> + VMSTATE_UINT32(mac_reg[ITR], E1000State), >> + VMSTATE_BOOL(mit_irq_level, E1000State), >> + VMSTATE_END_OF_LIST() >> + } >> +}; >> + >> static const VMStateDescription vmstate_e1000 = { >> .name = "e1000", >> .version_id = 2, >> @@ -1267,6 +1379,14 @@ static const VMStateDescription vmstate_e1000 = { >> VMSTATE_UINT32_SUB_ARRAY(mac_reg, E1000State, MTA, 128), >> VMSTATE_UINT32_SUB_ARRAY(mac_reg, E1000State, VFTA, 128), >> VMSTATE_END_OF_LIST() >> + }, >> + .subsections = (VMStateSubsection[]) { >> + { >> + .vmsd = &vmstate_e1000_mit_state, >> + .needed = e1000_mit_state_needed, >> + }, { >> + /* empty */ >> + } >> } >> }; >> >> @@ -1316,6 +1436,8 @@ pci_e1000_uninit(PCIDevice *dev) >> >> qemu_del_timer(d->autoneg_timer); >> qemu_free_timer(d->autoneg_timer); >> + qemu_del_timer(d->mit_timer); >> + qemu_free_timer(d->mit_timer); >> memory_region_destroy(&d->mmio); >> memory_region_destroy(&d->io); >> qemu_del_nic(d->nic); >> @@ -1371,6 +1493,7 @@ static int pci_e1000_init(PCIDevice *pci_dev) >> add_boot_device_path(d->conf.bootindex, dev, "/ethernet-phy@0"); >> >> d->autoneg_timer = qemu_new_timer_ms(vm_clock, e1000_autoneg_timer, >> d); >> + d->mit_timer = qemu_new_timer_ns(vm_clock, e1000_mit_timer, d); >> >> return 0; >> } >> @@ -1385,6 +1508,8 @@ static Property e1000_properties[] = { >> DEFINE_NIC_PROPERTIES(E1000State, conf), >> DEFINE_PROP_BIT("autonegotiation", E1000State, >> compat_flags, E1000_FLAG_AUTONEG_BIT, true), >> + DEFINE_PROP_BIT("mitigation", E1000State, >> + compat_flags, E1000_FLAG_MIT_BIT, true), >> DEFINE_PROP_END_OF_LIST(), >> }; >> >> diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h >> index 3a0c4e3..894c124 100644 >> --- a/include/hw/i386/pc.h >> +++ b/include/hw/i386/pc.h >> @@ -214,7 +214,15 @@ void pvpanic_init(ISABus *bus); >> >> int e820_add_entry(uint64_t, uint64_t, uint32_t); >> >> +#define PC_COMPAT_1_6 \ >> + {\ >> + .driver = "e1000",\ >> + .property = "mitigation",\ >> + .value = "off",\ >> + } >> + >> #define PC_COMPAT_1_5 \ >> + PC_COMPAT_1_6, \ >> {\ >> .driver = "Conroe-" TYPE_X86_CPU,\ >> .property = "model",\ >> -- >> 1.8.3.4 >> >> >> > > -- Alexey