Hi Alex,
On 2/20/25 11:48 PM, Alex Williamson wrote: > The memory and IO BARs for devices are only accessible in the D0 > power state. In other power states the PCI spec defines that the > device should respond to TLPs and messages with an Unsupported > Request response. The closest we can come to emulating this > behavior is to consider the BARs as unmapped, removing them from > the address space. > > Introduce an interface to register the PM capability such that > the device power state is considered relative to the BAR mapping > state. "the device power state is considered relative to the BAR mapping state." I rather understood that you want the BAR mapping state to depend on the power state but maybe I misunderstand the langage here. > > Cc: Michael S. Tsirkin <m...@redhat.com> > Cc: Marcel Apfelbaum <marcel.apfelb...@gmail.com> > Signed-off-by: Alex Williamson <alex.william...@redhat.com> > --- > hw/pci/pci.c | 83 ++++++++++++++++++++++++++++++++++++- > hw/pci/trace-events | 2 + > include/hw/pci/pci.h | 3 ++ > include/hw/pci/pci_device.h | 3 ++ > 4 files changed, 89 insertions(+), 2 deletions(-) > > diff --git a/hw/pci/pci.c b/hw/pci/pci.c > index 2afa423925c5..4f2554dd63ac 100644 > --- a/hw/pci/pci.c > +++ b/hw/pci/pci.c > @@ -435,6 +435,74 @@ static void pci_msi_trigger(PCIDevice *dev, MSIMessage > msg) > attrs, NULL); > } > > +int pci_pm_init(PCIDevice *d, uint8_t offset, Error **errp) > +{ > + int cap = pci_add_capability(d, PCI_CAP_ID_PM, offset, PCI_PM_SIZEOF, > errp); > + > + if (cap < 0) { > + return cap; > + } > + > + d->pm_cap = cap; > + d->cap_present |= QEMU_PCI_CAP_PM; > + pci_set_word(d->wmask + cap + PCI_PM_CTRL, PCI_PM_CTRL_STATE_MASK); > + > + return cap; > +} > + > +static uint8_t pci_pm_state(PCIDevice *d) > +{ > + uint16_t pmcsr; > + > + if (!(d->cap_present & QEMU_PCI_CAP_PM)) { > + return 0; > + } > + > + pmcsr = pci_get_word(d->config + d->pm_cap + PCI_PM_CTRL); > + > + return pmcsr & PCI_PM_CTRL_STATE_MASK; > +} > + > +static uint8_t pci_pm_update(PCIDevice *d, uint32_t addr, int l, uint8_t old) the old/new terminology sounds weird to me. generally when one updates we pass the new value. > +{ > + uint16_t pmc; > + uint8_t new; > + > + if (!(d->cap_present & QEMU_PCI_CAP_PM) || > + !range_covers_byte(addr, l, d->pm_cap + PCI_PM_CTRL)) { > + return old; > + } > + > + new = pci_pm_state(d); and new sounds to be the current state. pci_pm_update() does a kind of validation of the current value? > + if (new == old) { > + return old; > + } > + > + pmc = pci_get_word(d->config + d->pm_cap + PCI_PM_PMC); > + > + /* > + * Transitions to D1 & D2 are only allowed if supported. Devices may > + * only transition to higher D-states or to D0. The write is dropped > + * for rejected transitions by restoring the old state. > + */ > + if ((!(pmc & PCI_PM_CAP_D1) && new == 1) || > + (!(pmc & PCI_PM_CAP_D2) && new == 2) || > + (old && new && new < old)) { > + pci_word_test_and_clear_mask(d->config + d->pm_cap + PCI_PM_CTRL, > + PCI_PM_CTRL_STATE_MASK); > + pci_word_test_and_set_mask(d->config + d->pm_cap + PCI_PM_CTRL, > + old); > + trace_pci_pm_bad_transition(d->name, pci_dev_bus_num(d), > + PCI_SLOT(d->devfn), PCI_FUNC(d->devfn), > + old, new); the trace does not output that the old state is kept. This may be helpful. > + return old; > + } > + > + trace_pci_pm_transition(d->name, pci_dev_bus_num(d), PCI_SLOT(d->devfn), > + PCI_FUNC(d->devfn), old, new); > + return new; > +} > + > static void pci_reset_regions(PCIDevice *dev) > { > int r; > @@ -474,6 +542,11 @@ static void pci_do_device_reset(PCIDevice *dev) > pci_get_word(dev->wmask + PCI_INTERRUPT_LINE) | > pci_get_word(dev->w1cmask + > PCI_INTERRUPT_LINE)); > dev->config[PCI_CACHE_LINE_SIZE] = 0x0; > + /* Default PM state is D0 */ > + if (dev->cap_present & QEMU_PCI_CAP_PM) { > + pci_word_test_and_clear_mask(dev->config + dev->pm_cap + PCI_PM_CTRL, > + PCI_PM_CTRL_STATE_MASK); > + } > pci_reset_regions(dev); > pci_update_mappings(dev); > > @@ -1598,7 +1671,7 @@ static void pci_update_mappings(PCIDevice *d) > continue; > > new_addr = pci_bar_address(d, i, r->type, r->size); > - if (!d->enabled) { > + if (!d->enabled || pci_pm_state(d)) { > new_addr = PCI_BAR_UNMAPPED; > } > > @@ -1664,6 +1737,7 @@ uint32_t pci_default_read_config(PCIDevice *d, > > void pci_default_write_config(PCIDevice *d, uint32_t addr, uint32_t val_in, > int l) > { > + uint8_t new_pm_state, old_pm_state = pci_pm_state(d); > int i, was_irq_disabled = pci_irq_disabled(d); > uint32_t val = val_in; > > @@ -1676,11 +1750,16 @@ void pci_default_write_config(PCIDevice *d, uint32_t > addr, uint32_t val_in, int > d->config[addr + i] = (d->config[addr + i] & ~wmask) | (val & wmask); > d->config[addr + i] &= ~(val & w1cmask); /* W1C: Write 1 to Clear */ > } > + IIUC the config is first updated and then we check the validity and potentially restore the older value. Couldn't we check the validity before updating d->config? > + new_pm_state = pci_pm_update(d, addr, l, old_pm_state); > + > if (ranges_overlap(addr, l, PCI_BASE_ADDRESS_0, 24) || > ranges_overlap(addr, l, PCI_ROM_ADDRESS, 4) || > ranges_overlap(addr, l, PCI_ROM_ADDRESS1, 4) || > - range_covers_byte(addr, l, PCI_COMMAND)) > + range_covers_byte(addr, l, PCI_COMMAND) || > + !!new_pm_state != !!old_pm_state) { > pci_update_mappings(d); Eric > + } > > if (ranges_overlap(addr, l, PCI_COMMAND, 2)) { > pci_update_irq_disabled(d, was_irq_disabled); > diff --git a/hw/pci/trace-events b/hw/pci/trace-events > index 19643aa8c6b0..811354f29031 100644 > --- a/hw/pci/trace-events > +++ b/hw/pci/trace-events > @@ -1,6 +1,8 @@ > # See docs/devel/tracing.rst for syntax documentation. > > # pci.c > +pci_pm_bad_transition(const char *dev, uint32_t bus, uint32_t slot, uint32_t > func, uint8_t old, uint8_t new) "%s %02x:%02x.%x bad PM transition D%d->D%d" > +pci_pm_transition(const char *dev, uint32_t bus, uint32_t slot, uint32_t > func, uint8_t old, uint8_t new) "%s %02x:%02x.%x PM transition D%d->D%d" > pci_update_mappings_del(const char *dev, uint32_t bus, uint32_t slot, > uint32_t func, int bar, uint64_t addr, uint64_t size) "%s %02x:%02x.%x > %d,0x%"PRIx64"+0x%"PRIx64 > pci_update_mappings_add(const char *dev, uint32_t bus, uint32_t slot, > uint32_t func, int bar, uint64_t addr, uint64_t size) "%s %02x:%02x.%x > %d,0x%"PRIx64"+0x%"PRIx64 > pci_route_irq(int dev_irq, const char *dev_path, int parent_irq, const char > *parent_path) "IRQ %d @%s -> IRQ %d @%s" > diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h > index 4002bbeebde0..c220cc844962 100644 > --- a/include/hw/pci/pci.h > +++ b/include/hw/pci/pci.h > @@ -216,6 +216,8 @@ enum { > QEMU_PCIE_ARI_NEXTFN_1 = (1 << QEMU_PCIE_ARI_NEXTFN_1_BITNR), > #define QEMU_PCIE_EXT_TAG_BITNR 13 > QEMU_PCIE_EXT_TAG = (1 << QEMU_PCIE_EXT_TAG_BITNR), > +#define QEMU_PCI_CAP_PM_BITNR 14 > + QEMU_PCI_CAP_PM = (1 << QEMU_PCI_CAP_PM_BITNR), > }; > > typedef struct PCIINTxRoute { > @@ -676,5 +678,6 @@ static inline void pci_irq_deassert(PCIDevice *pci_dev) > MSIMessage pci_get_msi_message(PCIDevice *dev, int vector); > void pci_set_enabled(PCIDevice *pci_dev, bool state); > void pci_set_power(PCIDevice *pci_dev, bool state); > +int pci_pm_init(PCIDevice *pci_dev, uint8_t offset, Error **errp); > > #endif > diff --git a/include/hw/pci/pci_device.h b/include/hw/pci/pci_device.h > index add208edfabd..345b12eaac1a 100644 > --- a/include/hw/pci/pci_device.h > +++ b/include/hw/pci/pci_device.h > @@ -105,6 +105,9 @@ struct PCIDevice { > /* Capability bits */ > uint32_t cap_present; > > + /* Offset of PM capability in config space */ > + uint8_t pm_cap; > + > /* Offset of MSI-X capability in config space */ > uint8_t msix_cap; >