On Mon, 24 Feb 2025 20:03:56 +0100
Eric Auger <eric.au...@redhat.com> wrote:

> Hi Alex,
> 
> On 2/20/25 11:48 PM, Alex Williamson wrote:
> > The memory and IO BARs for devices are only accessible in the D0
> > power state.  In other power states the PCI spec defines that the
> > device should respond to TLPs and messages with an Unsupported
> > Request response.  The closest we can come to emulating this
> > behavior is to consider the BARs as unmapped, removing them from
> > the address space.
> >
> > Introduce an interface to register the PM capability such that
> > the device power state is considered relative to the BAR mapping
> > state.  
> 
> "the device power state is considered relative to the BAR mapping
> state."
> I rather understood that you want the BAR mapping state to depend on the 
> power state but maybe I misunderstand the langage here.

Yes, for the BAR to be mapped, both the memory enable state of the
command register and the device power state are relevant, they are both
considered.

> > Cc: Michael S. Tsirkin <m...@redhat.com>
> > Cc: Marcel Apfelbaum <marcel.apfelb...@gmail.com>
> > Signed-off-by: Alex Williamson <alex.william...@redhat.com>
> > ---
> >  hw/pci/pci.c                | 83 ++++++++++++++++++++++++++++++++++++-
> >  hw/pci/trace-events         |  2 +
> >  include/hw/pci/pci.h        |  3 ++
> >  include/hw/pci/pci_device.h |  3 ++
> >  4 files changed, 89 insertions(+), 2 deletions(-)
> >
> > diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> > index 2afa423925c5..4f2554dd63ac 100644
> > --- a/hw/pci/pci.c
> > +++ b/hw/pci/pci.c
> > @@ -435,6 +435,74 @@ static void pci_msi_trigger(PCIDevice *dev, MSIMessage 
> > msg)
> >                           attrs, NULL);
> >  }
> >  
> > +int pci_pm_init(PCIDevice *d, uint8_t offset, Error **errp)
> > +{
> > +    int cap = pci_add_capability(d, PCI_CAP_ID_PM, offset, PCI_PM_SIZEOF, 
> > errp);
> > +
> > +    if (cap < 0) {
> > +        return cap;
> > +    }
> > +
> > +    d->pm_cap = cap;
> > +    d->cap_present |= QEMU_PCI_CAP_PM;
> > +    pci_set_word(d->wmask + cap + PCI_PM_CTRL, PCI_PM_CTRL_STATE_MASK);
> > +
> > +    return cap;
> > +}
> > +
> > +static uint8_t pci_pm_state(PCIDevice *d)
> > +{
> > +    uint16_t pmcsr;
> > +
> > +    if (!(d->cap_present & QEMU_PCI_CAP_PM)) {
> > +        return 0;
> > +    }
> > +
> > +    pmcsr = pci_get_word(d->config + d->pm_cap + PCI_PM_CTRL);
> > +
> > +    return pmcsr & PCI_PM_CTRL_STATE_MASK;
> > +}
> > +
> > +static uint8_t pci_pm_update(PCIDevice *d, uint32_t addr, int l, uint8_t 
> > old)  
> the old/new terminology sounds weird to me. generally when one updates
> we pass the new value.

The new state lives in config space, the old state is recorded before
config space is updated.  Otherwise we interfere with or duplicate the
update of config space, which seems error prone and unnecessarily
complicated.  pci_update_irq_disable() uses the same strategy.

> > +{
> > +    uint16_t pmc;
> > +    uint8_t new;
> > +
> > +    if (!(d->cap_present & QEMU_PCI_CAP_PM) ||
> > +        !range_covers_byte(addr, l, d->pm_cap + PCI_PM_CTRL)) {
> > +        return old;
> > +    }
> > +
> > +    new = pci_pm_state(d);  
> and new sounds to be the current state.

The state after the guest config write is first applied, yes.

> pci_pm_update() does a kind of validation of the current value? 

Yes.  The PCI spec indicates that invalid transitions are ignored by
the device.  That's currently all this does, but if a device wanted a
callback to effect some behavior at state change, this would be a
logical place to do that.

> > +    if (new == old) {
> > +        return old;
> > +    }
> > +
> > +    pmc = pci_get_word(d->config + d->pm_cap + PCI_PM_PMC);
> > +
> > +    /*
> > +     * Transitions to D1 & D2 are only allowed if supported.  Devices may
> > +     * only transition to higher D-states or to D0.  The write is dropped
> > +     * for rejected transitions by restoring the old state.
> > +     */
> > +    if ((!(pmc & PCI_PM_CAP_D1) && new == 1) ||
> > +        (!(pmc & PCI_PM_CAP_D2) && new == 2) ||
> > +        (old && new && new < old)) {
> > +        pci_word_test_and_clear_mask(d->config + d->pm_cap + PCI_PM_CTRL,
> > +                                     PCI_PM_CTRL_STATE_MASK);
> > +        pci_word_test_and_set_mask(d->config + d->pm_cap + PCI_PM_CTRL,
> > +                                   old);
> > +        trace_pci_pm_bad_transition(d->name, pci_dev_bus_num(d),
> > +                                    PCI_SLOT(d->devfn), PCI_FUNC(d->devfn),
> > +                                    old, new);  
> the trace does not output that the old state is kept. This may be helpful.

I was trying to keep the trace log brief.  The trace indicates it's a
"bad" transition versus the trace below indicates a successful
transition.  AIUI, most trace logs require some degree of familiarity
with the call path to fully comprehend and it seemed unnecessary to
print the old value twice to explicitly show it was restored.
Suggestions welcome.

> > +        return old;
> > +    }
> > +
> > +    trace_pci_pm_transition(d->name, pci_dev_bus_num(d), 
> > PCI_SLOT(d->devfn),
> > +                            PCI_FUNC(d->devfn), old, new);
> > +    return new;
> > +}
> > +
> >  static void pci_reset_regions(PCIDevice *dev)
> >  {
> >      int r;
> > @@ -474,6 +542,11 @@ static void pci_do_device_reset(PCIDevice *dev)
> >                                pci_get_word(dev->wmask + 
> > PCI_INTERRUPT_LINE) |
> >                                pci_get_word(dev->w1cmask + 
> > PCI_INTERRUPT_LINE));
> >      dev->config[PCI_CACHE_LINE_SIZE] = 0x0;
> > +    /* Default PM state is D0 */
> > +    if (dev->cap_present & QEMU_PCI_CAP_PM) {
> > +        pci_word_test_and_clear_mask(dev->config + dev->pm_cap + 
> > PCI_PM_CTRL,
> > +                                     PCI_PM_CTRL_STATE_MASK);
> > +    }
> >      pci_reset_regions(dev);
> >      pci_update_mappings(dev);
> >  
> > @@ -1598,7 +1671,7 @@ static void pci_update_mappings(PCIDevice *d)
> >              continue;
> >  
> >          new_addr = pci_bar_address(d, i, r->type, r->size);
> > -        if (!d->enabled) {
> > +        if (!d->enabled || pci_pm_state(d)) {
> >              new_addr = PCI_BAR_UNMAPPED;
> >          }
> >  
> > @@ -1664,6 +1737,7 @@ uint32_t pci_default_read_config(PCIDevice *d,
> >  
> >  void pci_default_write_config(PCIDevice *d, uint32_t addr, uint32_t 
> > val_in, int l)
> >  {
> > +    uint8_t new_pm_state, old_pm_state = pci_pm_state(d);
> >      int i, was_irq_disabled = pci_irq_disabled(d);
> >      uint32_t val = val_in;
> >  
> > @@ -1676,11 +1750,16 @@ void pci_default_write_config(PCIDevice *d, 
> > uint32_t addr, uint32_t val_in, int
> >          d->config[addr + i] = (d->config[addr + i] & ~wmask) | (val & 
> > wmask);
> >          d->config[addr + i] &= ~(val & w1cmask); /* W1C: Write 1 to Clear 
> > */
> >      }
> > +  
> IIUC the config is first updated and then we check the validity and
> potentially restore the older value.
> Couldn't we check the validity before updating d->config?

We could, but why?  It's easier to deal with the before and after
values rather than mangle the incoming write value, imo.  I also don't
think the intermediate value is visible to the guest anyway.  Fixing
the value "in flight" would need to take into account the wmask,
duplicating or preempting the code just above.  Thanks,

Alex

> > +    new_pm_state = pci_pm_update(d, addr, l, old_pm_state);
> > +
> >      if (ranges_overlap(addr, l, PCI_BASE_ADDRESS_0, 24) ||
> >          ranges_overlap(addr, l, PCI_ROM_ADDRESS, 4) ||
> >          ranges_overlap(addr, l, PCI_ROM_ADDRESS1, 4) ||
> > -        range_covers_byte(addr, l, PCI_COMMAND))
> > +        range_covers_byte(addr, l, PCI_COMMAND) ||
> > +        !!new_pm_state != !!old_pm_state) {
> >          pci_update_mappings(d);  
> Eric
> > +    }
> >  
> >      if (ranges_overlap(addr, l, PCI_COMMAND, 2)) {
> >          pci_update_irq_disabled(d, was_irq_disabled);
> > diff --git a/hw/pci/trace-events b/hw/pci/trace-events
> > index 19643aa8c6b0..811354f29031 100644
> > --- a/hw/pci/trace-events
> > +++ b/hw/pci/trace-events
> > @@ -1,6 +1,8 @@
> >  # See docs/devel/tracing.rst for syntax documentation.
> >  
> >  # pci.c
> > +pci_pm_bad_transition(const char *dev, uint32_t bus, uint32_t slot, 
> > uint32_t func, uint8_t old, uint8_t new) "%s %02x:%02x.%x bad PM transition 
> > D%d->D%d"
> > +pci_pm_transition(const char *dev, uint32_t bus, uint32_t slot, uint32_t 
> > func, uint8_t old, uint8_t new) "%s %02x:%02x.%x PM transition D%d->D%d"
> >  pci_update_mappings_del(const char *dev, uint32_t bus, uint32_t slot, 
> > uint32_t func, int bar, uint64_t addr, uint64_t size) "%s %02x:%02x.%x 
> > %d,0x%"PRIx64"+0x%"PRIx64
> >  pci_update_mappings_add(const char *dev, uint32_t bus, uint32_t slot, 
> > uint32_t func, int bar, uint64_t addr, uint64_t size) "%s %02x:%02x.%x 
> > %d,0x%"PRIx64"+0x%"PRIx64
> >  pci_route_irq(int dev_irq, const char *dev_path, int parent_irq, const 
> > char *parent_path) "IRQ %d @%s -> IRQ %d @%s"
> > diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
> > index 4002bbeebde0..c220cc844962 100644
> > --- a/include/hw/pci/pci.h
> > +++ b/include/hw/pci/pci.h
> > @@ -216,6 +216,8 @@ enum {
> >      QEMU_PCIE_ARI_NEXTFN_1 = (1 << QEMU_PCIE_ARI_NEXTFN_1_BITNR),
> >  #define QEMU_PCIE_EXT_TAG_BITNR 13
> >      QEMU_PCIE_EXT_TAG = (1 << QEMU_PCIE_EXT_TAG_BITNR),
> > +#define QEMU_PCI_CAP_PM_BITNR 14
> > +    QEMU_PCI_CAP_PM = (1 << QEMU_PCI_CAP_PM_BITNR),
> >  };
> >  
> >  typedef struct PCIINTxRoute {
> > @@ -676,5 +678,6 @@ static inline void pci_irq_deassert(PCIDevice *pci_dev)
> >  MSIMessage pci_get_msi_message(PCIDevice *dev, int vector);
> >  void pci_set_enabled(PCIDevice *pci_dev, bool state);
> >  void pci_set_power(PCIDevice *pci_dev, bool state);
> > +int pci_pm_init(PCIDevice *pci_dev, uint8_t offset, Error **errp);
> >  
> >  #endif
> > diff --git a/include/hw/pci/pci_device.h b/include/hw/pci/pci_device.h
> > index add208edfabd..345b12eaac1a 100644
> > --- a/include/hw/pci/pci_device.h
> > +++ b/include/hw/pci/pci_device.h
> > @@ -105,6 +105,9 @@ struct PCIDevice {
> >      /* Capability bits */
> >      uint32_t cap_present;
> >  
> > +    /* Offset of PM capability in config space */
> > +    uint8_t pm_cap;
> > +
> >      /* Offset of MSI-X capability in config space */
> >      uint8_t msix_cap;
> >    
> 


Reply via email to