On Fri, 15 Sept 2023 at 05:55, Mattias Nissler <mniss...@rivosinc.com> wrote: > > On Thu, Sep 14, 2023 at 8:49 PM Stefan Hajnoczi <stefa...@redhat.com> wrote: > > > > On Thu, Sep 07, 2023 at 06:04:07AM -0700, Mattias Nissler wrote: > > > When DMA memory can't be directly accessed, as is the case when > > > running the device model in a separate process without shareable DMA > > > file descriptors, bounce buffering is used. > > > > > > It is not uncommon for device models to request mapping of several DMA > > > regions at the same time. Examples include: > > > * net devices, e.g. when transmitting a packet that is split across > > > several TX descriptors (observed with igb) > > > * USB host controllers, when handling a packet with multiple data TRBs > > > (observed with xhci) > > > > > > Previously, qemu only provided a single bounce buffer per AddressSpace > > > and would fail DMA map requests while the buffer was already in use. In > > > turn, this would cause DMA failures that ultimately manifest as hardware > > > errors from the guest perspective. > > > > > > This change allocates DMA bounce buffers dynamically instead of > > > supporting only a single buffer. Thus, multiple DMA mappings work > > > correctly also when RAM can't be mmap()-ed. > > > > > > The total bounce buffer allocation size is limited individually for each > > > AddressSpace. The default limit is 4096 bytes, matching the previous > > > maximum buffer size. A new x-max-bounce-buffer-size parameter is > > > provided to configure the limit for PCI devices. 
> > > > > > Signed-off-by: Mattias Nissler <mniss...@rivosinc.com> > > > --- > > > hw/pci/pci.c | 8 ++++ > > > include/exec/memory.h | 14 ++---- > > > include/hw/pci/pci_device.h | 3 ++ > > > softmmu/memory.c | 3 +- > > > softmmu/physmem.c | 94 +++++++++++++++++++++++++------------ > > > 5 files changed, 80 insertions(+), 42 deletions(-) > > > > > > diff --git a/hw/pci/pci.c b/hw/pci/pci.c > > > index 881d774fb6..8c4541b394 100644 > > > --- a/hw/pci/pci.c > > > +++ b/hw/pci/pci.c > > > @@ -85,6 +85,8 @@ static Property pci_props[] = { > > > QEMU_PCIE_ERR_UNC_MASK_BITNR, true), > > > DEFINE_PROP_BIT("x-pcie-ari-nextfn-1", PCIDevice, cap_present, > > > QEMU_PCIE_ARI_NEXTFN_1_BITNR, false), > > > + DEFINE_PROP_SIZE("x-max-bounce-buffer-size", PCIDevice, > > > + max_bounce_buffer_size, 4096), > > > DEFINE_PROP_END_OF_LIST() > > > }; > > > > > > @@ -1208,6 +1210,8 @@ static PCIDevice *do_pci_register_device(PCIDevice > > > *pci_dev, > > > "bus master container", UINT64_MAX); > > > address_space_init(&pci_dev->bus_master_as, > > > &pci_dev->bus_master_container_region, > > > pci_dev->name); > > > + pci_dev->bus_master_as.max_bounce_buffer_size = > > > + pci_dev->max_bounce_buffer_size; > > > > > > if (phase_check(PHASE_MACHINE_READY)) { > > > pci_init_bus_master(pci_dev); > > > @@ -2664,6 +2668,10 @@ static void pci_device_class_init(ObjectClass > > > *klass, void *data) > > > k->unrealize = pci_qdev_unrealize; > > > k->bus_type = TYPE_PCI_BUS; > > > device_class_set_props(k, pci_props); > > > + object_class_property_set_description( > > > + klass, "x-max-bounce-buffer-size", > > > + "Maximum buffer size allocated for bounce buffers used for > > > mapped " > > > + "access to indirect DMA memory"); > > > } > > > > > > static void pci_device_class_base_init(ObjectClass *klass, void *data) > > > diff --git a/include/exec/memory.h b/include/exec/memory.h > > > index 7d68936157..5577542b5e 100644 > > > --- a/include/exec/memory.h > > > +++ b/include/exec/memory.h > > > @@ 
-1081,14 +1081,6 @@ typedef struct AddressSpaceMapClient { > > > QLIST_ENTRY(AddressSpaceMapClient) link; > > > } AddressSpaceMapClient; > > > > > > -typedef struct { > > > - MemoryRegion *mr; > > > - void *buffer; > > > - hwaddr addr; > > > - hwaddr len; > > > - bool in_use; > > > -} BounceBuffer; > > > - > > > /** > > > * struct AddressSpace: describes a mapping of addresses to > > > #MemoryRegion objects > > > */ > > > @@ -1106,8 +1098,10 @@ struct AddressSpace { > > > QTAILQ_HEAD(, MemoryListener) listeners; > > > QTAILQ_ENTRY(AddressSpace) address_spaces_link; > > > > > > - /* Bounce buffer to use for this address space. */ > > > - BounceBuffer bounce; > > > + /* Maximum DMA bounce buffer size used for indirect memory map > > > requests */ > > > + uint64_t max_bounce_buffer_size; > > > + /* Total size of bounce buffers currently allocated, atomically > > > accessed */ > > > + uint64_t bounce_buffer_size; > > > /* List of callbacks to invoke when buffers free up */ > > > QemuMutex map_client_list_lock; > > > QLIST_HEAD(, AddressSpaceMapClient) map_client_list; > > > diff --git a/include/hw/pci/pci_device.h b/include/hw/pci/pci_device.h > > > index d3dd0f64b2..f4027c5379 100644 > > > --- a/include/hw/pci/pci_device.h > > > +++ b/include/hw/pci/pci_device.h > > > @@ -160,6 +160,9 @@ struct PCIDevice { > > > /* ID of standby device in net_failover pair */ > > > char *failover_pair_id; > > > uint32_t acpi_index; > > > + > > > + /* Maximum DMA bounce buffer size used for indirect memory map > > > requests */ > > > + uint64_t max_bounce_buffer_size; > > > }; > > > > > > static inline int pci_intx(PCIDevice *pci_dev) > > > diff --git a/softmmu/memory.c b/softmmu/memory.c > > > index 5c9622c3d6..e02799359c 100644 > > > --- a/softmmu/memory.c > > > +++ b/softmmu/memory.c > > > @@ -3105,7 +3105,8 @@ void address_space_init(AddressSpace *as, > > > MemoryRegion *root, const char *name) > > > as->ioeventfds = NULL; > > > QTAILQ_INIT(&as->listeners); > > > 
QTAILQ_INSERT_TAIL(&address_spaces, as, address_spaces_link); > > > - as->bounce.in_use = false; > > > + as->max_bounce_buffer_size = 4096; > > > + as->bounce_buffer_size = 0; > > > qemu_mutex_init(&as->map_client_list_lock); > > > QLIST_INIT(&as->map_client_list); > > > as->name = g_strdup(name ? name : "anonymous"); > > > diff --git a/softmmu/physmem.c b/softmmu/physmem.c > > > index f40cc564b8..e3d1cf5fba 100644 > > > --- a/softmmu/physmem.c > > > +++ b/softmmu/physmem.c > > > @@ -2926,6 +2926,20 @@ void cpu_flush_icache_range(hwaddr start, hwaddr > > > len) > > > NULL, len, FLUSH_CACHE); > > > } > > > > > > +/* > > > + * A magic value stored in the first 8 bytes of the bounce buffer > > > struct. Used > > > + * to detect illegal pointers passed to address_space_unmap. > > > + */ > > > +#define BOUNCE_BUFFER_MAGIC 0xb4017ceb4ffe12ed > > > + > > > +typedef struct { > > > + uint64_t magic; > > > + MemoryRegion *mr; > > > + hwaddr addr; > > > + size_t len; > > > + uint8_t buffer[]; > > > +} BounceBuffer; > > > + > > > static void > > > address_space_unregister_map_client_do(AddressSpaceMapClient *client) > > > { > > > @@ -2953,7 +2967,7 @@ void address_space_register_map_client(AddressSpace > > > *as, QEMUBH *bh) > > > QLIST_INSERT_HEAD(&as->map_client_list, client, link); > > > /* Write map_client_list before reading bounce_buffer_size. 
*/ > > > smp_mb(); > > > - if (!qatomic_read(&as->bounce.in_use)) { > > > + if (qatomic_read(&as->bounce_buffer_size) < > > > as->max_bounce_buffer_size) { > > > address_space_notify_map_clients_locked(as); > > > } > > > qemu_mutex_unlock(&as->map_client_list_lock); > > > @@ -3081,31 +3095,36 @@ void *address_space_map(AddressSpace *as, > > > RCU_READ_LOCK_GUARD(); > > > fv = address_space_to_flatview(as); > > > mr = flatview_translate(fv, addr, &xlat, &l, is_write, attrs); > > > + memory_region_ref(mr); > > > > > > if (!memory_access_is_direct(mr, is_write)) { > > > - if (qatomic_xchg(&as->bounce.in_use, true)) { > > > + size_t size = qatomic_add_fetch(&as->bounce_buffer_size, l); > > > + if (size > as->max_bounce_buffer_size) { > > > + size_t excess = size - as->max_bounce_buffer_size; > > > + l -= excess; > > > + qatomic_sub(&as->bounce_buffer_size, excess); > > > > I think two threads can race here. as->bounce_buffer_size will be > > corrupted (smaller than it should be) and l will be wrong as well. A > > cmpxchg loop would solve the race. > > Ah, thanks for pointing this out. I think the case you have in mind is this: > 1. Thread A bumps the size to be larger than the max > 2. Thread B bumps the size further > 3. Thread B now computes an incorrect excess and subtracts more than it added.
Yes, that's the case. > I should be good if I ensure that each thread will only subtract what > it has previously added to enforce the invariant that additions and > subtractions for each map/unmap pair cancel each other out. Let me > know if there's a reason to still prefer the cmpxchg loop. Cool, it would be nice to avoid the cmpxchg loop. Stefan