date:20250315

[PATCH 11/13] arch, mm: streamline HIGHMEM freeing

2025-03-15 Thread Mike Rapoport

From: "Mike Rapoport (Microsoft)" 

All architectures that support HIGHMEM have their code that frees high
memory pages to the buddy allocator while __free_memory_core() is limited
to freeing only low memory.

There is no actual reason for that. The memory map is completely ready
by the time memblock_free_all() is called and high pages can be released to
the buddy allocator along with low memory.

Remove low memory limit from __free_memory_core() and drop per-architecture
code that frees high memory pages.

Signed-off-by: Mike Rapoport (Microsoft) 
---
 arch/arc/mm/init.c |  6 +-
 arch/arm/mm/init.c | 29 -
 arch/csky/mm/init.c| 14 --
 arch/microblaze/mm/init.c  | 16 
 arch/mips/mm/init.c| 20 
 arch/powerpc/mm/mem.c  | 14 --
 arch/sparc/mm/init_32.c| 25 -
 arch/x86/include/asm/highmem.h |  3 ---
 arch/x86/include/asm/numa.h|  4 
 arch/x86/include/asm/numa_32.h | 13 -
 arch/x86/mm/Makefile   |  2 --
 arch/x86/mm/highmem_32.c   | 34 --
 arch/x86/mm/init_32.c  | 28 
 arch/xtensa/mm/init.c  | 29 -
 include/linux/mm.h |  1 -
 mm/memblock.c  |  3 +--
 16 files changed, 2 insertions(+), 239 deletions(-)
 delete mode 100644 arch/x86/include/asm/numa_32.h
 delete mode 100644 arch/x86/mm/highmem_32.c

diff --git a/arch/arc/mm/init.c b/arch/arc/mm/init.c
index 05025122e965..11ce638731c9 100644
--- a/arch/arc/mm/init.c
+++ b/arch/arc/mm/init.c
@@ -160,11 +160,7 @@ void __init setup_arch_memory(void)
 static void __init highmem_init(void)
 {
 #ifdef CONFIG_HIGHMEM
-   unsigned long tmp;
-
memblock_phys_free(high_mem_start, high_mem_sz);
-   for (tmp = min_high_pfn; tmp < max_high_pfn; tmp++)
-   free_highmem_page(pfn_to_page(tmp));
 #endif
 }
 
@@ -176,8 +172,8 @@ static void __init highmem_init(void)
  */
 void __init mem_init(void)
 {
-   memblock_free_all();
highmem_init();
+   memblock_free_all();
 
BUILD_BUG_ON((PTRS_PER_PGD * sizeof(pgd_t)) > PAGE_SIZE);
BUILD_BUG_ON((PTRS_PER_PUD * sizeof(pud_t)) > PAGE_SIZE);
diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
index d4bcc745a044..7bb5ce02b9b5 100644
--- a/arch/arm/mm/init.c
+++ b/arch/arm/mm/init.c
@@ -237,33 +237,6 @@ static inline void poison_init_mem(void *s, size_t count)
*p++ = 0xe7fddef0;
 }
 
-static void __init free_highpages(void)
-{
-#ifdef CONFIG_HIGHMEM
-   unsigned long max_low = max_low_pfn;
-   phys_addr_t range_start, range_end;
-   u64 i;
-
-   /* set highmem page free */
-   for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE,
-   &range_start, &range_end, NULL) {
-   unsigned long start = PFN_UP(range_start);
-   unsigned long end = PFN_DOWN(range_end);
-
-   /* Ignore complete lowmem entries */
-   if (end <= max_low)
-   continue;
-
-   /* Truncate partial highmem entries */
-   if (start < max_low)
-   start = max_low;
-
-   for (; start < end; start++)
-   free_highmem_page(pfn_to_page(start));
-   }
-#endif
-}
-
 /*
  * mem_init() marks the free areas in the mem_map and tells us how much
  * memory is free.  This is done after various parts of the system have
@@ -283,8 +256,6 @@ void __init mem_init(void)
/* this will put all unused low memory onto the freelists */
memblock_free_all();
 
-   free_highpages();
-
/*
 * Check boundaries twice: Some fundamental inconsistencies can
 * be detected at build time already.
diff --git a/arch/csky/mm/init.c b/arch/csky/mm/init.c
index a22801aa503a..3914c2b873da 100644
--- a/arch/csky/mm/init.c
+++ b/arch/csky/mm/init.c
@@ -44,21 +44,7 @@ EXPORT_SYMBOL(empty_zero_page);
 
 void __init mem_init(void)
 {
-#ifdef CONFIG_HIGHMEM
-   unsigned long tmp;
-#endif
-
memblock_free_all();
-
-#ifdef CONFIG_HIGHMEM
-   for (tmp = highstart_pfn; tmp < highend_pfn; tmp++) {
-   struct page *page = pfn_to_page(tmp);
-
-   /* FIXME not sure about */
-   if (!memblock_is_reserved(tmp << PAGE_SHIFT))
-   free_highmem_page(page);
-   }
-#endif
 }
 
 void free_initmem(void)
diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c
index 7e2e342e84c5..3e664e0efc33 100644
--- a/arch/microblaze/mm/init.c
+++ b/arch/microblaze/mm/init.c
@@ -52,19 +52,6 @@ static void __init highmem_init(void)
map_page(PKMAP_BASE, 0, 0); /* XXX gross */
pkmap_page_table = virt_to_kpte(PKMAP_BASE);
 }
-
-static void __meminit highmem_setup(void)
-{
-   unsigned long pfn;
-
-

[PATCH 09/13] arch, mm: set max_mapnr when allocating memory map for FLATMEM

2025-03-15 Thread Mike Rapoport

From: "Mike Rapoport (Microsoft)" 

max_mapnr is essentially the size of the memory map for systems that use
FLATMEM. There is no reason to calculate it in each and every architecture
when it's anyway calculated in alloc_node_mem_map().

Drop setting of max_mapnr from architecture code and set it once in
alloc_node_mem_map().

While on it, move definition of mem_map and max_mapnr to mm/mm_init.c so
there won't be two copies for MMU and !MMU variants.

Signed-off-by: Mike Rapoport (Microsoft) 
---
 arch/alpha/mm/init.c   |  1 -
 arch/arc/mm/init.c |  5 -
 arch/arm/mm/init.c |  2 --
 arch/csky/mm/init.c|  4 
 arch/loongarch/mm/init.c   |  1 -
 arch/microblaze/mm/init.c  |  4 
 arch/mips/mm/init.c|  8 
 arch/nios2/kernel/setup.c  |  1 -
 arch/nios2/mm/init.c   |  2 +-
 arch/openrisc/mm/init.c|  1 -
 arch/parisc/mm/init.c  |  1 -
 arch/powerpc/kernel/setup-common.c |  2 --
 arch/riscv/mm/init.c   |  1 -
 arch/s390/mm/init.c|  1 -
 arch/sh/mm/init.c  |  1 -
 arch/sparc/mm/init_32.c|  1 -
 arch/um/include/shared/mem_user.h  |  1 -
 arch/um/kernel/physmem.c   | 12 
 arch/um/kernel/um_arch.c   |  1 -
 arch/x86/mm/init_32.c  |  3 ---
 arch/xtensa/mm/init.c  |  1 -
 include/asm-generic/memory_model.h |  5 +++--
 include/linux/mm.h | 11 ---
 mm/memory.c|  8 
 mm/mm_init.c   | 25 +
 mm/nommu.c |  4 
 26 files changed, 21 insertions(+), 86 deletions(-)

diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c
index 61c2198b1359..ec0eeae9c653 100644
--- a/arch/alpha/mm/init.c
+++ b/arch/alpha/mm/init.c
@@ -276,7 +276,6 @@ srm_paging_stop (void)
 void __init
 mem_init(void)
 {
-   set_max_mapnr(max_low_pfn);
high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
memblock_free_all();
 }
diff --git a/arch/arc/mm/init.c b/arch/arc/mm/init.c
index 6a71b23f1383..7ef883d58dc1 100644
--- a/arch/arc/mm/init.c
+++ b/arch/arc/mm/init.c
@@ -154,11 +154,6 @@ void __init setup_arch_memory(void)
 
arch_pfn_offset = min(min_low_pfn, min_high_pfn);
kmap_init();
-
-#else /* CONFIG_HIGHMEM */
-   /* pfn_valid() uses this when FLATMEM=y and HIGHMEM=n */
-   max_mapnr = max_low_pfn - min_low_pfn;
-
 #endif /* CONFIG_HIGHMEM */
 
free_area_init(max_zone_pfn);
diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
index 9aec1cb2386f..d4bcc745a044 100644
--- a/arch/arm/mm/init.c
+++ b/arch/arm/mm/init.c
@@ -275,8 +275,6 @@ void __init mem_init(void)
swiotlb_init(max_pfn > arm_dma_pfn_limit, SWIOTLB_VERBOSE);
 #endif
 
-   set_max_mapnr(pfn_to_page(max_pfn) - mem_map);
-
 #ifdef CONFIG_SA1111
/* now that our DMA memory is actually so designated, we can free it */
memblock_phys_free(PHYS_OFFSET, __pa(swapper_pg_dir) - PHYS_OFFSET);
diff --git a/arch/csky/mm/init.c b/arch/csky/mm/init.c
index ab51acbc19b2..ba6694d6170a 100644
--- a/arch/csky/mm/init.c
+++ b/arch/csky/mm/init.c
@@ -46,10 +46,6 @@ void __init mem_init(void)
 {
 #ifdef CONFIG_HIGHMEM
unsigned long tmp;
-
-   set_max_mapnr(highend_pfn - ARCH_PFN_OFFSET);
-#else
-   set_max_mapnr(max_low_pfn - ARCH_PFN_OFFSET);
 #endif
high_memory = (void *) __va(max_low_pfn << PAGE_SHIFT);
 
diff --git a/arch/loongarch/mm/init.c b/arch/loongarch/mm/init.c
index ca5aa5f46a9f..00449df50db1 100644
--- a/arch/loongarch/mm/init.c
+++ b/arch/loongarch/mm/init.c
@@ -78,7 +78,6 @@ void __init paging_init(void)
 
 void __init mem_init(void)
 {
-   max_mapnr = max_low_pfn;
high_memory = (void *) __va(max_low_pfn << PAGE_SHIFT);
 
memblock_free_all();
diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c
index 4520c5741579..857cd2b44bcf 100644
--- a/arch/microblaze/mm/init.c
+++ b/arch/microblaze/mm/init.c
@@ -104,17 +104,13 @@ void __init setup_memory(void)
 *
 * min_low_pfn - the first page (mm/bootmem.c - node_boot_start)
 * max_low_pfn
-* max_mapnr - the first unused page (mm/bootmem.c - node_low_pfn)
 */
 
/* memory start is from the kernel end (aligned) to higher addr */
min_low_pfn = memory_start >> PAGE_SHIFT; /* minimum for allocation */
-   /* RAM is assumed contiguous */
-   max_mapnr = memory_size >> PAGE_SHIFT;
max_low_pfn = ((u64)memory_start + (u64)lowmem_size) >> PAGE_SHIFT;
max_pfn = ((u64)memory_start + (u64)memory_size) >> PAGE_SHIFT;
 
-   pr_info("%s: max_mapnr: %#lx\n", __func__, max_mapnr);
pr_info("%s: min_low_pfn: %#lx\n", __func__, min_low_pfn);
pr_info("%s: max_low_pfn: %#lx\n", __func__, max_low_pfn);
pr_info("%s: max_pfn: %#lx\n", __func__, max_pfn);
diff --git

[PATCH 0/3] um: Add VFIO-based PCI passthrough support

2025-03-15 Thread Tiwei Bie

This patchset adds a new virt-pci driver to UML that allows users to
pass through host PCI devices to UML via VFIO. Currently, only MSI-X
capable devices are supported, and drivers are assumed to use MSI-X.

This driver has been tested with virtio-net-pci, virtio-blk-pci and
nvme in a QEMU virtual machine with virtual IOMMU enabled. Regression
testing for virtio_pcidev has only covered compile tests and partial
virtio probe/remove tests with a dummy vhost-user backend.

Here are some steps to try out this driver:

1. Enable IOMMU and bind PCI devices to the vfio-pci driver.

2. Launch a UML instance directly (assuming that the PCI devices to
   be passed through to UML are 0000:02:00.0 and 0000:03:00.0).

 $ ./linux mem=2G init=/bin/sh ubd0=$your_rootfs_image \
   vfio_uml.device=0000:02:00.0 \
   vfio_uml.device=0000:03:00.0

Note that, currently, it's not possible to allocate a large amount of
"physical" memory to the UML instance; otherwise, it may overlap with
IOMMU-reserved IOVA ranges (e.g. [0xfee00000, 0xfef00000)), causing
DMA mapping failures.

This patchset is based on the following patchset:
https://lore.kernel.org/all/20250306150747.2926434-1-tiwei@antgroup.com/

v1:
- Add more details in the commit log;

RFC: 
https://lore.kernel.org/all/20250310074057.3977758-1-tiwei@antgroup.com/

Tiwei Bie (3):
  um: Rewrite the sigio workaround based on epoll and tgkill
  um: virt-pci: Refactor virtio_pcidev into its own module
  um: Add VFIO-based virtual PCI driver

 arch/um/drivers/Kconfig |  20 +-
 arch/um/drivers/Makefile|   5 +-
 arch/um/drivers/random.c|   2 +-
 arch/um/drivers/rtc_user.c  |   2 +-
 arch/um/drivers/vfio_kern.c | 648 +
 arch/um/drivers/vfio_user.c | 323 +++
 arch/um/drivers/vfio_user.h |  44 ++
 arch/um/drivers/virt-pci.c  | 699 ++--
 arch/um/drivers/virt-pci.h  |  41 ++
 arch/um/drivers/virtio_pcidev.c | 628 
 arch/um/include/shared/os.h |   2 +-
 arch/um/include/shared/sigio.h  |   1 -
 arch/um/kernel/sigio.c  |  26 --
 arch/um/os-Linux/sigio.c| 330 ++-
 14 files changed, 1866 insertions(+), 905 deletions(-)
 create mode 100644 arch/um/drivers/vfio_kern.c
 create mode 100644 arch/um/drivers/vfio_user.c
 create mode 100644 arch/um/drivers/vfio_user.h
 create mode 100644 arch/um/drivers/virt-pci.h
 create mode 100644 arch/um/drivers/virtio_pcidev.c

-- 
2.34.1

[PATCH 2/3] um: virt-pci: Refactor virtio_pcidev into its own module

2025-03-15 Thread Tiwei Bie

Decouple virt-pci and virtio_pcidev, refactoring virtio_pcidev into
its own module. Define a set of APIs for virt-pci. This allows for
future addition of more PCI emulation implementations.

Signed-off-by: Tiwei Bie 
---
 arch/um/drivers/Kconfig |  12 +-
 arch/um/drivers/Makefile|   3 +-
 arch/um/drivers/virt-pci.c  | 699 ++--
 arch/um/drivers/virt-pci.h  |  41 ++
 arch/um/drivers/virtio_pcidev.c | 628 
 5 files changed, 794 insertions(+), 589 deletions(-)
 create mode 100644 arch/um/drivers/virt-pci.h
 create mode 100644 arch/um/drivers/virtio_pcidev.c

diff --git a/arch/um/drivers/Kconfig b/arch/um/drivers/Kconfig
index ede40a160c5e..9cb196070614 100644
--- a/arch/um/drivers/Kconfig
+++ b/arch/um/drivers/Kconfig
@@ -345,16 +345,20 @@ config UML_RTC
  by providing a fake RTC clock that causes a wakeup at the right
  time.
 
-config UML_PCI_OVER_VIRTIO
-   bool "Enable PCI over VIRTIO device simulation"
-   # in theory, just VIRTIO is enough, but that causes recursion
-   depends on VIRTIO_UML
+config UML_PCI
+   bool
select FORCE_PCI
select UML_IOMEM_EMULATION
select UML_DMA_EMULATION
select PCI_MSI
select PCI_LOCKLESS_CONFIG
 
+config UML_PCI_OVER_VIRTIO
+   bool "Enable PCI over VIRTIO device simulation"
+   # in theory, just VIRTIO is enough, but that causes recursion
+   depends on VIRTIO_UML
+   select UML_PCI
+
 config UML_PCI_OVER_VIRTIO_DEVICE_ID
int "set the virtio device ID for PCI emulation"
default -1
diff --git a/arch/um/drivers/Makefile b/arch/um/drivers/Makefile
index 57882e6bc215..0a5820343ad3 100644
--- a/arch/um/drivers/Makefile
+++ b/arch/um/drivers/Makefile
@@ -60,7 +60,8 @@ obj-$(CONFIG_BLK_DEV_COW_COMMON) += cow_user.o
 obj-$(CONFIG_UML_RANDOM) += random.o
 obj-$(CONFIG_VIRTIO_UML) += virtio_uml.o
 obj-$(CONFIG_UML_RTC) += rtc.o
-obj-$(CONFIG_UML_PCI_OVER_VIRTIO) += virt-pci.o
+obj-$(CONFIG_UML_PCI) += virt-pci.o
+obj-$(CONFIG_UML_PCI_OVER_VIRTIO) += virtio_pcidev.o
 
 # pcap_user.o must be added explicitly.
 USER_OBJS := fd.o null.o pty.o tty.o xterm.o slip_common.o vde_user.o vector_user.o
diff --git a/arch/um/drivers/virt-pci.c b/arch/um/drivers/virt-pci.c
index dd5580f975cc..b83b5a765d4e 100644
--- a/arch/um/drivers/virt-pci.c
+++ b/arch/um/drivers/virt-pci.c
@@ -5,52 +5,19 @@
  */
 #include 
 #include 
-#include 
-#include 
 #include 
 #include 
 #include 
-#include 
-#include 
-#include 
 #include 
 #include 
 #include 
 
+#include "virt-pci.h"
+
 #define MAX_DEVICES 8
 #define MAX_MSI_VECTORS 32
 #define CFG_SPACE_SIZE 4096
 
-/* for MSI-X we have a 32-bit payload */
-#define MAX_IRQ_MSG_SIZE (sizeof(struct virtio_pcidev_msg) + sizeof(u32))
-#define NUM_IRQ_MSGS   10
-
-struct um_pci_message_buffer {
-   struct virtio_pcidev_msg hdr;
-   u8 data[8];
-};
-
-struct um_pci_device {
-   struct virtio_device *vdev;
-
-   /* for now just standard BARs */
-   u8 resptr[PCI_STD_NUM_BARS];
-
-   struct virtqueue *cmd_vq, *irq_vq;
-
-#define UM_PCI_WRITE_BUFS  20
-   struct um_pci_message_buffer bufs[UM_PCI_WRITE_BUFS + 1];
-   void *extra_ptrs[UM_PCI_WRITE_BUFS + 1];
-   DECLARE_BITMAP(used_bufs, UM_PCI_WRITE_BUFS);
-
-#define UM_PCI_STAT_WAITING0
-   unsigned long status;
-
-   int irq;
-
-   bool platform;
-};
-
 struct um_pci_device_reg {
struct um_pci_device *dev;
void __iomem *iomem;
@@ -65,179 +32,15 @@ static struct irq_domain *um_pci_inner_domain;
 static struct irq_domain *um_pci_msi_domain;
 static unsigned long um_pci_msi_used[BITS_TO_LONGS(MAX_MSI_VECTORS)];
 
-static unsigned int um_pci_max_delay_us = 40000;
-module_param_named(max_delay_us, um_pci_max_delay_us, uint, 0644);
-
-static int um_pci_get_buf(struct um_pci_device *dev, bool *posted)
-{
-   int i;
-
-   for (i = 0; i < UM_PCI_WRITE_BUFS; i++) {
-   if (!test_and_set_bit(i, dev->used_bufs))
-   return i;
-   }
-
-   *posted = false;
-   return UM_PCI_WRITE_BUFS;
-}
-
-static void um_pci_free_buf(struct um_pci_device *dev, void *buf)
-{
-   int i;
-
-   if (buf == &dev->bufs[UM_PCI_WRITE_BUFS]) {
-   kfree(dev->extra_ptrs[UM_PCI_WRITE_BUFS]);
-   dev->extra_ptrs[UM_PCI_WRITE_BUFS] = NULL;
-   return;
-   }
-
-   for (i = 0; i < UM_PCI_WRITE_BUFS; i++) {
-   if (buf == &dev->bufs[i]) {
-   kfree(dev->extra_ptrs[i]);
-   dev->extra_ptrs[i] = NULL;
-   WARN_ON(!test_and_clear_bit(i, dev->used_bufs));
-   return;
-   }
-   }
-
-   WARN_ON(1);
-}
-
-static int um_pci_send_cmd(struct um_pci_device *dev,
-  struct virtio_pcidev_msg *cmd,
-  unsigned int cmd_size,
-  const void *extra, un

[PATCH 3/3] um: Add VFIO-based virtual PCI driver

2025-03-15 Thread Tiwei Bie

Implement a new virtual PCI driver based on the VFIO framework.
This driver allows users to pass through PCI devices to UML via
VFIO. Currently, only MSI-X capable devices are supported, and
it is assumed that drivers will use MSI-X.

Signed-off-by: Tiwei Bie 
---
 arch/um/drivers/Kconfig |   8 +
 arch/um/drivers/Makefile|   2 +
 arch/um/drivers/vfio_kern.c | 648 
 arch/um/drivers/vfio_user.c | 323 ++
 arch/um/drivers/vfio_user.h |  44 +++
 5 files changed, 1025 insertions(+)
 create mode 100644 arch/um/drivers/vfio_kern.c
 create mode 100644 arch/um/drivers/vfio_user.c
 create mode 100644 arch/um/drivers/vfio_user.h

diff --git a/arch/um/drivers/Kconfig b/arch/um/drivers/Kconfig
index 9cb196070614..d7bb447ff958 100644
--- a/arch/um/drivers/Kconfig
+++ b/arch/um/drivers/Kconfig
@@ -367,3 +367,11 @@ config UML_PCI_OVER_VIRTIO_DEVICE_ID
  There's no official device ID assigned (yet), set the one you
  wish to use for experimentation here. The default of -1 is
  not valid and will cause the driver to fail at probe.
+
+config UML_PCI_OVER_VFIO
+   bool "Enable VFIO-based PCI passthrough"
+   select UML_PCI
+   help
+ This driver provides support for VFIO-based PCI passthrough.
+ Currently, only MSI-X capable devices are supported, and it
+ is assumed that drivers will use MSI-X.
diff --git a/arch/um/drivers/Makefile b/arch/um/drivers/Makefile
index 0a5820343ad3..336be56b8975 100644
--- a/arch/um/drivers/Makefile
+++ b/arch/um/drivers/Makefile
@@ -19,6 +19,7 @@ port-objs := port_kern.o port_user.o
 harddog-objs := harddog_kern.o
 harddog-builtin-$(CONFIG_UML_WATCHDOG) := harddog_user.o harddog_user_exp.o
 rtc-objs := rtc_kern.o rtc_user.o
+vfio_uml-objs := vfio_kern.o vfio_user.o
 
 LDFLAGS_vde.o = $(shell $(CC) $(CFLAGS) -print-file-name=libvdeplug.a)
 
@@ -62,6 +63,7 @@ obj-$(CONFIG_VIRTIO_UML) += virtio_uml.o
 obj-$(CONFIG_UML_RTC) += rtc.o
 obj-$(CONFIG_UML_PCI) += virt-pci.o
 obj-$(CONFIG_UML_PCI_OVER_VIRTIO) += virtio_pcidev.o
+obj-$(CONFIG_UML_PCI_OVER_VFIO) += vfio_uml.o
 
 # pcap_user.o must be added explicitly.
 USER_OBJS := fd.o null.o pty.o tty.o xterm.o slip_common.o vde_user.o vector_user.o
diff --git a/arch/um/drivers/vfio_kern.c b/arch/um/drivers/vfio_kern.c
new file mode 100644
index 000000000000..805f589a568d
--- /dev/null
+++ b/arch/um/drivers/vfio_kern.c
@@ -0,0 +1,648 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2025 Ant Group
+ * Author: Tiwei Bie 
+ */
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "virt-pci.h"
+#include "vfio_user.h"
+
+#define MAX_GROUPS 8
+#define MAX_DEVICES8
+
+#define to_vdev(_pdev) container_of(_pdev, struct uml_vfio_device, pdev)
+
+struct uml_vfio_intr_ctx {
+   struct uml_vfio_device *dev;
+   int irq;
+};
+
+struct uml_vfio_device {
+   const char *name;
+   int group;
+
+   struct um_pci_device pdev;
+   struct uml_vfio_user_device udev;
+   struct uml_vfio_intr_ctx *intr_ctx;
+
+   int msix_cap;
+   int msix_bar;
+   int msix_offset;
+   int msix_size;
+   u32 *msix_data;
+};
+
+static struct {
+   int fd;
+   int users;
+} uml_vfio_container;
+static DEFINE_MUTEX(uml_vfio_container_mtx);
+
+static struct {
+   int id;
+   int fd;
+   int users;
+} uml_vfio_groups[MAX_GROUPS];
+static DEFINE_MUTEX(uml_vfio_groups_mtx);
+
+static struct uml_vfio_device *uml_vfio_devices[MAX_DEVICES];
+
+static int uml_vfio_open_container(void)
+{
+   int fd;
+
+   fd = uml_vfio_user_open_container();
+   if (fd < 0)
+   return fd;
+
+   uml_vfio_container.fd = fd;
+   return 0;
+}
+
+static void uml_vfio_release_container(void)
+{
+   os_close_file(uml_vfio_container.fd);
+}
+
+static int uml_vfio_set_container(int group_fd)
+{
+   int err;
+
+   guard(mutex)(&uml_vfio_container_mtx);
+
+   err = uml_vfio_user_set_container(uml_vfio_container.fd, group_fd);
+   if (err)
+   return err;
+
+   uml_vfio_container.users++;
+   if (uml_vfio_container.users > 1)
+   return 0;
+
+   err = uml_vfio_user_setup_iommu(uml_vfio_container.fd);
+   if (err) {
+   uml_vfio_user_unset_container(uml_vfio_container.fd, group_fd);
+   uml_vfio_container.users--;
+   }
+   return err;
+}
+
+static void uml_vfio_unset_container(int group_fd)
+{
+   guard(mutex)(&uml_vfio_container_mtx);
+
+   uml_vfio_user_unset_container(uml_vfio_container.fd, group_fd);
+   uml_vfio_container.users--;
+}
+
+static int uml_vfio_open_group(int group_id)
+{
+   int free = -1, err, fd, i;
+
+   guard(mutex)(&uml_vfio_groups_mtx);
+
+   for (i = 0; i < MAX_GROUPS; i++) {
+   if (uml_vfio_groups[i].users > 0 &&
+   uml_vfio_groups[i].id == group_id) {
+   uml_v

[PATCH 1/3] um: Rewrite the sigio workaround based on epoll and tgkill

2025-03-15 Thread Tiwei Bie

The existing sigio workaround implementation removes FDs from the
poll when events are triggered, requiring users to re-add them via
add_sigio_fd() after processing. This introduces a potential race
condition between FD removal in write_sigio_thread() and next_poll
update in __add_sigio_fd(), and is inefficient due to frequent FD
removal and re-addition. Rewrite the implementation based on epoll
and tgkill for improved efficiency and reliability.

Signed-off-by: Tiwei Bie 
---
 arch/um/drivers/random.c   |   2 +-
 arch/um/drivers/rtc_user.c |   2 +-
 arch/um/include/shared/os.h|   2 +-
 arch/um/include/shared/sigio.h |   1 -
 arch/um/kernel/sigio.c |  26 ---
 arch/um/os-Linux/sigio.c   | 330 +
 6 files changed, 47 insertions(+), 316 deletions(-)

diff --git a/arch/um/drivers/random.c b/arch/um/drivers/random.c
index da985e0dc69a..ca08c91f47a3 100644
--- a/arch/um/drivers/random.c
+++ b/arch/um/drivers/random.c
@@ -79,7 +79,7 @@ static int __init rng_init (void)
if (err < 0)
goto err_out_cleanup_hw;
 
-   sigio_broken(random_fd);
+   sigio_broken();
hwrng.name = RNG_MODULE_NAME;
hwrng.read = rng_dev_read;
 
diff --git a/arch/um/drivers/rtc_user.c b/arch/um/drivers/rtc_user.c
index 7c3cec4c68cf..51e79f3148cd 100644
--- a/arch/um/drivers/rtc_user.c
+++ b/arch/um/drivers/rtc_user.c
@@ -39,7 +39,7 @@ int uml_rtc_start(bool timetravel)
}
 
/* apparently timerfd won't send SIGIO, use workaround */
-   sigio_broken(uml_rtc_irq_fds[0]);
+   sigio_broken();
err = add_sigio_fd(uml_rtc_irq_fds[0]);
if (err < 0) {
close(uml_rtc_irq_fds[0]);
diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h
index c4f8f990ffb8..9a146912dd75 100644
--- a/arch/um/include/shared/os.h
+++ b/arch/um/include/shared/os.h
@@ -315,7 +315,7 @@ extern void um_irqs_resume(void);
 extern int add_sigio_fd(int fd);
 extern int ignore_sigio_fd(int fd);
 extern void maybe_sigio_broken(int fd);
-extern void sigio_broken(int fd);
+extern void sigio_broken(void);
 /*
  * unlocked versions for IRQ controller code.
  *
diff --git a/arch/um/include/shared/sigio.h b/arch/um/include/shared/sigio.h
index e60c8b227844..c6c2edce1f6d 100644
--- a/arch/um/include/shared/sigio.h
+++ b/arch/um/include/shared/sigio.h
@@ -6,7 +6,6 @@
 #ifndef __SIGIO_H__
 #define __SIGIO_H__
 
-extern int write_sigio_irq(int fd);
 extern void sigio_lock(void);
 extern void sigio_unlock(void);
 
diff --git a/arch/um/kernel/sigio.c b/arch/um/kernel/sigio.c
index 5085a50c3b8c..4fc04742048a 100644
--- a/arch/um/kernel/sigio.c
+++ b/arch/um/kernel/sigio.c
@@ -8,32 +8,6 @@
 #include 
 #include 
 
-/* Protected by sigio_lock() called from write_sigio_workaround */
-static int sigio_irq_fd = -1;
-
-static irqreturn_t sigio_interrupt(int irq, void *data)
-{
-   char c;
-
-   os_read_file(sigio_irq_fd, &c, sizeof(c));
-   return IRQ_HANDLED;
-}
-
-int write_sigio_irq(int fd)
-{
-   int err;
-
-   err = um_request_irq(SIGIO_WRITE_IRQ, fd, IRQ_READ, sigio_interrupt,
-0, "write sigio", NULL);
-   if (err < 0) {
-   printk(KERN_ERR "write_sigio_irq : um_request_irq failed, "
-  "err = %d\n", err);
-   return -1;
-   }
-   sigio_irq_fd = fd;
-   return 0;
-}
-
 /* These are called from os-Linux/sigio.c to protect its pollfds arrays. */
 static DEFINE_MUTEX(sigio_mutex);
 
diff --git a/arch/um/os-Linux/sigio.c b/arch/um/os-Linux/sigio.c
index 61b348a2ea97..a05a6ecee756 100644
--- a/arch/um/os-Linux/sigio.c
+++ b/arch/um/os-Linux/sigio.c
@@ -11,6 +11,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -23,180 +24,49 @@
  */
 static struct os_helper_thread *write_sigio_td;
 
-/*
- * These arrays are initialized before the sigio thread is started, and
- * the descriptors closed after it is killed.  So, it can't see them change.
- * On the UML side, they are changed under the sigio_lock.
- */
-#define SIGIO_FDS_INIT {-1, -1}
-
-static int write_sigio_fds[2] = SIGIO_FDS_INIT;
-static int sigio_private[2] = SIGIO_FDS_INIT;
+static int epollfd = -1;
 
-struct pollfds {
-   struct pollfd *poll;
-   int size;
-   int used;
-};
+#define MAX_EPOLL_EVENTS 64
 
-/*
- * Protected by sigio_lock().  Used by the sigio thread, but the UML thread
- * synchronizes with it.
- */
-static struct pollfds current_poll;
-static struct pollfds next_poll;
-static struct pollfds all_sigio_fds;
+static struct epoll_event epoll_events[MAX_EPOLL_EVENTS];
 
 static void *write_sigio_thread(void *unused)
 {
-   struct pollfds *fds, tmp;
-   struct pollfd *p;
-   int i, n, respond_fd;
-   char c;
+   int pid = getpid();
+   int r;
 
os_fix_helper_thread_signals();
 
-   fds = &current_poll;
while (1) {
-

[PATCH] um: work around sched_yield not yielding in time-travel mode

2025-03-15 Thread Benjamin Berg

From: Benjamin Berg 

sched_yield by a userspace may not actually cause scheduling in
time-travel mode as no time has passed. In the case seen it appears to
be a badly implemented userspace spinlock in ASAN. Unfortunately, with
time-travel it causes an extreme slowdown or even deadlock depending on
the kernel configuration (CONFIG_UML_MAX_USERSPACE_ITERATIONS).

Work around it by accounting time to the process whenever it executes a
sched_yield syscall.

Signed-off-by: Benjamin Berg 

---

I suspect it is this code in ASAN that uses sched_yield
  
https://github.com/llvm/llvm-project/blob/main/compiler-rt/lib/sanitizer_common/sanitizer_mutex.cpp
though there are also some other places that use sched_yield.

I doubt that code is reasonable. At the same time, not sure that
sched_yield is behaving as advertised either as it obviously is not
necessarily relinquishing the CPU.
---
 arch/um/include/linux/time-internal.h |  2 ++
 arch/um/kernel/skas/syscall.c | 11 +++
 2 files changed, 13 insertions(+)

diff --git a/arch/um/include/linux/time-internal.h b/arch/um/include/linux/time-internal.h
index b6634ff6..138908b999d7 100644
--- a/arch/um/include/linux/time-internal.h
+++ b/arch/um/include/linux/time-internal.h
@@ -83,6 +83,8 @@ extern void time_travel_not_configured(void);
 #define time_travel_del_event(...) time_travel_not_configured()
 #endif /* CONFIG_UML_TIME_TRAVEL_SUPPORT */
 
+extern unsigned long tt_extra_sched_jiffies;
+
 /*
  * Without CONFIG_UML_TIME_TRAVEL_SUPPORT this is a linker error if used,
  * which is intentional since we really shouldn't link it in that case.
diff --git a/arch/um/kernel/skas/syscall.c b/arch/um/kernel/skas/syscall.c
index b09e85279d2b..a5beaea2967e 100644
--- a/arch/um/kernel/skas/syscall.c
+++ b/arch/um/kernel/skas/syscall.c
@@ -31,6 +31,17 @@ void handle_syscall(struct uml_pt_regs *r)
goto out;
 
syscall = UPT_SYSCALL_NR(r);
+
+   /*
+* If no time passes, then sched_yield may not actually yield, causing
+* broken spinlock implementations in userspace (ASAN) to hang for long
+* periods of time.
+*/
+   if ((time_travel_mode == TT_MODE_INFCPU ||
+time_travel_mode == TT_MODE_EXTERNAL) &&
+   syscall == __NR_sched_yield)
+   tt_extra_sched_jiffies += 1;
+
if (syscall >= 0 && syscall < __NR_syscalls) {
unsigned long ret = EXECUTE_SYSCALL(syscall, regs);
 
-- 
2.48.1

[PATCH 11/13] arch, mm: streamline HIGHMEM freeing

[PATCH 09/13] arch, mm: set max_mapnr when allocating memory map for FLATMEM

[PATCH 0/3] um: Add VFIO-based PCI passthrough support

[PATCH 2/3] um: virt-pci: Refactor virtio_pcidev into its own module

[PATCH 3/3] um: Add VFIO-based virtual PCI driver

[PATCH 1/3] um: Rewrite the sigio workaround based on epoll and tgkill

[PATCH] um: work around sched_yield not yielding in time-travel mode

7 matches

Site Navigation

Mail list logo

Footer information