On 15-Oct-20 6:23 PM, David Christensen wrote:
The SPAPR IOMMU requires that a DMA window size be defined before memory
can be mapped for DMA. Current code dynamically modifies the DMA window
size in response to every new memory allocation which is potentially
dangerous because all existing mappings need to be unmapped/remapped in
order to resize the DMA window, leaving hardware holding IOVA addresses
that are temporarily unmapped.  The new SPAPR code statically assigns
the DMA window size on first use, using the largest physical memory
address when IOVA=PA and the highest existing memseg virtual
address when IOVA=VA.

Signed-off-by: David Christensen <d...@linux.vnet.ibm.com>
---

These changes are almost exclusively confined to PPC64 code, so with the changes below,

Acked-by: Anatoly Burakov <anatoly.bura...@intel.com>

+static uint64_t
+get_highest_mem_addr(struct spapr_size_walk_param *param)
+{
+       /* find the maximum IOVA address for setting the DMA window size */
+       if (rte_eal_iova_mode() == RTE_IOVA_PA) {
+               static const char proc_iomem[] = "/proc/iomem";
+               static const char str_sysram[] = "System RAM";
+               uint64_t start, end, max = 0;
+               char *line = NULL;
+               char *dash, *space;
+               size_t line_len;
+ /*
+                * Example "System RAM" in /proc/iomem:
+                * 00000000-1fffffffff : System RAM
+                * 200000000000-201fffffffff : System RAM
+                */
+               FILE *fd = fopen(proc_iomem, "r");
+               if (fd == NULL) {
+                       RTE_LOG(ERR, EAL, "Cannot open %s\n", proc_iomem);
+                       return -1;
+               }
+               /* Scan /proc/iomem for the highest PA in the system */
+               while (getline(&line, &line_len, fd) != -1) {
+                       if (strstr(line, str_sysram) == NULL)
+                               continue;
+
+                       space = strstr(line, " ");
+                       dash = strstr(line, "-");
+
+                       /* Validate the format of the memory string */
+                       if (space == NULL || dash == NULL || space < dash) {
+                               RTE_LOG(ERR, EAL, "Can't parse line \"%s\" in file 
%s\n",
+                                       line, proc_iomem);
+                               continue;
+                       }
+
+                       start = strtoull(line, NULL, 16);
+                       end   = strtoull(dash + 1, NULL, 16);
+                       RTE_LOG(DEBUG, EAL, "Found system RAM from 0x%" PRIx64
+                               " to 0x%" PRIx64 "\n", start, end);
+                       if (end > max)
+                               max = end;
+               }
+               free(line);
+               fclose(fd);
+
+               if (max == 0) {
+                       RTE_LOG(ERR, EAL, "Failed to find valid \"System RAM\" "
+                               "entry in file %s\n", proc_iomem);
+                       return -1;
+               }
+
+               return rte_align64pow2(max + 1);
+       } else if (rte_eal_iova_mode() == RTE_IOVA_VA) {
+               RTE_LOG(DEBUG, EAL, "Highest VA address in memseg list is 0x%"
+                       PRIx64 "\n", param->max_va);
+               return rte_align64pow2(param->max_va);
+       }
+
+       RTE_LOG(ERR, EAL, "Unsupported IOVA mode\n");
+       return 0;

You're returning a uint64_t here, while also returning -1 in some of the error cases above, but returning 0 for this one. How about making the address an output parameter, and returning 0 on success and -1 on error? It makes the code a bit messier at the call site, but would probably make more sense than returning 0 or -1 depending on which error condition you've hit.

Also, because of this, there's a bug below where you check for a return value of 0, but not for -1.

+}
+
+
+/*
+ * The SPAPRv2 IOMMU supports 2 DMA windows with starting
+ * address at 0 or 1<<59.  By default, a DMA window is set
+ * at address 0, 2GB long, with a 4KB page.  For DPDK we
+ * must remove the default window and setup a new DMA window
+ * based on the hugepage size and memory requirements of
+ * the application before we can map memory for DMA.
+ */
  static int
-vfio_spapr_window_size_walk(const struct rte_memseg_list *msl,
-               const struct rte_memseg *ms, void *arg)
+spapr_dma_win_size(void)
  {
-       struct spapr_walk_param *param = arg;
-       uint64_t max = ms->iova + ms->len;
-
-       /* skip external memory that isn't a heap */
-       if (msl->external && !msl->heap)
-               return 0;
+       struct spapr_size_walk_param param;
- /* skip any segments with invalid IOVA addresses */
-       if (ms->iova == RTE_BAD_IOVA)
+       /* only create DMA window once */
+       if (spapr_dma_win_len > 0)
                return 0;
- if (max > param->window_size) {
-               param->hugepage_sz = ms->hugepage_sz;
-               param->window_size = max;
+       /* walk the memseg list to find the page size/max VA address */
+       memset(&param, 0, sizeof(param));
+       if (rte_memseg_list_walk(vfio_spapr_size_walk, &param) < 0) {
+               RTE_LOG(ERR, EAL, "Failed to walk memseg list for DMA window 
size\n");
+               return -1;
        }
+ /* We can't be sure if DMA window covers external memory */
+       if (param.is_user_managed)
+               RTE_LOG(WARNING, EAL, "Detected user managed external memory which 
may not be managed by the IOMMU\n");
+
+       spapr_dma_win_len = get_highest_mem_addr(&param);
+       if (spapr_dma_win_len == 0)
+               return -1;

This error check doesn't catch all errors, as indicated above.

+       RTE_LOG(DEBUG, EAL, "Setting DMA window size to 0x%" PRIx64 "\n",
+               spapr_dma_win_len);
+       spapr_dma_win_page_sz = param.page_sz;
+       rte_mem_set_dma_mask(__builtin_ctzll(spapr_dma_win_len));
        return 0;
  }
static int
-vfio_spapr_create_new_dma_window(int vfio_container_fd,
-               struct vfio_iommu_spapr_tce_create *create) {
+vfio_spapr_create_dma_window(int vfio_container_fd)
+{
+       struct vfio_iommu_spapr_tce_create create = {
+               .argsz = sizeof(create), };
        struct vfio_iommu_spapr_tce_remove remove = {
-               .argsz = sizeof(remove),
-       };
+               .argsz = sizeof(remove), };
        struct vfio_iommu_spapr_tce_info info = {
-               .argsz = sizeof(info),
-       };
+               .argsz = sizeof(info), };
        int ret;
- /* query spapr iommu info */
+       ret = spapr_dma_win_size();
+       if (ret < 0)
+               return ret;
+
        ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
        if (ret) {
-               RTE_LOG(ERR, EAL, "  cannot get iommu info, "
-                               "error %i (%s)\n", errno, strerror(errno));
+               RTE_LOG(ERR, EAL, "  can't get iommu info, error %i (%s)\n",
+                       errno, strerror(errno));
                return -1;
        }
- /* remove default DMA of 32 bit window */
+       /* remove default DMA window */
        remove.start_addr = info.dma32_window_start;
        ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);

If you're never recreating a window, does it need to be removed? Or is this some kind of default window that is always present?

-       if (ret) {
-               RTE_LOG(ERR, EAL, "  cannot remove default DMA window, "
-                               "error %i (%s)\n", errno, strerror(errno));
+       if (ret)
                return -1;
-       }
- /* create new DMA window */
-       ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_CREATE, create);
+       /* create a new DMA window (start address is not selectable) */
+       create.window_size = spapr_dma_win_len;
+       create.page_shift  = __builtin_ctzll(spapr_dma_win_page_sz);
+       create.levels = 1;
+       ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
        if (ret) {
-#ifdef VFIO_IOMMU_SPAPR_INFO_DDW
-               /* try possible page_shift and levels for workaround */
+               /* if at first we don't succeed, try more levels */
                uint32_t levels;
- for (levels = create->levels + 1;
+               for (levels = create.levels + 1;
                        ret && levels <= info.ddw.levels; levels++) {
-                       create->levels = levels;
+                       create.levels = levels;
                        ret = ioctl(vfio_container_fd,
-                               VFIO_IOMMU_SPAPR_TCE_CREATE, create);
-               }
-#endif
-               if (ret) {
-                       RTE_LOG(ERR, EAL, "  cannot create new DMA window, "
-                                       "error %i (%s)\n", errno, 
strerror(errno));
-                       return -1;
+                               VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
                }
        }
-
-       if (create->start_addr != 0) {
-               RTE_LOG(ERR, EAL, "  DMA window start address != 0\n");
+       if (ret) {
+               RTE_LOG(ERR, EAL, "  cannot create new DMA window, error %i 
(%s)\n",
+                       errno, strerror(errno));
+               RTE_LOG(ERR, EAL, "  consider using a larger hugepage size "
+                       "if supported by the system\n");
                return -1;
        }
- return 0;
+       /* verify the start address  */
+       if (create.start_addr != 0) {
+               RTE_LOG(ERR, EAL, "  received unsupported start address 0x%"
+                       PRIx64 "\n", (uint64_t)create.start_addr);
+               return -1;
+       }
+       return ret;
  }
static int
-vfio_spapr_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
-               uint64_t len, int do_map)
+vfio_spapr_dma_mem_map(int vfio_container_fd, uint64_t vaddr,
+       uint64_t iova, uint64_t len, int do_map)

Nitpick, but the continuation line after the newline should have two indents, not one.

--
Thanks,
Anatoly

Reply via email to