On Wed, 20 Feb 2008 16:06:23 -0800 mark gross <[EMAIL PROTECTED]> wrote:

> The following patch is for batching up the flushing of the IOTLB for
> the DMAR implementation found in the Intel VT-d hardware.  It works by
> building a list of to be flushed IOTLB entries and a bitmap list of
> which DMAR engine they are from.
> 
> After either a high water mark (250 accessible via debugfs) or 10ms the
> list of iova's will be reclaimed and the DMAR engines associated are
> IOTLB-flushed.
> 
> This approach recovers 15 to 20% of the performance lost when using the
> IOMMU for my netperf udp stream benchmark with small packets.  It can be
> disabled with a kernel boot parameter
> "intel_iommu=strict".
> 
> Its use does weaken the IOMMU protections a bit.
> 
> I would like to see this go into MM for a while and then onto mainline.
> 
> ...
>
> +static struct timer_list unmap_timer =
> +     TIMER_INITIALIZER(flush_unmaps_timeout, 0, 0);

Could use DEFINE_TIMER here.

> +struct unmap_list {
> +     struct list_head list;
> +     struct dmar_domain *domain;
> +     struct iova *iova;
> +};

unmap_list doens't seem to be used anywhere?

> +static struct intel_iommu *g_iommus;
> +/* bitmap for indexing intel_iommus */
> +static unsigned long         *g_iommus_to_flush;
> +static int g_num_of_iommus;
> +
> +static DEFINE_SPINLOCK(async_umap_flush_lock);
> +static LIST_HEAD(unmaps_to_do);
> +
> +static int timer_on;
> +static long list_size;
> +static int high_watermark;
> +
> +static struct dentry *intel_iommu_debug, *debug;
> +
> +
>  static void domain_remove_dev_info(struct dmar_domain *domain);
>  
>  static int dmar_disabled;
>  static int __initdata dmar_map_gfx = 1;
>  static int dmar_forcedac;
> +static int intel_iommu_strict;
>  
>  #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
>  static DEFINE_SPINLOCK(device_domain_lock);
> @@ -73,9 +103,13 @@
>                       printk(KERN_INFO
>                               "Intel-IOMMU: disable GFX device mapping\n");
>               } else if (!strncmp(str, "forcedac", 8)) {
> -                     printk (KERN_INFO
> +                     printk(KERN_INFO
>                               "Intel-IOMMU: Forcing DAC for PCI devices\n");
>                       dmar_forcedac = 1;
> +             } else if (!strncmp(str, "strict", 8)) {

s/8/6/

> +                     printk(KERN_INFO
> +                             "Intel-IOMMU: disable batched IOTLB flush\n");
> +                     intel_iommu_strict = 1;
>               }
>  
>               str += strcspn(str, ",");
> @@ -965,17 +999,13 @@
>               set_bit(0, iommu->domain_ids);
>       return 0;
>  }
> -
> -static struct intel_iommu *alloc_iommu(struct dmar_drhd_unit *drhd)
> +static struct intel_iommu *alloc_iommu(struct intel_iommu *iommu,
> +                                     struct dmar_drhd_unit *drhd)
>  {
> -     struct intel_iommu *iommu;
>       int ret;
>       int map_size;
>       u32 ver;
>  
> -     iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
> -     if (!iommu)
> -             return NULL;
>       iommu->reg = ioremap(drhd->reg_base_addr, PAGE_SIZE_4K);
>       if (!iommu->reg) {
>               printk(KERN_ERR "IOMMU: can't map the region\n");
> @@ -1396,7 +1426,7 @@
>       int index;
>  
>       while (dev) {
> -             for (index = 0; index < cnt; index ++)
> +             for (index = 0; index < cnt; index++)
>                       if (dev == devices[index])
>                               return 1;
>  
> @@ -1661,7 +1691,7 @@
>       struct dmar_rmrr_unit *rmrr;
>       struct pci_dev *pdev;
>       struct intel_iommu *iommu;
> -     int ret, unit = 0;
> +     int nlongs, i, ret, unit = 0;
>  
>       /*
>        * for each drhd
> @@ -1672,7 +1702,30 @@
>       for_each_drhd_unit(drhd) {
>               if (drhd->ignored)
>                       continue;
> -             iommu = alloc_iommu(drhd);
> +             g_num_of_iommus++;

No locking needed for g_num_of_iommus?

> +     }
> +
> +     nlongs = BITS_TO_LONGS(g_num_of_iommus);

Would this code be neater if it used the <linux/bitmap.h> stuff?

> +     g_iommus_to_flush = kzalloc(nlongs * sizeof(unsigned long), GFP_KERNEL);
> +     if (!g_iommus_to_flush) {
> +             printk(KERN_ERR "Intel-IOMMU: "
> +                     "Allocating bitmap array failed\n");
> +             return -ENOMEM;

Are you sure we aren't leaking anything here?  Like the alloc_iommu() above?

> +     }
> +
> +     g_iommus = kzalloc(g_num_of_iommus * sizeof(*iommu), GFP_KERNEL);
> +     if (!g_iommus) {
> +             kfree(g_iommus_to_flush);
> +             ret = -ENOMEM;
> +             goto error;
> +     }
> +
> +     i = 0;
> +     for_each_drhd_unit(drhd) {
> +             if (drhd->ignored)
> +                     continue;
> +             iommu = alloc_iommu(&g_iommus[i], drhd);
> +             i++;
>               if (!iommu) {
>                       ret = -ENOMEM;
>                       goto error;
> @@ -1705,7 +1758,6 @@
>        * endfor
>        */
>       for_each_rmrr_units(rmrr) {
> -             int i;
>               for (i = 0; i < rmrr->devices_cnt; i++) {
>                       pdev = rmrr->devices[i];
>                       /* some BIOS lists non-exist devices in DMAR table */
> @@ -1909,6 +1961,54 @@
>       return 0;
>  }
>  
> +static void flush_unmaps(void)
> +{
> +     struct iova *node, *n;
> +     unsigned long flags;
> +     int i;
> +
> +     spin_lock_irqsave(&async_umap_flush_lock, flags);
> +     timer_on = 0;
> +
> +     /*just flush them all*/

I'm surprised that checkpatch didn't grump about the odd commenting style.

> +     for (i = 0; i < g_num_of_iommus; i++) {
> +             if (test_and_clear_bit(i, g_iommus_to_flush))
> +                     iommu_flush_iotlb_global(&g_iommus[i], 0);
> +     }
> +
> +     list_for_each_entry_safe(node, n, &unmaps_to_do, list) {
> +             /* free iova */
> +             list_del(&node->list);
> +             __free_iova(&((struct dmar_domain *)node->dmar)->iovad, node);
> +
> +     }
> +     list_size = 0;
> +     spin_unlock_irqrestore(&async_umap_flush_lock, flags);
> +}
> +
> +static void flush_unmaps_timeout(unsigned long data)
> +{
> +     flush_unmaps();
> +}
> +
> +static void add_unmap(struct dmar_domain *dom, struct iova *iova)
> +{
> +     unsigned long flags;
> +
> +     spin_lock_irqsave(&async_umap_flush_lock, flags);

How scalable is this?

> +     iova->dmar = dom;
> +     list_add(&iova->list, &unmaps_to_do);
> +     set_bit((dom->iommu - g_iommus), g_iommus_to_flush);
> +
> +     if (!timer_on) {
> +             unmap_timer.expires = jiffies + msecs_to_jiffies(10);
> +             mod_timer(&unmap_timer, unmap_timer.expires);

No, this modifies unmap_timer.expires twice.  Might be racy too.  You want

        mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));

> +             timer_on = 1;
> +     }
> +     list_size++;
> +     spin_unlock_irqrestore(&async_umap_flush_lock, flags);
> +}
> +
>  static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr,
>       size_t size, int dir)
>  {
> @@ -1936,13 +2036,21 @@
>       dma_pte_clear_range(domain, start_addr, start_addr + size);
>       /* free page tables */
>       dma_pte_free_pagetable(domain, start_addr, start_addr + size);
> -
> -     if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
> -                     size >> PAGE_SHIFT_4K, 0))
> -             iommu_flush_write_buffer(domain->iommu);
> -
> -     /* free iova */
> -     __free_iova(&domain->iovad, iova);
> +     if (intel_iommu_strict) {
> +             if (iommu_flush_iotlb_psi(domain->iommu,
> +                     domain->id, start_addr, size >> PAGE_SHIFT_4K, 0))
> +                     iommu_flush_write_buffer(domain->iommu);
> +             /* free iova */
> +             __free_iova(&domain->iovad, iova);
> +     } else {
> +             add_unmap(domain, iova);
> +             /*
> +              * queue up the release of the unmap to save the 1/6th of the
> +              * cpu used up by the iotlb flush operation...
> +              */
> +             if (list_size > high_watermark)
> +                     flush_unmaps();
> +     }
>  }
>  
>  static void * intel_alloc_coherent(struct device *hwdev, size_t size,
> @@ -2266,6 +2374,10 @@
>       if (dmar_table_init())
>               return  -ENODEV;
>  
> +     high_watermark = 250;
> +     intel_iommu_debug = debugfs_create_dir("intel_iommu", NULL);
> +     debug = debugfs_create_u32("high_watermark", S_IWUGO | S_IRUGO,
> +                                     intel_iommu_debug, &high_watermark);
>       iommu_init_mempool();
>       dmar_init_reserved_ranges();
>  
> @@ -2281,6 +2393,7 @@
>       printk(KERN_INFO
>       "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
>  
> +     init_timer(&unmap_timer);

I see timers being added, but I see no del_timer_sync()s added on cleanup
paths.  Are you sure that we don't have races on various teardown paths?

>       force_iommu = 1;
>       dma_ops = &intel_dma_ops;
>       return 0;
> Index: linux-2.6.24-mm1/drivers/pci/iova.h
> ===================================================================
> --- linux-2.6.24-mm1.orig/drivers/pci/iova.h  2008-02-12 07:12:06.000000000 
> -0800
> +++ linux-2.6.24-mm1/drivers/pci/iova.h       2008-02-12 07:39:53.000000000 
> -0800
> @@ -23,6 +23,8 @@
>       struct rb_node  node;
>       unsigned long   pfn_hi; /* IOMMU dish out addr hi */
>       unsigned long   pfn_lo; /* IOMMU dish out addr lo */
> +     struct list_head list;
> +     void *dmar;
>  };
>  
>  /* holds all the iova translations for a domain */
> Index: linux-2.6.24-mm1/Documentation/kernel-parameters.txt
> ===================================================================
> --- linux-2.6.24-mm1.orig/Documentation/kernel-parameters.txt 2008-02-12 
> 07:12:06.000000000 -0800
> +++ linux-2.6.24-mm1/Documentation/kernel-parameters.txt      2008-02-13 
> 11:17:22.000000000 -0800
> @@ -822,6 +822,10 @@
>                       than 32 bit addressing. The default is to look
>                       for translation below 32 bit and if not available
>                       then look in the higher range.
> +             strict [Default Off]
> +                     With this option on every unmap_single operation will
> +                     result in a hardware IOTLB flush operation as opposed
> +                     to batching them for performance.

boot-time options suck.  Is it not possible to tweak this at runtime?
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to