I will test as soon as I can; I'm just getting ready for summer vacation right
now.
On Jun 8, 2017 14:34, "Jan Vesely" wrote:
> On Tue, 2017-06-06 at 10:02 +, Nath, Arindam wrote:
> > > -----Original Message-----
> > > From: Lendacky, Thomas
> > > Sent: Tuesday, June 06, 2017 1:23 AM
> > > To: iommu@lists.linux-foundation.org
> > > Cc: Nath, Arindam ; Joerg Roedel
> > > ; Duran, Leo ; Suthikulpanit,
> > > Suravee
> > > Subject: [PATCH v1 3/3] iommu/amd: Optimize the IOMMU queue flush
> > >
> > > After reducing the amount of MMIO performed by the IOMMU during
> > > operation,
> > > perf data shows that flushing the TLB for all protection domains during
> > > DMA unmapping is a performance issue. It is not necessary to flush the
> > > TLBs for all protection domains, only those associated with the IOVAs
> > > on the flush queue.
> > >
> > > Create a separate queue that tracks the protection domains associated with
> > > the IOVAs on the flush queue. This new queue optimizes the flushing of
> > > TLBs to the required protection domains.
> > >
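A minimal user-space sketch of the idea, for anyone skimming the thread: keep a
second queue that records only the domains that currently have entries on the
flush queue, and flush just those. This is only an illustration, not the kernel
code; all names in it (toy_domain, pd_queue_track, ...) are made up.

#include <stdio.h>

#define FLUSH_QUEUE_SIZE 8

struct toy_domain {
	int id;
};

/* Tracks which domains have pending unmaps, without duplicates. */
struct pd_queue {
	unsigned int next;
	struct toy_domain *entries[FLUSH_QUEUE_SIZE];
};

/* Remember a domain only if it is not already queued for flushing. */
static void pd_queue_track(struct pd_queue *q, struct toy_domain *d)
{
	unsigned int i;

	for (i = 0; i < q->next; i++)
		if (q->entries[i] == d)
			return;
	if (q->next < FLUSH_QUEUE_SIZE)
		q->entries[q->next++] = d;
}

/* Stand-in for domain_flush_tlb(): just report which domain is flushed. */
static void toy_flush_tlb(struct toy_domain *d)
{
	printf("flush TLB for domain %d\n", d->id);
}

/* Flush only the tracked domains, then reset the tracking queue. */
static void pd_queue_flush(struct pd_queue *q)
{
	unsigned int i;

	for (i = 0; i < q->next; i++)
		toy_flush_tlb(q->entries[i]);
	q->next = 0;
}

int main(void)
{
	struct toy_domain a = { .id = 1 }, b = { .id = 2 };
	struct pd_queue q = { .next = 0 };

	/* Three queued unmaps, but only two distinct domains get flushed. */
	pd_queue_track(&q, &a);
	pd_queue_track(&q, &b);
	pd_queue_track(&q, &a);
	pd_queue_flush(&q);

	return 0;
}

The per-CPU pairing of flush_queue and flush_pd_queue in the patch below has the
same shape, just with the real domain_flush_tlb() and the flush_queue lock
protecting both.
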
> > > Reviewed-by: Arindam Nath
> > > Signed-off-by: Tom Lendacky
> > > ---
> > > drivers/iommu/amd_iommu.c | 56 ++++++++++++++++++++++++++++++++++++++++------
> > > 1 file changed, 50 insertions(+), 6 deletions(-)
> > >
> > > diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
> > > index 856103b..a5e77f0 100644
> > > --- a/drivers/iommu/amd_iommu.c
> > > +++ b/drivers/iommu/amd_iommu.c
> > > @@ -103,7 +103,18 @@ struct flush_queue {
> > > struct flush_queue_entry *entries;
> > > };
> > >
> > > +struct flush_pd_queue_entry {
> > > + struct protection_domain *pd;
> > > +};
> > > +
> > > +struct flush_pd_queue {
> > > + /* No lock needed, protected by flush_queue lock */
> > > + unsigned next;
> > > + struct flush_pd_queue_entry *entries;
> > > +};
> > > +
> > > static DEFINE_PER_CPU(struct flush_queue, flush_queue);
> > > +static DEFINE_PER_CPU(struct flush_pd_queue, flush_pd_queue);
> > >
> > > static atomic_t queue_timer_on;
> > > static struct timer_list queue_timer;
> > > @@ -2227,16 +2238,20 @@ static struct iommu_group *amd_iommu_device_group(struct device *dev)
> > > *
> > > *****************************************************************************/
> > >
> > > -static void __queue_flush(struct flush_queue *queue)
> > > +static void __queue_flush(struct flush_queue *queue,
> > > + struct flush_pd_queue *pd_queue)
> > > {
> > > - struct protection_domain *domain;
> > > unsigned long flags;
> > > int idx;
> > >
> > > /* First flush TLB of all known domains */
> > > spin_lock_irqsave(&amd_iommu_pd_lock, flags);
> > > - list_for_each_entry(domain, &amd_iommu_pd_list, list)
> > > - domain_flush_tlb(domain);
> > > + for (idx = 0; idx < pd_queue->next; ++idx) {
> > > + struct flush_pd_queue_entry *entry;
> > > +
> > > + entry = pd_queue->entries + idx;
> > > + domain_flush_tlb(entry->pd);
> > > + }
> > > spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
> > >
> > > /* Wait until flushes have completed */
> > > @@ -2255,6 +2270,7 @@ static void __queue_flush(struct flush_queue *queue)
> > > entry->dma_dom = NULL;
> > > }
> > >
> > > + pd_queue->next = 0;
> > > queue->next = 0;
> > > }
> > >
> > > @@ -2263,13 +2279,15 @@ static void queue_flush_all(void)
> > > int cpu;
> > >
> > > for_each_possible_cpu(cpu) {
> > > + struct flush_pd_queue *pd_queue;
> > > struct flush_queue *queue;
> > > unsigned long flags;
> > >
> > > queue = per_cpu_ptr(&flush_queue, cpu);
> > > + pd_queue = per_cpu_ptr(&flush_pd_queue, cpu);
> > > spin_lock_irqsave(&queue->lock, flags);
> > > if (queue->next > 0)
> > > - __queue_flush(queue);
> > > + __queue_flush(queue, pd_queue);
> > > spin_unlock_irqrestore(&queue->lock, flags);
> > > }
> > > }
> > > @@ -2283,6 +2301,8 @@ static void queue_flush_timeout(unsigned long unsused)
> > > static void queue_add(struct dma_ops_domain *dma_dom,
> > > unsigned long address, unsigned long pages)
> > > {
> > > + struct flush_pd_queue_entry *pd_entry;
> > > + struct flush_pd_queue *pd_queue;
> > > struct flush_queue_entry *entry;
> > > struct flush_queue *queue;
> > > unsigned long flags;
> > > @@ -2292,10 +2312,22 @@ static void queue_add(struct dma_ops_domain *dma_dom,
> > > address >>= PAGE_SHIFT;
> > >
> > > queue = get_cpu_ptr(&flush_queue);
> > > + pd_queue = get_cpu_ptr(&flush_pd_queue);
> > > spin_lock_irqsave(&queue->lock, flags);
> > >
> > > if (queue->next == FLUSH_QUEUE_SIZE)
> > > - __queue_flush(queue);
> > > + __queue_flush(queue, pd_queue);
> > > +
> > > + for (idx = 0; idx < pd_queue->next; ++idx) {
> > > + pd_entry =