On Tue, Jun 23, 2020 at 01:28:36AM +0800, John Garry wrote:
> As mentioned in [0], the CPU may consume many cycles processing
> arm_smmu_cmdq_issue_cmdlist(). One issue we find is the cmpxchg() loop to
> get space on the queue takes approx 25% of the cycles for this function.
>
> This series removes that cmpxchg().
How about something much simpler like the diff below?

Will

--->8

diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index f578677a5c41..705d99ec8219 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -560,6 +560,7 @@ struct arm_smmu_cmdq {
 	atomic_long_t			*valid_map;
 	atomic_t			owner_prod;
 	atomic_t			lock;
+	spinlock_t			slock;
 };
 
 struct arm_smmu_cmdq_batch {
@@ -1378,7 +1379,7 @@ static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu,
 	u64 cmd_sync[CMDQ_ENT_DWORDS];
 	u32 prod;
 	unsigned long flags;
-	bool owner;
+	bool owner, cas_failed = false;
 	struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
 	struct arm_smmu_ll_queue llq = {
 		.max_n_shift = cmdq->q.llq.max_n_shift,
@@ -1387,27 +1388,35 @@ static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu,
 
 	/* 1. Allocate some space in the queue */
 	local_irq_save(flags);
-	llq.val = READ_ONCE(cmdq->q.llq.val);
 	do {
 		u64 old;
+		llq.val = READ_ONCE(cmdq->q.llq.val);
 
-		while (!queue_has_space(&llq, n + sync)) {
+		if (queue_has_space(&llq, n + sync))
+			goto try_cas;
+
+		if (cas_failed)
+			spin_unlock(&cmdq->slock);
+
+		do {
 			local_irq_restore(flags);
 			if (arm_smmu_cmdq_poll_until_not_full(smmu, &llq))
 				dev_err_ratelimited(smmu->dev, "CMDQ timeout\n");
 			local_irq_save(flags);
-		}
+		} while (!queue_has_space(&llq, n + sync));
 
+try_cas:
 		head.cons = llq.cons;
 		head.prod = queue_inc_prod_n(&llq, n + sync) |
 					     CMDQ_PROD_OWNED_FLAG;
 
 		old = cmpxchg_relaxed(&cmdq->q.llq.val, llq.val, head.val);
-		if (old == llq.val)
-			break;
+		cas_failed = old != llq.val;
+
+		if (cas_failed)
+			spin_lock(&cmdq->slock);
+	} while (cas_failed);
 
-		llq.val = old;
-	} while (1);
 	owner = !(llq.prod & CMDQ_PROD_OWNED_FLAG);
 	head.prod &= ~CMDQ_PROD_OWNED_FLAG;
 	llq.prod &= ~CMDQ_PROD_OWNED_FLAG;
@@ -3192,6 +3201,7 @@ static int arm_smmu_cmdq_init(struct arm_smmu_device *smmu)
 
 	atomic_set(&cmdq->owner_prod, 0);
 	atomic_set(&cmdq->lock, 0);
+	spin_lock_init(&cmdq->slock);
 
 	bitmap = (atomic_long_t *)bitmap_zalloc(nents, GFP_KERNEL);
 	if (!bitmap) {
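For readers following along, here is a minimal userspace sketch of the general idea the diff relies on: producers claim queue space with a single 64-bit cmpxchg, and a thread whose cmpxchg loses a race serialises its retry behind a spinlock instead of immediately hammering the shared word again. Everything below (claim_space(), q_prod, the pthread/C11-atomics plumbing) is invented for illustration, and the real cmdq's space, wrap and owner handling is elided; this is not the kernel code above.

/* Illustrative sketch only - build with: cc -O2 -pthread sketch.c */
#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define NR_THREADS		4
#define CLAIMS_PER_THREAD	100000

/* Single 64-bit producer word, a stand-in for cmdq->q.llq.val. */
static _Atomic uint64_t q_prod;
/* Taken only by threads whose cmpxchg has already failed once. */
static pthread_spinlock_t slock;

/* Claim @n slots and return the index we now own. */
static uint64_t claim_space(uint64_t n)
{
	int contended = 0;
	uint64_t old, new;

	for (;;) {
		old = atomic_load_explicit(&q_prod, memory_order_relaxed);
		new = old + n;	/* space/wrap checks elided */

		if (atomic_compare_exchange_weak_explicit(&q_prod, &old, new,
							  memory_order_relaxed,
							  memory_order_relaxed)) {
			if (contended)
				pthread_spin_unlock(&slock);
			return old;
		}

		/* Lost the race: queue up behind the other losers. */
		if (!contended) {
			pthread_spin_lock(&slock);
			contended = 1;
		}
	}
}

static void *worker(void *arg)
{
	(void)arg;
	for (int i = 0; i < CLAIMS_PER_THREAD; i++)
		claim_space(1);
	return NULL;
}

int main(void)
{
	pthread_t tid[NR_THREADS];

	pthread_spin_init(&slock, PTHREAD_PROCESS_PRIVATE);
	for (int i = 0; i < NR_THREADS; i++)
		pthread_create(&tid[i], NULL, worker, NULL);
	for (int i = 0; i < NR_THREADS; i++)
		pthread_join(tid[i], NULL);

	/* Every claim returned a unique index, so the count adds up. */
	printf("prod = %llu (expected %d)\n",
	       (unsigned long long)atomic_load(&q_prod),
	       NR_THREADS * CLAIMS_PER_THREAD);
	return 0;
}

The point of the pattern is that the uncontended fast path stays a single cmpxchg; only CPUs that have already lost a race pay for the lock, which bounds how many of them keep retrying on the contended cacheline at once.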