Thanks, this looks good to me. Reviewed-by: Alistair Popple <alist...@popple.id.au>
On Thursday, 27 September 2018 4:23:09 PM AEST Mark Hairgrove wrote: > There are two types of ATSDs issued to the NPU: invalidates targeting a > specific virtual address and invalidates targeting the whole address > space. In both cases prior to this change, the sequence was: > > for each NPU > - Write the target address to the XTS_ATSD_AVA register > - EIEIO > - Write the launch value to issue the ATSD > > First, a target address is not required when invalidating the whole > address space, so that write and the EIEIO have been removed. The AP > (size) field in the launch is not needed either. > > Second, for per-address invalidates the above sequence is inefficient in > the common case of multiple NPUs because an EIEIO is issued per NPU. This > unnecessarily forces the launches of later ATSDs to be ordered with the > launches of earlier ones. The new sequence only issues a single EIEIO: > > for each NPU > - Write the target address to the XTS_ATSD_AVA register > EIEIO > for each NPU > - Write the launch value to issue the ATSD > > Performance results were gathered using a microbenchmark which creates a > 1G allocation then uses mprotect with PROT_NONE to trigger invalidates in > strides across the allocation. > > With only a single NPU active (one GPU) the difference is in the noise for > both types of invalidates (+/-1%). > > With two NPUs active (on a 6-GPU system) the effect is more noticeable: > > mprotect rate (GB/s) > Stride Before After Speedup > 64K 5.9 6.5 10% > 1M 31.2 33.4 7% > 2M 36.3 38.7 7% > 4M 322.6 356.7 11% > > Signed-off-by: Mark Hairgrove <mhairgr...@nvidia.com> > --- > arch/powerpc/platforms/powernv/npu-dma.c | 99 ++++++++++++++--------------- > 1 files changed, 48 insertions(+), 51 deletions(-) > > diff --git a/arch/powerpc/platforms/powernv/npu-dma.c > b/arch/powerpc/platforms/powernv/npu-dma.c > index 8006c54..c8f438a 100644 > --- a/arch/powerpc/platforms/powernv/npu-dma.c > +++ b/arch/powerpc/platforms/powernv/npu-dma.c > @@ -454,79 +454,76 @@ static void put_mmio_atsd_reg(struct npu *npu, int reg) > } > > /* MMIO ATSD register offsets */ > -#define XTS_ATSD_AVA 1 > -#define XTS_ATSD_STAT 2 > +#define XTS_ATSD_LAUNCH 0 > +#define XTS_ATSD_AVA 1 > +#define XTS_ATSD_STAT 2 > > -static void mmio_launch_invalidate(struct mmio_atsd_reg *mmio_atsd_reg, > - unsigned long launch, unsigned long va) > +static unsigned long get_atsd_launch_val(unsigned long pid, unsigned long > psize, > + bool flush) > { > - struct npu *npu = mmio_atsd_reg->npu; > - int reg = mmio_atsd_reg->reg; > + unsigned long launch = 0; > > - __raw_writeq_be(va, npu->mmio_atsd_regs[reg] + XTS_ATSD_AVA); > - eieio(); > - __raw_writeq_be(launch, npu->mmio_atsd_regs[reg]); > + if (psize == MMU_PAGE_COUNT) { > + /* IS set to invalidate entire matching PID */ > + launch |= PPC_BIT(12); > + } else { > + /* AP set to invalidate region of psize */ > + launch |= (u64)mmu_get_ap(psize) << PPC_BITLSHIFT(17); > + } > + > + /* PRS set to process-scoped */ > + launch |= PPC_BIT(13); > + > + /* PID */ > + launch |= pid << PPC_BITLSHIFT(38); > + > + /* No flush */ > + launch |= !flush << PPC_BITLSHIFT(39); > + > + return launch; > } > > -static void mmio_invalidate_pid(struct mmio_atsd_reg > mmio_atsd_reg[NV_MAX_NPUS], > - unsigned long pid, bool flush) > +static void mmio_atsd_regs_write(struct mmio_atsd_reg > + mmio_atsd_reg[NV_MAX_NPUS], unsigned long offset, > + unsigned long val) > { > - int i; > - unsigned long launch; > + struct npu *npu; > + int i, reg; > > for (i = 0; i <= max_npu2_index; i++) { > - if (mmio_atsd_reg[i].reg < 0) > + reg = mmio_atsd_reg[i].reg; > + if (reg < 0) > continue; > > - /* IS set to invalidate matching PID */ > - launch = PPC_BIT(12); > - > - /* PRS set to process-scoped */ > - launch |= PPC_BIT(13); > - > - /* AP */ > - launch |= (u64) > - mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17); > - > - /* PID */ > - launch |= pid << PPC_BITLSHIFT(38); > + npu = mmio_atsd_reg[i].npu; > + __raw_writeq_be(val, npu->mmio_atsd_regs[reg] + offset); > + } > +} > > - /* No flush */ > - launch |= !flush << PPC_BITLSHIFT(39); > +static void mmio_invalidate_pid(struct mmio_atsd_reg > mmio_atsd_reg[NV_MAX_NPUS], > + unsigned long pid, bool flush) > +{ > + unsigned long launch = get_atsd_launch_val(pid, MMU_PAGE_COUNT, flush); > > - /* Invalidating the entire process doesn't use a va */ > - mmio_launch_invalidate(&mmio_atsd_reg[i], launch, 0); > - } > + /* Invalidating the entire process doesn't use a va */ > + mmio_atsd_regs_write(mmio_atsd_reg, XTS_ATSD_LAUNCH, launch); > } > > static void mmio_invalidate_va(struct mmio_atsd_reg > mmio_atsd_reg[NV_MAX_NPUS], > unsigned long va, unsigned long pid, bool flush) > { > - int i; > unsigned long launch; > > - for (i = 0; i <= max_npu2_index; i++) { > - if (mmio_atsd_reg[i].reg < 0) > - continue; > - > - /* IS set to invalidate target VA */ > - launch = 0; > + launch = get_atsd_launch_val(pid, mmu_virtual_psize, flush); > > - /* PRS set to process scoped */ > - launch |= PPC_BIT(13); > + /* Write all VAs first */ > + mmio_atsd_regs_write(mmio_atsd_reg, XTS_ATSD_AVA, va); > > - /* AP */ > - launch |= (u64) > - mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17); > - > - /* PID */ > - launch |= pid << PPC_BITLSHIFT(38); > - > - /* No flush */ > - launch |= !flush << PPC_BITLSHIFT(39); > + /* Issue one barrier for all address writes */ > + eieio(); > > - mmio_launch_invalidate(&mmio_atsd_reg[i], launch, va); > - } > + /* Launch */ > + mmio_atsd_regs_write(mmio_atsd_reg, XTS_ATSD_LAUNCH, launch); > } > > #define mn_to_npu_context(x) container_of(x, struct npu_context, mn) >