On Thu, 22 Aug 2024 11:37:18 -0700
mhkelle...@gmail.com wrote:

> From: Michael Kelley <mhkli...@outlook.com>
> 
> In a CoCo VM, all DMA-based I/O must use swiotlb bounce buffers
> because DMA cannot be done to private (encrypted) portions of VM
> memory. The bounce buffer memory is marked shared (decrypted) at
> boot time, so I/O is done to/from the bounce buffer memory and then
> copied by the CPU to/from the final target memory (i.e., "bounced").
> Storage devices can be large consumers of bounce buffer memory because
> it is possible to have large numbers of I/Os in flight across multiple
> devices. Bounce buffer memory must be pre-allocated at boot time, and
> it is difficult to know how much memory to allocate to handle peak
> storage I/O loads. Consequently, bounce buffer memory is typically
> over-provisioned, which wastes memory, and may still not avoid a peak
> that exhausts bounce buffer memory and causes storage I/O errors.
> 
> For CoCo VMs running with NVMe PCI devices, update the driver to
> permit bounce buffer throttling. Gate the throttling behavior
> on a DMA layer check indicating that throttling is useful, so that
> no change occurs in a non-CoCo VM. If throttling is useful, enable
> the BLK_MQ_F_BLOCKING flag, and pass the DMA_ATTR_MAY_BLOCK attribute
> into dma_map_bvec() and dma_map_sgtable() calls. With these options in
> place, DMA map requests are pended when necessary to reduce the
> likelihood of usage peaks caused by the NVMe driver that could exhaust
> bounce buffer memory and generate errors.
> 
> Signed-off-by: Michael Kelley <mhkli...@outlook.com>
LGTM.

Reviewed-by: Petr Tesarik <ptesa...@suse.com>

Petr T

> ---
>  drivers/nvme/host/pci.c | 18 ++++++++++++++----
>  1 file changed, 14 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index 6cd9395ba9ec..2c39943a87f8 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -156,6 +156,7 @@ struct nvme_dev {
>       dma_addr_t host_mem_descs_dma;
>       struct nvme_host_mem_buf_desc *host_mem_descs;
>       void **host_mem_desc_bufs;
> +     unsigned long dma_attrs;
>       unsigned int nr_allocated_queues;
>       unsigned int nr_write_queues;
>       unsigned int nr_poll_queues;
> @@ -735,7 +736,8 @@ static blk_status_t nvme_setup_prp_simple(struct nvme_dev 
> *dev,
>       unsigned int offset = bv->bv_offset & (NVME_CTRL_PAGE_SIZE - 1);
>       unsigned int first_prp_len = NVME_CTRL_PAGE_SIZE - offset;
>  
> -     iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0);
> +     iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req),
> +                                     dev->dma_attrs);
>       if (dma_mapping_error(dev->dev, iod->first_dma))
>               return BLK_STS_RESOURCE;
>       iod->dma_len = bv->bv_len;
> @@ -754,7 +756,8 @@ static blk_status_t nvme_setup_sgl_simple(struct nvme_dev 
> *dev,
>  {
>       struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
>  
> -     iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0);
> +     iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req),
> +                                     dev->dma_attrs);
>       if (dma_mapping_error(dev->dev, iod->first_dma))
>               return BLK_STS_RESOURCE;
>       iod->dma_len = bv->bv_len;
> @@ -800,7 +803,7 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, 
> struct request *req,
>               goto out_free_sg;
>  
>       rc = dma_map_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req),
> -                          DMA_ATTR_NO_WARN);
> +                          dev->dma_attrs | DMA_ATTR_NO_WARN);
>       if (rc) {
>               if (rc == -EREMOTEIO)
>                       ret = BLK_STS_TARGET;
> @@ -828,7 +831,8 @@ static blk_status_t nvme_map_metadata(struct nvme_dev 
> *dev, struct request *req,
>       struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
>       struct bio_vec bv = rq_integrity_vec(req);
>  
> -     iod->meta_dma = dma_map_bvec(dev->dev, &bv, rq_dma_dir(req), 0);
> +     iod->meta_dma = dma_map_bvec(dev->dev, &bv, rq_dma_dir(req),
> +                                     dev->dma_attrs);
>       if (dma_mapping_error(dev->dev, iod->meta_dma))
>               return BLK_STS_IOERR;
>       cmnd->rw.metadata = cpu_to_le64(iod->meta_dma);
> @@ -3040,6 +3044,12 @@ static struct nvme_dev *nvme_pci_alloc_dev(struct 
> pci_dev *pdev,
>        * a single integrity segment for the separate metadata pointer.
>        */
>       dev->ctrl.max_integrity_segments = 1;
> +
> +     if (dma_recommend_may_block(dev->dev)) {
> +             dev->ctrl.blocking = true;
> +             dev->dma_attrs = DMA_ATTR_MAY_BLOCK;
> +     }
> +
>       return dev;
>  
>  out_put_device:


Reply via email to