Re: [PATCH net v3] virtio_net: Fix error unwinding of XDP initialization
On 2023/5/9 09:43, Xuan Zhuo wrote:
On Mon, 8 May 2023 11:00:10 -0400, Feng Liu wrote:
On 2023-05-07 p.m. 9:45, Xuan Zhuo wrote:
On Sat, 6 May 2023 08:08:02 -0400, Feng Liu wrote:
On 2023-05-05 p.m. 10:33, Xuan Zhuo wrote:
On Tue, 2 May 2023 20:35:25 -0400, Feng Liu wrote:

[Feng Liu, v3 posting:]

When initializing XDP in virtnet_open(), some rq xdp initialization may hit an error, causing the net device open to fail. However, previous rqs have already initialized XDP and enabled NAPI, which is not the expected behavior. We need to roll back the previous rq initialization to avoid leaks in the error unwinding of the init code.

Also extract a helper function to disable a queue pair, and use the newly introduced helper in error unwinding and in virtnet_close.

Issue: 3383038
Fixes: 754b8a21a96d ("virtio_net: setup xdp_rxq_info")
Signed-off-by: Feng Liu
Reviewed-by: William Tu
Reviewed-by: Parav Pandit
Reviewed-by: Simon Horman
Acked-by: Michael S. Tsirkin
Change-Id: Ib4c6a97cb7b837cfa484c593dd43a435c47ea68f
---
 drivers/net/virtio_net.c | 30 ++++++++++++++++++++----------
 1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 8d8038538fc4..3737cf120cb7 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -1868,6 +1868,13 @@ static int virtnet_poll(struct napi_struct *napi, int budget)
 	return received;
 }
 
+static void virtnet_disable_qp(struct virtnet_info *vi, int qp_index)
+{
+	virtnet_napi_tx_disable(&vi->sq[qp_index].napi);
+	napi_disable(&vi->rq[qp_index].napi);
+	xdp_rxq_info_unreg(&vi->rq[qp_index].xdp_rxq);
+}
+
 static int virtnet_open(struct net_device *dev)
 {
 	struct virtnet_info *vi = netdev_priv(dev);
@@ -1883,20 +1890,26 @@ static int virtnet_open(struct net_device *dev)
 		err = xdp_rxq_info_reg(&vi->rq[i].xdp_rxq, dev, i, vi->rq[i].napi.napi_id);
 		if (err < 0)
-			return err;
+			goto err_xdp_info_reg;
 
 		err = xdp_rxq_info_reg_mem_model(&vi->rq[i].xdp_rxq,
 						 MEM_TYPE_PAGE_SHARED, NULL);
-		if (err < 0) {
-			xdp_rxq_info_unreg(&vi->rq[i].xdp_rxq);
-			return err;
-		}
+		if (err < 0)
+			goto err_xdp_reg_mem_model;
 
 		virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
 		virtnet_napi_tx_enable(vi, vi->sq[i].vq, &vi->sq[i].napi);
 	}
 
 	return 0;
+
+err_xdp_reg_mem_model:
+	xdp_rxq_info_unreg(&vi->rq[i].xdp_rxq);
+err_xdp_info_reg:
+	for (i = i - 1; i >= 0; i--)
+		virtnet_disable_qp(vi, i);

[Xuan Zhuo, 5/5:] I would like to know whether we should also handle these:

	disable_delayed_refill(vi);
	cancel_delayed_work_sync(&vi->refill);

Maybe we should call virtnet_close() with "i" directly. Thanks.

[Feng Liu, 5/6:] We can't use i directly here: if xdp_rxq_info_reg fails, NAPI has not been enabled for the current qp yet, so I should roll back starting from the queue pairs where NAPI was enabled before (i--); otherwise it will hang in the NAPI disable API.

[Xuan Zhuo, 5/7:] That is not the point. The key is whether we should handle:

	disable_delayed_refill(vi);
	cancel_delayed_work_sync(&vi->refill);

Thanks.

[Feng Liu, 5/8:] OK, I get the point; thanks for the careful review. I checked the code again, and there are two points I need to explain:

1. All refill delayed-work calls (vi->refill, vi->refill_enabled) assume that the virtio interface was opened successfully, e.g. virtnet_receive, virtnet_rx_resize, _virtnet_set_queues, etc. If the xdp reg fails here, those functions are never triggered, so there is no need to call disable_delayed_refill() and cancel_delayed_work_sync().

[Xuan Zhuo, 5/9:] Maybe something is wrong. I think these lines may call the delayed work:

static int virtnet_open(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int i, err;

	enable_delayed_refill(vi);
	for (i = 0; i < vi->max_queue_pairs; i++) {
		if (i < vi->curr_queue_pairs)
			/* Make sure we have some buffers: if oom use wq. */
-->			if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
-->				schedule_delayed_work(&vi->refill, 0);

		err = xdp_rxq_info_reg(&vi->rq[i].xdp_rxq, dev, i, vi->rq[i].napi.napi_id);
		if (err < 0)
			return err;

		err = xdp_rxq_info_reg_mem_model(&vi->rq[i].xdp_rxq,
						 MEM_TYPE_PAGE_SHARED, NULL);
		if (err < 0) {
			xdp_rxq_info_unreg(&vi->rq[i].xdp_rxq);
			return err;
		}

		virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
		virtnet_napi_tx_enable(vi, vi->sq[i].vq, &vi->sq[i].napi);
	}

	return 0;
}
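To make the disagreement concrete, the unwind shape the thread converges on is sketched below in C; this is a condensed illustration mirroring the v6 posting later in this digest (where virtnet_enable_queue_pair()/virtnet_disable_queue_pair() are introduced), not the v3 patch under review here.

	/* Condensed sketch of the final unwind shape (cf. the v6 patch below):
	 * each iteration fully enables one queue pair; on failure, cancel the
	 * refill work that try_fill_recv() may have scheduled, then tear down
	 * the already-enabled queue pairs in reverse order.
	 */
	static int virtnet_open(struct net_device *dev)
	{
		struct virtnet_info *vi = netdev_priv(dev);
		int i, err;

		enable_delayed_refill(vi);
		for (i = 0; i < vi->max_queue_pairs; i++) {
			if (i < vi->curr_queue_pairs)
				/* May schedule &vi->refill; this is why the
				 * error path must cancel the delayed work. */
				if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
					schedule_delayed_work(&vi->refill, 0);

			err = virtnet_enable_queue_pair(vi, i);
			if (err < 0)
				goto err_enable_qp;
		}
		return 0;

	err_enable_qp:
		disable_delayed_refill(vi);
		cancel_delayed_work_sync(&vi->refill);
		/* Queue pair i failed before its NAPI was enabled, so the
		 * rollback starts at i - 1; disabling a NAPI instance that
		 * was never enabled would hang. */
		for (i--; i >= 0; i--)
			virtnet_disable_queue_pair(vi, i);
		return err;
	}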
Re: [PATCH] vdpa/snet: implement the resume vDPA callback
On Tue, May 2, 2023 at 9:11 PM Alvaro Karsz wrote: > > The callback sends a resume command to the DPU through > the control mechanism. > > Signed-off-by: Alvaro Karsz Acked-by: Jason Wang Thanks > --- > drivers/vdpa/solidrun/snet_ctrl.c | 6 ++++++ > drivers/vdpa/solidrun/snet_main.c | 15 +++++++++++++++ > drivers/vdpa/solidrun/snet_vdpa.h | 1 + > 3 files changed, 22 insertions(+) > > diff --git a/drivers/vdpa/solidrun/snet_ctrl.c > b/drivers/vdpa/solidrun/snet_ctrl.c > index 3858738643b..3cef2571d15 100644 > --- a/drivers/vdpa/solidrun/snet_ctrl.c > +++ b/drivers/vdpa/solidrun/snet_ctrl.c > @@ -16,6 +16,7 @@ enum snet_ctrl_opcodes { > SNET_CTRL_OP_DESTROY = 1, > SNET_CTRL_OP_READ_VQ_STATE, > SNET_CTRL_OP_SUSPEND, > + SNET_CTRL_OP_RESUME, > }; > > #define SNET_CTRL_TIMEOUT 200 > @@ -328,3 +329,8 @@ int snet_suspend_dev(struct snet *snet) > { > return snet_send_ctrl_msg(snet, SNET_CTRL_OP_SUSPEND, 0); > } > + > +int snet_resume_dev(struct snet *snet) > +{ > + return snet_send_ctrl_msg(snet, SNET_CTRL_OP_RESUME, 0); > +} > diff --git a/drivers/vdpa/solidrun/snet_main.c > b/drivers/vdpa/solidrun/snet_main.c > index cdcd84ce4f5..99428a04068 100644 > --- a/drivers/vdpa/solidrun/snet_main.c > +++ b/drivers/vdpa/solidrun/snet_main.c > @@ -509,6 +509,20 @@ static int snet_suspend(struct vdpa_device *vdev) > return ret; > } > > +static int snet_resume(struct vdpa_device *vdev) > +{ > + struct snet *snet = vdpa_to_snet(vdev); > + int ret; > + > + ret = snet_resume_dev(snet); > + if (ret) > + SNET_ERR(snet->pdev, "SNET[%u] resume failed, err: %d\n", > snet->sid, ret); > + else > + SNET_DBG(snet->pdev, "Resume SNET[%u] device\n", snet->sid); > + > + return ret; > +} > + > static const struct vdpa_config_ops snet_config_ops = { > .set_vq_address = snet_set_vq_address, > .set_vq_num = snet_set_vq_num, > @@ -536,6 +550,7 @@ static const struct vdpa_config_ops snet_config_ops = { > .get_config = snet_get_config, > .set_config = snet_set_config, > .suspend = snet_suspend, > + .resume = snet_resume, > }; > > static int psnet_open_pf_bar(struct pci_dev *pdev, struct psnet *psnet) > diff --git a/drivers/vdpa/solidrun/snet_vdpa.h > b/drivers/vdpa/solidrun/snet_vdpa.h > index 3c78d4e7d48..36ac285835e 100644 > --- a/drivers/vdpa/solidrun/snet_vdpa.h > +++ b/drivers/vdpa/solidrun/snet_vdpa.h > @@ -204,5 +204,6 @@ void snet_ctrl_clear(struct snet *snet); > int snet_destroy_dev(struct snet *snet); > int snet_read_vq_state(struct snet *snet, u16 idx, struct vdpa_vq_state > *state); > int snet_suspend_dev(struct snet *snet); > +int snet_resume_dev(struct snet *snet); > > #endif //_SNET_VDPA_H_ > -- > 2.34.1 >
Re: [PATCH v2] vhost_net: revert upend_idx only on retriable error
On Tue, Apr 25, 2023 at 4:44 AM Andrey Smetanin wrote: > > Fix a possible virtqueue used-buffers leak and the corresponding stall > in case of a temporary -EIO from sendmsg(), which is produced by the > tun driver while the backend device is not up. > > In case of a non-retriable error with zcopy, do not revert upend_idx; > pass the packet data on (that is, update used_idx in the corresponding > vhost_zerocopy_signal_used()) as if the packet data had been > transferred successfully. > > v2: set vq->heads[ubuf->desc].len equal to VHOST_DMA_DONE_LEN > in case of fake successful transmit. > > Signed-off-by: Andrey Smetanin Acked-by: Jason Wang Thanks > --- > drivers/vhost/net.c | 11 ++++++++--- > 1 file changed, 8 insertions(+), 3 deletions(-) > > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c > index 20265393aee7..0791fbdb3975 100644 > --- a/drivers/vhost/net.c > +++ b/drivers/vhost/net.c > @@ -934,13 +934,18 @@ static void handle_tx_zerocopy(struct vhost_net *net, > struct socket *sock) > > err = sock->ops->sendmsg(sock, &msg, len); > if (unlikely(err < 0)) { > + bool retry = err == -EAGAIN || err == -ENOMEM || err > == -ENOBUFS; > + > if (zcopy_used) { > if (vq->heads[ubuf->desc].len == > VHOST_DMA_IN_PROGRESS) > vhost_net_ubuf_put(ubufs); > - nvq->upend_idx = ((unsigned)nvq->upend_idx - > 1) > - % UIO_MAXIOV; > + if (retry) > + nvq->upend_idx = > ((unsigned)nvq->upend_idx - 1) > + % UIO_MAXIOV; > + else > + vq->heads[ubuf->desc].len = > VHOST_DMA_DONE_LEN; > } > - if (err == -EAGAIN || err == -ENOMEM || err == > -ENOBUFS) { > + if (retry) { > vhost_discard_vq_desc(vq, 1); > vhost_net_enable_vq(net, vq); > break; > -- > 2.25.1 >
Re: [PATCH net-next V2 1/2] virtio-net: convert rx mode setting to use workqueue
On Wed, May 10, 2023 at 1:33 PM Michael S. Tsirkin wrote: > > On Mon, Apr 17, 2023 at 11:40:58AM +0800, Jason Wang wrote: > > On Fri, Apr 14, 2023 at 3:21 PM Michael S. Tsirkin wrote: > > > > > > On Fri, Apr 14, 2023 at 01:04:15PM +0800, Jason Wang wrote: > > > > Forget to cc netdev, adding. > > > > > > > > On Fri, Apr 14, 2023 at 12:25 AM Michael S. Tsirkin > > > > wrote: > > > > > > > > > > On Thu, Apr 13, 2023 at 02:40:26PM +0800, Jason Wang wrote: > > > > > > This patch convert rx mode setting to be done in a workqueue, this > > > > > > is > > > > > > a must for allow to sleep when waiting for the cvq command to > > > > > > response since current code is executed under addr spin lock. > > > > > > > > > > > > Signed-off-by: Jason Wang > > > > > > > > > > I don't like this frankly. This means that setting RX mode which would > > > > > previously be reliable, now becomes unreliable. > > > > > > > > It is "unreliable" by design: > > > > > > > > void(*ndo_set_rx_mode)(struct net_device > > > > *dev); > > > > > > > > > - first of all configuration is no longer immediate > > > > > > > > Is immediate a hard requirement? I can see a workqueue is used at least: > > > > > > > > mlx5e, ipoib, efx, ... > > > > > > > > > and there is no way for driver to find out when > > > > > it actually took effect > > > > > > > > But we know rx mode is best effort e.g it doesn't support vhost and we > > > > survive from this for years. > > > > > > > > > - second, if device fails command, this is also not > > > > > propagated to driver, again no way for driver to find out > > > > > > > > > > VDUSE needs to be fixed to do tricks to fix this > > > > > without breaking normal drivers. > > > > > > > > It's not specific to VDUSE. For example, when using virtio-net in the > > > > UP environment with any software cvq (like mlx5 via vDPA or cma > > > > transport). > > > > > > > > Thanks > > > > > > Hmm. Can we differentiate between these use-cases? > > > > It doesn't look easy since we are drivers for virtio bus. Underlayer > > details were hidden from virtio-net. > > > > Or do you have any ideas on this? > > > > Thanks > > I don't know, pass some kind of flag in struct virtqueue? > "bool slow; /* This vq can be very slow sometimes. Don't wait for it! > */" > > ? > So if it's slow, sleep, otherwise poll? I feel setting this flag might be tricky, since the driver doesn't know whether or not it's really slow. E.g smartNIC vendor may allow virtio-net emulation over PCI. Thanks > -- > MST >
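The patch under discussion is not quoted in this thread, so as a rough sketch of the conversion being debated (names simplified, not the posted code): .ndo_set_rx_mode, which runs under the address-list spinlock and may not sleep, stops talking to the control virtqueue directly and only schedules a work item that performs the cvq transaction from process context. rx_mode_work below is an assumed field, added for illustration.

	/* Rough sketch of the rx-mode-over-workqueue idea; rx_mode_work is
	 * an assumed new member of struct virtnet_info, not existing code.
	 */
	static void virtnet_set_rx_mode(struct net_device *dev)
	{
		struct virtnet_info *vi = netdev_priv(dev);

		/* Called with netif_addr_lock_bh() held; must not sleep. */
		schedule_work(&vi->rx_mode_work);
	}

	static void virtnet_rx_mode_work(struct work_struct *work)
	{
		struct virtnet_info *vi =
			container_of(work, struct virtnet_info, rx_mode_work);

		/* Process context: it is now safe to sleep while the device
		 * completes VIRTIO_NET_CTRL_RX / VIRTIO_NET_CTRL_MAC
		 * commands, which is what slow cvq backends (software cvq,
		 * VDUSE) need. Snapshot dev->flags and the UC/MC lists, then
		 * issue the control commands here.
		 */
	}

The cost Michael is objecting to is visible in the sketch: once .ndo_set_rx_mode returns, the driver has no way to know whether the device has applied, or even seen, the new filter state.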
Re: [PATCH net v6] virtio_net: Fix error unwinding of XDP initialization
On Fri, May 12, 2023 at 11:18 PM Feng Liu wrote: > > When initializing XDP in virtnet_open(), some rq xdp initialization > may hit an error causing net device open failed. However, previous > rqs have already initialized XDP and enabled NAPI, which is not the > expected behavior. Need to roll back the previous rq initialization > to avoid leaks in error unwinding of init code. > > Also extract helper functions of disable and enable queue pairs. > Use newly introduced disable helper function in error unwinding and > virtnet_close. Use enable helper function in virtnet_open. > > Fixes: 754b8a21a96d ("virtio_net: setup xdp_rxq_info") > Signed-off-by: Feng Liu > Reviewed-by: Jiri Pirko > Reviewed-by: William Tu Acked-by: Jason Wang Thanks > --- > v5 -> v6 > feedbacks from Xuan Zhuo > - add disable_delayed_refill and cancel_delayed_work_sync > > v4 -> v5 > feedbacks from Michael S. Tsirkin > - rename helper as virtnet_disable_queue_pair > - rename helper as virtnet_enable_queue_pair > > v3 -> v4 > feedbacks from Jiri Pirko > - Add symmetric helper function virtnet_enable_qp to enable queues. > - Error handle: cleanup current queue pair in virtnet_enable_qp, > and complete the reset queue pairs cleanup in virtnet_open. > - Fix coding style. > feedbacks from Parav Pandit > - Remove redundant debug message and white space. > > v2 -> v3 > feedbacks from Michael S. Tsirkin > - Remove redundant comment. > > v1 -> v2 > feedbacks from Michael S. Tsirkin > - squash two patches together. > > --- > drivers/net/virtio_net.c | 61 +--- > 1 file changed, 44 insertions(+), 17 deletions(-) > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c > index a12ae26db0e2..56ca1d270304 100644 > --- a/drivers/net/virtio_net.c > +++ b/drivers/net/virtio_net.c > @@ -1868,6 +1868,38 @@ static int virtnet_poll(struct napi_struct *napi, int > budget) > return received; > } > > +static void virtnet_disable_queue_pair(struct virtnet_info *vi, int qp_index) > +{ > + virtnet_napi_tx_disable(&vi->sq[qp_index].napi); > + napi_disable(&vi->rq[qp_index].napi); > + xdp_rxq_info_unreg(&vi->rq[qp_index].xdp_rxq); > +} > + > +static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index) > +{ > + struct net_device *dev = vi->dev; > + int err; > + > + err = xdp_rxq_info_reg(&vi->rq[qp_index].xdp_rxq, dev, qp_index, > + vi->rq[qp_index].napi.napi_id); > + if (err < 0) > + return err; > + > + err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq, > +MEM_TYPE_PAGE_SHARED, NULL); > + if (err < 0) > + goto err_xdp_reg_mem_model; > + > + virtnet_napi_enable(vi->rq[qp_index].vq, &vi->rq[qp_index].napi); > + virtnet_napi_tx_enable(vi, vi->sq[qp_index].vq, > &vi->sq[qp_index].napi); > + > + return 0; > + > +err_xdp_reg_mem_model: > + xdp_rxq_info_unreg(&vi->rq[qp_index].xdp_rxq); > + return err; > +} > + > static int virtnet_open(struct net_device *dev) > { > struct virtnet_info *vi = netdev_priv(dev); > @@ -1881,22 +1913,20 @@ static int virtnet_open(struct net_device *dev) > if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL)) > schedule_delayed_work(&vi->refill, 0); > > - err = xdp_rxq_info_reg(&vi->rq[i].xdp_rxq, dev, i, > vi->rq[i].napi.napi_id); > + err = virtnet_enable_queue_pair(vi, i); > if (err < 0) > - return err; > - > - err = xdp_rxq_info_reg_mem_model(&vi->rq[i].xdp_rxq, > -MEM_TYPE_PAGE_SHARED, NULL); > - if (err < 0) { > - xdp_rxq_info_unreg(&vi->rq[i].xdp_rxq); > - return err; > - } > - > - virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi); > - virtnet_napi_tx_enable(vi, vi->sq[i].vq, &vi->sq[i].napi); 
> + goto err_enable_qp; > } > > return 0; > + > +err_enable_qp: > + disable_delayed_refill(vi); > + cancel_delayed_work_sync(&vi->refill); > + > + for (i--; i >= 0; i--) > + virtnet_disable_queue_pair(vi, i); > + return err; > } > > static int virtnet_poll_tx(struct napi_struct *napi, int budget)
Re: [PATCH vhost v8 01/12] virtio_ring: split: separate dma codes
On Fri, May 12, 2023 at 11:27 PM Christoph Hellwig wrote: > > As said before, please don't try to do weird runtime checks based > on the scatterlist. What you have works for now, but there are > plans to replace the page + offset tuple in the scatterlist with > just a phys_addr_t. And with that your "clever" scheme will break > instantly. > Xuan, I think we probably need to go back to your original method, that is, having a dedicated flag and helper for pre-mapped buffers. Thanks
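For readers without the earlier versions of the series at hand, the "dedicated flag and helper" approach looks roughly like the sketch below. This is a hypothetical illustration of the idea, not code from any posted revision; the field and helper names are invented here.

	/* Hypothetical sketch: the driver declares once, up front, that it
	 * hands the core already-mapped DMA addresses, instead of the core
	 * trying to infer that from the scatterlist at runtime.
	 */
	struct vring_virtqueue {
		/* ... existing vring state ... */
		bool premapped;	/* buffers arrive already DMA-mapped */
	};

	void virtqueue_set_premapped(struct virtqueue *_vq)
	{
		struct vring_virtqueue *vq = to_vvq(_vq);

		vq->premapped = true;
	}

	/* The map/unmap paths then branch on the flag, e.g.:
	 *
	 *	addr = vq->premapped ? sg_dma_address(sg) :
	 *	       vring_map_one_sg(vq, sg, direction);
	 *
	 * which keeps working even if struct scatterlist is later reduced
	 * to a bare phys_addr_t, since nothing is guessed from the sg
	 * contents.
	 */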
Re: [PATCH v5 virtio 01/11] virtio: allow caller to override device id in vp_modern
On 2023/5/4 02:12, Shannon Nelson wrote: To add a bit of vendor flexibility with various virtio based devices, allow the caller to check for a different device id. This adds a function pointer field to struct virtio_pci_modern_device to specify an override device id check. If defined by the driver, this function will be called to check that the PCI device is the vendor's expected device, and will return the found device id to be stored in mdev->id.device. This allows vendors with alternative vendor device ids to use this library on their own device BAR. Note: A lot of the diff in this is simply indenting the existing code into an else block. Signed-off-by: Shannon Nelson Acked-by: Jason Wang Thanks --- drivers/virtio/virtio_pci_modern_dev.c | 30 +++++++++++++++++++----------- include/linux/virtio_pci_modern.h | 3 +++ 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/drivers/virtio/virtio_pci_modern_dev.c b/drivers/virtio/virtio_pci_modern_dev.c index 869cb46bef96..9b2d6614de67 100644 --- a/drivers/virtio/virtio_pci_modern_dev.c +++ b/drivers/virtio/virtio_pci_modern_dev.c @@ -218,21 +218,29 @@ int vp_modern_probe(struct virtio_pci_modern_device *mdev) int err, common, isr, notify, device; u32 notify_length; u32 notify_offset; + int devid; check_offsets(); - /* We only own devices >= 0x1000 and <= 0x107f: leave the rest. */ - if (pci_dev->device < 0x1000 || pci_dev->device > 0x107f) - return -ENODEV; - - if (pci_dev->device < 0x1040) { - /* Transitional devices: use the PCI subsystem device id as -* virtio device id, same as legacy driver always did. -*/ - mdev->id.device = pci_dev->subsystem_device; + if (mdev->device_id_check) { + devid = mdev->device_id_check(pci_dev); + if (devid < 0) + return devid; + mdev->id.device = devid; } else { - /* Modern devices: simply use PCI device id, but start from 0x1040. */ - mdev->id.device = pci_dev->device - 0x1040; + /* We only own devices >= 0x1000 and <= 0x107f: leave the rest. */ + if (pci_dev->device < 0x1000 || pci_dev->device > 0x107f) + return -ENODEV; + + if (pci_dev->device < 0x1040) { + /* Transitional devices: use the PCI subsystem device id as +* virtio device id, same as legacy driver always did. +*/ + mdev->id.device = pci_dev->subsystem_device; + } else { + /* Modern devices: simply use PCI device id, but start from 0x1040. */ + mdev->id.device = pci_dev->device - 0x1040; + } } mdev->id.vendor = pci_dev->subsystem_vendor; diff --git a/include/linux/virtio_pci_modern.h b/include/linux/virtio_pci_modern.h index c4eeb79b0139..e7b1db1dd0bb 100644 --- a/include/linux/virtio_pci_modern.h +++ b/include/linux/virtio_pci_modern.h @@ -38,6 +38,9 @@ struct virtio_pci_modern_device { int modern_bars; struct virtio_device_id id; + + /* optional check for vendor virtio device, returns dev_id or -ERRNO */ + int (*device_id_check)(struct pci_dev *pdev); }; /*
Re: [PATCH v5 virtio 02/11] virtio: allow caller to override device DMA mask in vp_modern
On 2023/5/4 02:12, Shannon Nelson wrote: To add a bit of vendor flexibility with various virtio based devices, allow the caller to specify a different DMA mask. This adds a dma_mask field to struct virtio_pci_modern_device. If defined by the driver, this mask will be used in a call to dma_set_mask_and_coherent() instead of the traditional DMA_BIT_MASK(64). This allows limiting the DMA space on vendor devices with address limitations. Signed-off-by: Shannon Nelson Acked-by: Jason Wang Thanks --- drivers/virtio/virtio_pci_modern_dev.c | 3 ++- include/linux/virtio_pci_modern.h | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/virtio/virtio_pci_modern_dev.c b/drivers/virtio/virtio_pci_modern_dev.c index 9b2d6614de67..aad7d9296e77 100644 --- a/drivers/virtio/virtio_pci_modern_dev.c +++ b/drivers/virtio/virtio_pci_modern_dev.c @@ -268,7 +268,8 @@ int vp_modern_probe(struct virtio_pci_modern_device *mdev) return -EINVAL; } - err = dma_set_mask_and_coherent(&pci_dev->dev, DMA_BIT_MASK(64)); + err = dma_set_mask_and_coherent(&pci_dev->dev, + mdev->dma_mask ? : DMA_BIT_MASK(64)); if (err) err = dma_set_mask_and_coherent(&pci_dev->dev, DMA_BIT_MASK(32)); diff --git a/include/linux/virtio_pci_modern.h b/include/linux/virtio_pci_modern.h index e7b1db1dd0bb..067ac1d789bc 100644 --- a/include/linux/virtio_pci_modern.h +++ b/include/linux/virtio_pci_modern.h @@ -41,6 +41,9 @@ struct virtio_pci_modern_device { /* optional check for vendor virtio device, returns dev_id or -ERRNO */ int (*device_id_check)(struct pci_dev *pdev); + + /* optional mask for devices with limited DMA space */ + u64 dma_mask; }; /*
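These two overrides (patches 01 and 02) are consumed together; the intended usage appears in the pds_vdpa probe later in this series (patch 07/11), which boils down to the following. This is quoted from that patch, lightly condensed, not new API.

	/* From the pds_vdpa probe later in this digest: the vendor driver
	 * supplies its own ID check and a narrower DMA mask before handing
	 * the VF to the modern virtio-pci library.
	 */
	static int pds_vdpa_device_id_check(struct pci_dev *pdev)
	{
		if (pdev->device != PCI_DEVICE_ID_PENSANDO_VDPA_VF ||
		    pdev->vendor != PCI_VENDOR_ID_PENSANDO)
			return -ENODEV;

		/* Tell vp_modern_probe() which device id to store. */
		return PCI_DEVICE_ID_PENSANDO_VDPA_VF;
	}

	/* ... in probe ... */
	vdpa_aux->vd_mdev.pci_dev = padev->vf_pdev;
	vdpa_aux->vd_mdev.device_id_check = pds_vdpa_device_id_check;
	vdpa_aux->vd_mdev.dma_mask = DMA_BIT_MASK(PDS_CORE_ADDR_LEN);
	err = vp_modern_probe(&vdpa_aux->vd_mdev);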
Re: [PATCH v5 virtio 04/11] pds_vdpa: move enum from common to adminq header
On 2023/5/4 02:12, Shannon Nelson wrote: The pds_core_logical_qtype enum and IFNAMSIZ are not needed in the common PDS header, only needed when working with the adminq, so move them to the adminq header. Note: This patch might conflict with pds_vfio patches that are in review, depending on which patchset gets pulled first. Signed-off-by: Shannon Nelson Acked-by: Jason Wang Thanks --- include/linux/pds/pds_adminq.h | 21 +++++++++++++++++++++ include/linux/pds/pds_common.h | 21 --------------------- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/include/linux/pds/pds_adminq.h b/include/linux/pds/pds_adminq.h index 98a60ce87b92..61b0a8634e1a 100644 --- a/include/linux/pds/pds_adminq.h +++ b/include/linux/pds/pds_adminq.h @@ -222,6 +222,27 @@ enum pds_core_lif_type { PDS_CORE_LIF_TYPE_DEFAULT = 0, }; +#define PDS_CORE_IFNAMSIZ 16 + +/** + * enum pds_core_logical_qtype - Logical Queue Types + * @PDS_CORE_QTYPE_ADMINQ:Administrative Queue + * @PDS_CORE_QTYPE_NOTIFYQ: Notify Queue + * @PDS_CORE_QTYPE_RXQ: Receive Queue + * @PDS_CORE_QTYPE_TXQ: Transmit Queue + * @PDS_CORE_QTYPE_EQ:Event Queue + * @PDS_CORE_QTYPE_MAX: Max queue type supported + */ +enum pds_core_logical_qtype { + PDS_CORE_QTYPE_ADMINQ = 0, + PDS_CORE_QTYPE_NOTIFYQ = 1, + PDS_CORE_QTYPE_RXQ = 2, + PDS_CORE_QTYPE_TXQ = 3, + PDS_CORE_QTYPE_EQ = 4, + + PDS_CORE_QTYPE_MAX = 16 /* don't change - used in struct size */ +}; + /** * union pds_core_lif_config - LIF configuration * @state:LIF state (enum pds_core_lif_state) diff --git a/include/linux/pds/pds_common.h b/include/linux/pds/pds_common.h index 2a0d1669cfd0..435c8e8161c2 100644 --- a/include/linux/pds/pds_common.h +++ b/include/linux/pds/pds_common.h @@ -41,27 +41,6 @@ enum pds_core_vif_types { #define PDS_VDPA_DEV_NAME PDS_CORE_DRV_NAME "." PDS_DEV_TYPE_VDPA_STR -#define PDS_CORE_IFNAMSIZ 16 - -/** - * enum pds_core_logical_qtype - Logical Queue Types - * @PDS_CORE_QTYPE_ADMINQ:Administrative Queue - * @PDS_CORE_QTYPE_NOTIFYQ: Notify Queue - * @PDS_CORE_QTYPE_RXQ: Receive Queue - * @PDS_CORE_QTYPE_TXQ: Transmit Queue - * @PDS_CORE_QTYPE_EQ:Event Queue - * @PDS_CORE_QTYPE_MAX: Max queue type supported - */ -enum pds_core_logical_qtype { - PDS_CORE_QTYPE_ADMINQ = 0, - PDS_CORE_QTYPE_NOTIFYQ = 1, - PDS_CORE_QTYPE_RXQ = 2, - PDS_CORE_QTYPE_TXQ = 3, - PDS_CORE_QTYPE_EQ = 4, - - PDS_CORE_QTYPE_MAX = 16 /* don't change - used in struct size */ -}; - int pdsc_register_notify(struct notifier_block *nb); void pdsc_unregister_notify(struct notifier_block *nb); void *pdsc_get_pf_struct(struct pci_dev *vf_pdev);
Re: [PATCH v5 virtio 05/11] pds_vdpa: new adminq entries
On Thu, May 4, 2023 at 2:13 AM Shannon Nelson wrote: > > Add new adminq definitions in support for vDPA operations. > > Signed-off-by: Shannon Nelson Acked-by: Jason Wang Thanks > --- > include/linux/pds/pds_adminq.h | 266 + > 1 file changed, 266 insertions(+) > > diff --git a/include/linux/pds/pds_adminq.h b/include/linux/pds/pds_adminq.h > index 61b0a8634e1a..c66ead725434 100644 > --- a/include/linux/pds/pds_adminq.h > +++ b/include/linux/pds/pds_adminq.h > @@ -605,6 +605,257 @@ struct pds_core_q_init_comp { > u8 color; > }; > > +/* > + * enum pds_vdpa_cmd_opcode - vDPA Device commands > + */ > +enum pds_vdpa_cmd_opcode { > + PDS_VDPA_CMD_INIT = 48, > + PDS_VDPA_CMD_IDENT = 49, > + PDS_VDPA_CMD_RESET = 51, > + PDS_VDPA_CMD_VQ_RESET = 52, > + PDS_VDPA_CMD_VQ_INIT= 53, > + PDS_VDPA_CMD_STATUS_UPDATE = 54, > + PDS_VDPA_CMD_SET_FEATURES = 55, > + PDS_VDPA_CMD_SET_ATTR = 56, > + PDS_VDPA_CMD_VQ_SET_STATE = 57, > + PDS_VDPA_CMD_VQ_GET_STATE = 58, > +}; > + > +/** > + * struct pds_vdpa_cmd - generic command > + * @opcode:Opcode > + * @vdpa_index:Index for vdpa subdevice > + * @vf_id: VF id > + */ > +struct pds_vdpa_cmd { > + u8 opcode; > + u8 vdpa_index; > + __le16 vf_id; > +}; > + > +/** > + * struct pds_vdpa_init_cmd - INIT command > + * @opcode:Opcode PDS_VDPA_CMD_INIT > + * @vdpa_index: Index for vdpa subdevice > + * @vf_id: VF id > + */ > +struct pds_vdpa_init_cmd { > + u8 opcode; > + u8 vdpa_index; > + __le16 vf_id; > +}; > + > +/** > + * struct pds_vdpa_ident - vDPA identification data > + * @hw_features: vDPA features supported by device > + * @max_vqs: max queues available (2 queues for a single queuepair) > + * @max_qlen: log(2) of maximum number of descriptors > + * @min_qlen: log(2) of minimum number of descriptors > + * > + * This struct is used in a DMA block that is set up for the > PDS_VDPA_CMD_IDENT > + * transaction. Set up the DMA block and send the address in the IDENT cmd > + * data, the DSC will write the ident information, then we can remove the DMA > + * block after reading the answer. If the completion status is 0, then there > + * is valid information, else there was an error and the data should be > invalid. 
> + */ > +struct pds_vdpa_ident { > + __le64 hw_features; > + __le16 max_vqs; > + __le16 max_qlen; > + __le16 min_qlen; > +}; > + > +/** > + * struct pds_vdpa_ident_cmd - IDENT command > + * @opcode:Opcode PDS_VDPA_CMD_IDENT > + * @rsvd: Word boundary padding > + * @vf_id: VF id > + * @len: length of ident info DMA space > + * @ident_pa: address for DMA of ident info (struct pds_vdpa_ident) > + * only used for this transaction, then forgotten by DSC > + */ > +struct pds_vdpa_ident_cmd { > + u8 opcode; > + u8 rsvd; > + __le16 vf_id; > + __le32 len; > + __le64 ident_pa; > +}; > + > +/** > + * struct pds_vdpa_status_cmd - STATUS_UPDATE command > + * @opcode:Opcode PDS_VDPA_CMD_STATUS_UPDATE > + * @vdpa_index: Index for vdpa subdevice > + * @vf_id: VF id > + * @status:new status bits > + */ > +struct pds_vdpa_status_cmd { > + u8 opcode; > + u8 vdpa_index; > + __le16 vf_id; > + u8 status; > +}; > + > +/** > + * enum pds_vdpa_attr - List of VDPA device attributes > + * @PDS_VDPA_ATTR_MAC: MAC address > + * @PDS_VDPA_ATTR_MAX_VQ_PAIRS: Max virtqueue pairs > + */ > +enum pds_vdpa_attr { > + PDS_VDPA_ATTR_MAC = 1, > + PDS_VDPA_ATTR_MAX_VQ_PAIRS = 2, > +}; > + > +/** > + * struct pds_vdpa_setattr_cmd - SET_ATTR command > + * @opcode:Opcode PDS_VDPA_CMD_SET_ATTR > + * @vdpa_index:Index for vdpa subdevice > + * @vf_id: VF id > + * @attr: attribute to be changed (enum pds_vdpa_attr) > + * @pad: Word boundary padding > + * @mac: new mac address to be assigned as vdpa device address > + * @max_vq_pairs: new limit of virtqueue pairs > + */ > +struct pds_vdpa_setattr_cmd { > + u8 opcode; > + u8 vdpa_index; > + __le16 vf_id; > + u8 attr; > + u8 pad[3]; > + union { > + u8 mac[6]; > + __le16 max_vq_pairs; > + } __packed; > +};
Re: [PATCH v5 virtio 07/11] pds_vdpa: virtio bar setup for vdpa
On Thu, May 4, 2023 at 2:13 AM Shannon Nelson wrote: > > Prep and use the "modern" virtio bar utilities to get our > virtio config space ready. > > Signed-off-by: Shannon Nelson Acked-by: Jason Wang Thanks > --- > drivers/vdpa/pds/aux_drv.c | 25 +++++++++++++++++++++++++ > drivers/vdpa/pds/aux_drv.h | 3 +++ > 2 files changed, 28 insertions(+) > > diff --git a/drivers/vdpa/pds/aux_drv.c b/drivers/vdpa/pds/aux_drv.c > index aa748cf55d2b..0c4a135b1484 100644 > --- a/drivers/vdpa/pds/aux_drv.c > +++ b/drivers/vdpa/pds/aux_drv.c > @@ -4,6 +4,7 @@ > #include > #include > #include > +#include > > #include > #include > @@ -19,12 +20,22 @@ static const struct auxiliary_device_id > pds_vdpa_id_table[] = { > {}, > }; > > +static int pds_vdpa_device_id_check(struct pci_dev *pdev) > +{ > + if (pdev->device != PCI_DEVICE_ID_PENSANDO_VDPA_VF || > + pdev->vendor != PCI_VENDOR_ID_PENSANDO) > + return -ENODEV; > + > + return PCI_DEVICE_ID_PENSANDO_VDPA_VF; > +} > + > static int pds_vdpa_probe(struct auxiliary_device *aux_dev, > const struct auxiliary_device_id *id) > > { > struct pds_auxiliary_dev *padev = > container_of(aux_dev, struct pds_auxiliary_dev, aux_dev); > + struct device *dev = &aux_dev->dev; > struct pds_vdpa_aux *vdpa_aux; > int err; > > @@ -41,8 +52,21 @@ static int pds_vdpa_probe(struct auxiliary_device *aux_dev, > if (err) > goto err_free_mem; > > + /* Find the virtio configuration */ > + vdpa_aux->vd_mdev.pci_dev = padev->vf_pdev; > + vdpa_aux->vd_mdev.device_id_check = pds_vdpa_device_id_check; > + vdpa_aux->vd_mdev.dma_mask = DMA_BIT_MASK(PDS_CORE_ADDR_LEN); > + err = vp_modern_probe(&vdpa_aux->vd_mdev); > + if (err) { > + dev_err(dev, "Unable to probe for virtio configuration: > %pe\n", > + ERR_PTR(err)); > + goto err_free_mgmt_info; > + } > + > return 0; > > +err_free_mgmt_info: > + pci_free_irq_vectors(padev->vf_pdev); > err_free_mem: > kfree(vdpa_aux); > auxiliary_set_drvdata(aux_dev, NULL); > @@ -55,6 +79,7 @@ static void pds_vdpa_remove(struct auxiliary_device > *aux_dev) > { > struct pds_vdpa_aux *vdpa_aux = auxiliary_get_drvdata(aux_dev); > struct device *dev = &aux_dev->dev; > > + vp_modern_remove(&vdpa_aux->vd_mdev); > pci_free_irq_vectors(vdpa_aux->padev->vf_pdev); > > kfree(vdpa_aux); > diff --git a/drivers/vdpa/pds/aux_drv.h b/drivers/vdpa/pds/aux_drv.h > index dcec782e79eb..99e0ff340bfa 100644 > --- a/drivers/vdpa/pds/aux_drv.h > +++ b/drivers/vdpa/pds/aux_drv.h > @@ -4,6 +4,8 @@ > #ifndef _AUX_DRV_H_ > #define _AUX_DRV_H_ > > +#include > + > #define PDS_VDPA_DRV_DESCRIPTION"AMD/Pensando vDPA VF Device Driver" > #define PDS_VDPA_DRV_NAME KBUILD_MODNAME > > @@ -16,6 +18,7 @@ struct pds_vdpa_aux { > > int vf_id; > struct dentry *dentry; > + struct virtio_pci_modern_device vd_mdev; > > int nintrs; > }; > -- > 2.17.1 >
Re: [PATCH v5 virtio 10/11] pds_vdpa: subscribe to the pds_core events
On Thu, May 4, 2023 at 2:13 AM Shannon Nelson wrote: > > Register for the pds_core's notification events, primarily to > find out when the FW has been reset so we can pass this on > back up the chain. > > Signed-off-by: Shannon Nelson > --- > drivers/vdpa/pds/vdpa_dev.c | 68 - > drivers/vdpa/pds/vdpa_dev.h | 1 + > 2 files changed, 68 insertions(+), 1 deletion(-) > > diff --git a/drivers/vdpa/pds/vdpa_dev.c b/drivers/vdpa/pds/vdpa_dev.c > index 9970657cdb3d..377eefc2fa1e 100644 > --- a/drivers/vdpa/pds/vdpa_dev.c > +++ b/drivers/vdpa/pds/vdpa_dev.c > @@ -21,6 +21,61 @@ static struct pds_vdpa_device *vdpa_to_pdsv(struct > vdpa_device *vdpa_dev) > return container_of(vdpa_dev, struct pds_vdpa_device, vdpa_dev); > } > > +static int pds_vdpa_notify_handler(struct notifier_block *nb, > + unsigned long ecode, > + void *data) > +{ > + struct pds_vdpa_device *pdsv = container_of(nb, struct > pds_vdpa_device, nb); > + struct device *dev = &pdsv->vdpa_aux->padev->aux_dev.dev; > + > + dev_dbg(dev, "%s: event code %lu\n", __func__, ecode); > + > + /* Give the upper layers a hint that something interesting > +* may have happened. It seems that the only thing this > +* triggers in the virtio-net drivers above us is a check > +* of link status. > +* > +* We don't set the NEEDS_RESET flag for EVENT_RESET > +* because we're likely going through a recovery or > +* fw_update and will be back up and running soon. > +*/ > + if (ecode == PDS_EVENT_RESET || ecode == PDS_EVENT_LINK_CHANGE) { The code here seems to conflict with the comment above. If we don't set NEEDS_RESET, there's no need for the config callback? Thanks > + if (pdsv->config_cb.callback) > + pdsv->config_cb.callback(pdsv->config_cb.private); > + } > + > + return 0; > +} > + > +static int pds_vdpa_register_event_handler(struct pds_vdpa_device *pdsv) > +{ > + struct device *dev = &pdsv->vdpa_aux->padev->aux_dev.dev; > + struct notifier_block *nb = &pdsv->nb; > + int err; > + > + if (!nb->notifier_call) { > + nb->notifier_call = pds_vdpa_notify_handler; > + err = pdsc_register_notify(nb); > + if (err) { > + nb->notifier_call = NULL; > + dev_err(dev, "failed to register pds event handler: > %ps\n", > + ERR_PTR(err)); > + return -EINVAL; > + } > + dev_dbg(dev, "pds event handler registered\n"); > + } > + > + return 0; > +} > + > +static void pds_vdpa_unregister_event_handler(struct pds_vdpa_device *pdsv) > +{ > + if (pdsv->nb.notifier_call) { > + pdsc_unregister_notify(&pdsv->nb); > + pdsv->nb.notifier_call = NULL; > + } > +} > + > static int pds_vdpa_set_vq_address(struct vdpa_device *vdpa_dev, u16 qid, >u64 desc_addr, u64 driver_addr, u64 > device_addr) > { > @@ -522,6 +577,12 @@ static int pds_vdpa_dev_add(struct vdpa_mgmt_dev *mdev, > const char *name, > > pdsv->vdpa_dev.mdev = &vdpa_aux->vdpa_mdev; > > + err = pds_vdpa_register_event_handler(pdsv); > + if (err) { > + dev_err(dev, "Failed to register for PDS events: %pe\n", > ERR_PTR(err)); > + goto err_unmap; > + } > + > /* We use the _vdpa_register_device() call rather than the > * vdpa_register_device() to avoid a deadlock because our > * dev_add() is called with the vdpa_dev_lock already set > @@ -530,13 +591,15 @@ static int pds_vdpa_dev_add(struct vdpa_mgmt_dev *mdev, > const char *name, > err = _vdpa_register_device(&pdsv->vdpa_dev, pdsv->num_vqs); > if (err) { > dev_err(dev, "Failed to register to vDPA bus: %pe\n", > ERR_PTR(err)); > - goto err_unmap; > + goto err_unevent; > } > > pds_vdpa_debugfs_add_vdpadev(vdpa_aux); > > return 0; > > +err_unevent: > + 
pds_vdpa_unregister_event_handler(pdsv); > err_unmap: > put_device(&pdsv->vdpa_dev.dev); > vdpa_aux->pdsv = NULL; > @@ -546,8 +609,11 @@ static int pds_vdpa_dev_add(struct vdpa_mgmt_dev *mdev, > const char *name, > static void pds_vdpa_dev_del(struct vdpa_mgmt_dev *mdev, > struct vdpa_device *vdpa_dev) > { > + struct pds_vdpa_device *pdsv = vdpa_to_pdsv(vdpa_dev); > struct pds_vdpa_aux *vdpa_aux; > > + pds_vdpa_unregister_event_handler(pdsv); > + > vdpa_aux = container_of(mdev, struct pds_vdpa_aux, vdpa_mdev); > _vdpa_unregister_device(vdpa_dev); > > diff --git a/drivers/vdpa/pds/vdpa_dev.h b/drivers/vdpa/pds/vdpa_dev.h > index a21596f438c1..1650a2b08845 100644 > --- a/drivers/vdpa/pds/vdpa_dev.h > +++ b/drivers/vdpa/pds/vdpa_dev.h
Re: [PATCH v5 virtio 08/11] pds_vdpa: add vdpa config client commands
On Thu, May 4, 2023 at 2:13 AM Shannon Nelson wrote: > > These are the adminq commands that will be needed for > setting up and using the vDPA device. There are a number > of commands defined in the FW's API, but by making use of > the FW's virtio BAR we only need a few of these commands > for vDPA support. > > Signed-off-by: Shannon Nelson Acked-by: Jason Wang Thanks > --- > drivers/vdpa/pds/Makefile | 1 + > drivers/vdpa/pds/cmds.c | 207 > drivers/vdpa/pds/cmds.h | 20 > drivers/vdpa/pds/vdpa_dev.h | 33 +- > 4 files changed, 260 insertions(+), 1 deletion(-) > create mode 100644 drivers/vdpa/pds/cmds.c > create mode 100644 drivers/vdpa/pds/cmds.h > > diff --git a/drivers/vdpa/pds/Makefile b/drivers/vdpa/pds/Makefile > index 13b50394ec64..2e22418e3ab3 100644 > --- a/drivers/vdpa/pds/Makefile > +++ b/drivers/vdpa/pds/Makefile > @@ -4,6 +4,7 @@ > obj-$(CONFIG_PDS_VDPA) := pds_vdpa.o > > pds_vdpa-y := aux_drv.o \ > + cmds.o \ > vdpa_dev.o > > pds_vdpa-$(CONFIG_DEBUG_FS) += debugfs.o > diff --git a/drivers/vdpa/pds/cmds.c b/drivers/vdpa/pds/cmds.c > new file mode 100644 > index ..405711a0a0f8 > --- /dev/null > +++ b/drivers/vdpa/pds/cmds.c > @@ -0,0 +1,207 @@ > +// SPDX-License-Identifier: GPL-2.0-only > +/* Copyright(c) 2023 Advanced Micro Devices, Inc */ > + > +#include > +#include > + > +#include > +#include > +#include > +#include > + > +#include "vdpa_dev.h" > +#include "aux_drv.h" > +#include "cmds.h" > + > +int pds_vdpa_init_hw(struct pds_vdpa_device *pdsv) > +{ > + struct pds_auxiliary_dev *padev = pdsv->vdpa_aux->padev; > + struct device *dev = &padev->aux_dev.dev; > + union pds_core_adminq_cmd cmd = { > + .vdpa_init.opcode = PDS_VDPA_CMD_INIT, > + .vdpa_init.vdpa_index = pdsv->vdpa_index, > + .vdpa_init.vf_id = cpu_to_le16(pdsv->vdpa_aux->vf_id), > + }; > + union pds_core_adminq_comp comp = {}; > + int err; > + > + /* Initialize the vdpa/virtio device */ > + err = pds_client_adminq_cmd(padev, &cmd, sizeof(cmd.vdpa_init), > + &comp, 0); > + if (err) > + dev_dbg(dev, "Failed to init hw, status %d: %pe\n", > + comp.status, ERR_PTR(err)); > + > + return err; > +} > + > +int pds_vdpa_cmd_reset(struct pds_vdpa_device *pdsv) > +{ > + struct pds_auxiliary_dev *padev = pdsv->vdpa_aux->padev; > + struct device *dev = &padev->aux_dev.dev; > + union pds_core_adminq_cmd cmd = { > + .vdpa.opcode = PDS_VDPA_CMD_RESET, > + .vdpa.vdpa_index = pdsv->vdpa_index, > + .vdpa.vf_id = cpu_to_le16(pdsv->vdpa_aux->vf_id), > + }; > + union pds_core_adminq_comp comp = {}; > + int err; > + > + err = pds_client_adminq_cmd(padev, &cmd, sizeof(cmd.vdpa), &comp, 0); > + if (err) > + dev_dbg(dev, "Failed to reset hw, status %d: %pe\n", > + comp.status, ERR_PTR(err)); > + > + return err; > +} > + > +int pds_vdpa_cmd_set_mac(struct pds_vdpa_device *pdsv, u8 *mac) > +{ > + struct pds_auxiliary_dev *padev = pdsv->vdpa_aux->padev; > + struct device *dev = &padev->aux_dev.dev; > + union pds_core_adminq_cmd cmd = { > + .vdpa_setattr.opcode = PDS_VDPA_CMD_SET_ATTR, > + .vdpa_setattr.vdpa_index = pdsv->vdpa_index, > + .vdpa_setattr.vf_id = cpu_to_le16(pdsv->vdpa_aux->vf_id), > + .vdpa_setattr.attr = PDS_VDPA_ATTR_MAC, > + }; > + union pds_core_adminq_comp comp = {}; > + int err; > + > + ether_addr_copy(cmd.vdpa_setattr.mac, mac); > + err = pds_client_adminq_cmd(padev, &cmd, sizeof(cmd.vdpa_setattr), > + &comp, 0); > + if (err) > + dev_dbg(dev, "Failed to set mac address %pM, status %d: > %pe\n", > + mac, comp.status, ERR_PTR(err)); > + > + return err; > +} > + > +int pds_vdpa_cmd_set_max_vq_pairs(struct pds_vdpa_device *pdsv, 
u16 max_vqp) > +{ > + struct pds_auxiliary_dev *padev = pdsv->vdpa_aux->padev; > + struct device *dev = &padev->aux_dev.dev; > + union pds_core_adminq_cmd cmd = { > + .vdpa_setattr.opcode = PDS_VDPA_CMD_SET_ATTR, > + .vdpa_setattr.vdpa_index = pdsv->vdpa_index, > + .vdpa_setattr.vf_id = cpu_to_le16(pdsv->vdpa_aux->vf_id), > + .vdpa_setattr.attr = PDS_VDPA_ATTR_MAX_VQ_PAIRS, > + .vdpa_setattr.max_vq_pairs = cpu_to_le16(max_vqp), > + };
Re: [PATCH v5 virtio 09/11] pds_vdpa: add support for vdpa and vdpamgmt interfaces
On Thu, May 4, 2023 at 2:13 AM Shannon Nelson wrote: > > This is the vDPA device support, where we advertise that we can > support the virtio queues and deal with the configuration work > through the pds_core's adminq. > > Signed-off-by: Shannon Nelson Acked-by: Jason Wang Thanks > --- > drivers/vdpa/pds/aux_drv.c | 15 + > drivers/vdpa/pds/aux_drv.h | 1 + > drivers/vdpa/pds/debugfs.c | 261 ++ > drivers/vdpa/pds/debugfs.h | 5 + > drivers/vdpa/pds/vdpa_dev.c | 532 +++- > 5 files changed, 813 insertions(+), 1 deletion(-) > > diff --git a/drivers/vdpa/pds/aux_drv.c b/drivers/vdpa/pds/aux_drv.c > index 0c4a135b1484..186e9ee22eb1 100644 > --- a/drivers/vdpa/pds/aux_drv.c > +++ b/drivers/vdpa/pds/aux_drv.c > @@ -63,8 +63,21 @@ static int pds_vdpa_probe(struct auxiliary_device *aux_dev, > goto err_free_mgmt_info; > } > > + /* Let vdpa know that we can provide devices */ > + err = vdpa_mgmtdev_register(&vdpa_aux->vdpa_mdev); > + if (err) { > + dev_err(dev, "%s: Failed to initialize vdpa_mgmt interface: > %pe\n", > + __func__, ERR_PTR(err)); > + goto err_free_virtio; > + } > + > + pds_vdpa_debugfs_add_pcidev(vdpa_aux); > + pds_vdpa_debugfs_add_ident(vdpa_aux); > + > return 0; > > +err_free_virtio: > + vp_modern_remove(&vdpa_aux->vd_mdev); > err_free_mgmt_info: > pci_free_irq_vectors(padev->vf_pdev); > err_free_mem: > @@ -79,9 +92,11 @@ static void pds_vdpa_remove(struct auxiliary_device > *aux_dev) > struct pds_vdpa_aux *vdpa_aux = auxiliary_get_drvdata(aux_dev); > struct device *dev = &aux_dev->dev; > > + vdpa_mgmtdev_unregister(&vdpa_aux->vdpa_mdev); > vp_modern_remove(&vdpa_aux->vd_mdev); > pci_free_irq_vectors(vdpa_aux->padev->vf_pdev); > > + pds_vdpa_debugfs_del_vdpadev(vdpa_aux); > kfree(vdpa_aux); > auxiliary_set_drvdata(aux_dev, NULL); > > diff --git a/drivers/vdpa/pds/aux_drv.h b/drivers/vdpa/pds/aux_drv.h > index 99e0ff340bfa..26b75344156e 100644 > --- a/drivers/vdpa/pds/aux_drv.h > +++ b/drivers/vdpa/pds/aux_drv.h > @@ -13,6 +13,7 @@ struct pds_vdpa_aux { > struct pds_auxiliary_dev *padev; > > struct vdpa_mgmt_dev vdpa_mdev; > + struct pds_vdpa_device *pdsv; > > struct pds_vdpa_ident ident; > > diff --git a/drivers/vdpa/pds/debugfs.c b/drivers/vdpa/pds/debugfs.c > index d91dceb07380..0ecd0e2ec6b9 100644 > --- a/drivers/vdpa/pds/debugfs.c > +++ b/drivers/vdpa/pds/debugfs.c > @@ -10,6 +10,7 @@ > #include > > #include "aux_drv.h" > +#include "vdpa_dev.h" > #include "debugfs.h" > > static struct dentry *dbfs_dir; > @@ -24,3 +25,263 @@ void pds_vdpa_debugfs_destroy(void) > debugfs_remove_recursive(dbfs_dir); > dbfs_dir = NULL; > } > + > +#define PRINT_SBIT_NAME(__seq, __f, __name) \ > + do {\ > + if ((__f) & (__name)) \ > + seq_printf(__seq, " %s", &#__name[16]); \ > + } while (0) > + > +static void print_status_bits(struct seq_file *seq, u8 status) > +{ > + seq_puts(seq, "status:"); > + PRINT_SBIT_NAME(seq, status, VIRTIO_CONFIG_S_ACKNOWLEDGE); > + PRINT_SBIT_NAME(seq, status, VIRTIO_CONFIG_S_DRIVER); > + PRINT_SBIT_NAME(seq, status, VIRTIO_CONFIG_S_DRIVER_OK); > + PRINT_SBIT_NAME(seq, status, VIRTIO_CONFIG_S_FEATURES_OK); > + PRINT_SBIT_NAME(seq, status, VIRTIO_CONFIG_S_NEEDS_RESET); > + PRINT_SBIT_NAME(seq, status, VIRTIO_CONFIG_S_FAILED); > + seq_puts(seq, "\n"); > +} > + > +static void print_feature_bits_all(struct seq_file *seq, u64 features) > +{ > + int i; > + > + seq_puts(seq, "features:"); > + > + for (i = 0; i < (sizeof(u64) * 8); i++) { > + u64 mask = BIT_ULL(i); > + > + switch (features & mask) { > + case BIT_ULL(VIRTIO_NET_F_CSUM): > + seq_puts(seq, " VIRTIO_NET_F_CSUM"); > + 
break; > + case BIT_ULL(VIRTIO_NET_F_GUEST_CSUM): > + seq_puts(seq, " VIRTIO_NET_F_GUEST_CSUM"); > + break; > + case BIT_ULL(VIRTIO_NET_F_CTRL_GUEST_OFFLOADS): > + seq_puts(seq, " VIRTIO_NET_F_CTRL_GUEST_OFFLOADS"); > + break;
Re: [PATCH v5 virtio 11/11] pds_vdpa: pds_vdps.rst and Kconfig
On Thu, May 4, 2023 at 2:13 AM Shannon Nelson wrote: > > Add the documentation and Kconfig entry for pds_vdpa driver. > > Signed-off-by: Shannon Nelson > --- > .../device_drivers/ethernet/amd/pds_vdpa.rst | 85 +++ > .../device_drivers/ethernet/index.rst | 1 + > MAINTAINERS | 4 + > drivers/vdpa/Kconfig | 8 ++ > 4 files changed, 98 insertions(+) > create mode 100644 > Documentation/networking/device_drivers/ethernet/amd/pds_vdpa.rst > > diff --git > a/Documentation/networking/device_drivers/ethernet/amd/pds_vdpa.rst > b/Documentation/networking/device_drivers/ethernet/amd/pds_vdpa.rst > new file mode 100644 > index ..587927d3de92 > --- /dev/null > +++ b/Documentation/networking/device_drivers/ethernet/amd/pds_vdpa.rst > @@ -0,0 +1,85 @@ > +.. SPDX-License-Identifier: GPL-2.0+ > +.. note: can be edited and viewed with /usr/bin/formiko-vim > + > +== > +PCI vDPA driver for the AMD/Pensando(R) DSC adapter family > +== > + > +AMD/Pensando vDPA VF Device Driver > + > +Copyright(c) 2023 Advanced Micro Devices, Inc > + > +Overview > + > + > +The ``pds_vdpa`` driver is an auxiliary bus driver that supplies > +a vDPA device for use by the virtio network stack. It is used with > +the Pensando Virtual Function devices that offer vDPA and virtio queue > +services. It depends on the ``pds_core`` driver and hardware for the PF > +and VF PCI handling as well as for device configuration services. > + > +Using the device > + > + > +The ``pds_vdpa`` device is enabled via multiple configuration steps and > +depends on the ``pds_core`` driver to create and enable SR-IOV Virtual > +Function devices. After the VFs are enabled, we enable the vDPA service > +in the ``pds_core`` device to create the auxiliary devices used by pds_vdpa. > + > +Example steps: > + > +.. code-block:: bash > + > + #!/bin/bash > + > + modprobe pds_core > + modprobe vdpa > + modprobe pds_vdpa > + > + PF_BDF=`ls /sys/module/pds_core/drivers/pci\:pds_core/*/sriov_numvfs | awk > -F / '{print $7}'` > + > + # Enable vDPA VF auxiliary device(s) in the PF > + devlink dev param set pci/$PF_BDF name enable_vnet cmode runtime value true > + > + # Create a VF for vDPA use > + echo 1 > /sys/bus/pci/drivers/pds_core/$PF_BDF/sriov_numvfs > + > + # Find the vDPA services/devices available > + PDS_VDPA_MGMT=`vdpa mgmtdev show | grep vDPA | head -1 | cut -d: -f1` > + > + # Create a vDPA device for use in virtio network configurations > + vdpa dev add name vdpa1 mgmtdev $PDS_VDPA_MGMT mac 00:11:22:33:44:55 > + > + # Set up an ethernet interface on the vdpa device > + modprobe virtio_vdpa > + > + > + > +Enabling the driver > +=== > + > +The driver is enabled via the standard kernel configuration system, > +using the make command:: > + > + make oldconfig/menuconfig/etc. 
> + > +The driver is located in the menu structure at: > + > + -> Device Drivers > +-> Network device support (NETDEVICES [=y]) > + -> Ethernet driver support > +-> Pensando devices > + -> Pensando Ethernet PDS_VDPA Support > + > +Support > +=== > + > +For general Linux networking support, please use the netdev mailing > +list, which is monitored by Pensando personnel:: > + > + net...@vger.kernel.org > + > +For more specific support needs, please use the Pensando driver support > +email:: > + > + driv...@pensando.io > diff --git a/Documentation/networking/device_drivers/ethernet/index.rst > b/Documentation/networking/device_drivers/ethernet/index.rst > index 417ca514a4d0..94ecb67c0885 100644 > --- a/Documentation/networking/device_drivers/ethernet/index.rst > +++ b/Documentation/networking/device_drivers/ethernet/index.rst > @@ -15,6 +15,7 @@ Contents: > amazon/ena > altera/altera_tse > amd/pds_core > + amd/pds_vdpa > aquantia/atlantic > chelsio/cxgb > cirrus/cs89x0 > diff --git a/MAINTAINERS b/MAINTAINERS > index ebd26b3ca90e..c565b71ce56f 100644 > --- a/MAINTAINERS > +++ b/MAINTAINERS > @@ -22200,6 +22200,10 @@ SNET DPU VIRTIO DATA PATH ACCELERATOR > R: Alvaro Karsz > F: drivers/vdpa/solidrun/ > > +PDS DSC VIRTIO DATA PATH ACCELERATOR > +R: Shannon Nelson > +F: drivers/vdpa/pds/ > + > VIRTIO BALLOON > M: "Michael S. Tsirkin" > M: David Hildenbrand > diff --git a/drivers/vdpa/Kconfig b/drivers/vdpa/Kconfig > index cd6ad92f3f05..2ee1b288691d 100644 > --- a/drivers/vdpa/Kconfig > +++ b/drivers/vdpa/Kconfig > @@ -116,4 +116,12 @@ config ALIBABA_ENI_VDPA > This driver includes a HW monitor device that > reads health values from the DPU. > > +config PDS_VDPA > + tristate "vDPA driver for AMD/Pensando DSC devices" > + depends on PDS_CORE Need to select VIRTIO_PCI_LIB? Thanks > + help > + vDPA network
Re: [PATCH net-next V2 1/2] virtio-net: convert rx mode setting to use workqueue
On Mon, May 15, 2023 at 12:45 PM Michael S. Tsirkin wrote: > > On Mon, May 15, 2023 at 09:05:54AM +0800, Jason Wang wrote: > > On Wed, May 10, 2023 at 1:33 PM Michael S. Tsirkin wrote: > > > > > > On Mon, Apr 17, 2023 at 11:40:58AM +0800, Jason Wang wrote: > > > > On Fri, Apr 14, 2023 at 3:21 PM Michael S. Tsirkin > > > > wrote: > > > > > > > > > > On Fri, Apr 14, 2023 at 01:04:15PM +0800, Jason Wang wrote: > > > > > > Forget to cc netdev, adding. > > > > > > > > > > > > On Fri, Apr 14, 2023 at 12:25 AM Michael S. Tsirkin > > > > > > wrote: > > > > > > > > > > > > > > On Thu, Apr 13, 2023 at 02:40:26PM +0800, Jason Wang wrote: > > > > > > > > This patch convert rx mode setting to be done in a workqueue, this is > > > > > > > > a must for allow to sleep when waiting for the cvq command to > > > > > > > > response since current code is executed under addr spin lock. > > > > > > > > > > > > > > > > Signed-off-by: Jason Wang > > > > > > > > > > > > > > I don't like this frankly. This means that setting RX mode which would > > > > > > > previously be reliable, now becomes unreliable. > > > > > > > > > > > > It is "unreliable" by design: > > > > > > > > > > > > void(*ndo_set_rx_mode)(struct net_device *dev); > > > > > > > > > > > > > - first of all configuration is no longer immediate > > > > > > > > > > > > Is immediate a hard requirement? I can see a workqueue is used at least: > > > > > > > > > > > > mlx5e, ipoib, efx, ... > > > > > > > > > > > > > and there is no way for driver to find out when > > > > > > > it actually took effect > > > > > > > > > > > > But we know rx mode is best effort e.g it doesn't support vhost and we > > > > > > survive from this for years. > > > > > > > > > > > > > - second, if device fails command, this is also not > > > > > > > propagated to driver, again no way for driver to find out > > > > > > > > > > > > > > VDUSE needs to be fixed to do tricks to fix this > > > > > > > without breaking normal drivers. > > > > > > > > > > > > It's not specific to VDUSE. For example, when using virtio-net in the > > > > > > UP environment with any software cvq (like mlx5 via vDPA or cma > > > > > > transport). > > > > > > > > > > > > Thanks > > > > > > > > > > Hmm. Can we differentiate between these use-cases? > > > > > > > > It doesn't look easy since we are drivers for virtio bus. Underlayer > > > > details were hidden from virtio-net. > > > > > > > > Or do you have any ideas on this? > > > > > > > > Thanks > > > > > > I don't know, pass some kind of flag in struct virtqueue? > > > "bool slow; /* This vq can be very slow sometimes. Don't wait for it! > > > */" > > > > > > ? > > > > So if it's slow, sleep, otherwise poll? > > > > I feel setting this flag might be tricky, since the driver doesn't > > know whether or not it's really slow. E.g smartNIC vendor may allow > > virtio-net emulation over PCI. > > > > Thanks > > driver will have the choice, depending on whether > vq is deterministic or not. Ok, but the problem is, such booleans are only useful for virtio ring codes. But in this case, virtio-net knows what to do for cvq. So I'm not sure who the user is. Thanks > > > > > > > > -- > > > > > MST > > > > >
Re: [PATCH net-next V2 1/2] virtio-net: convert rx mode setting to use workqueue
On Mon, May 15, 2023 at 6:17 PM Michael S. Tsirkin wrote: > > On Mon, May 15, 2023 at 01:13:33PM +0800, Jason Wang wrote: > > On Mon, May 15, 2023 at 12:45 PM Michael S. Tsirkin wrote: > > > > > > On Mon, May 15, 2023 at 09:05:54AM +0800, Jason Wang wrote: > > > > On Wed, May 10, 2023 at 1:33 PM Michael S. Tsirkin > > > > wrote: > > > > > > > > > > On Mon, Apr 17, 2023 at 11:40:58AM +0800, Jason Wang wrote: > > > > > > On Fri, Apr 14, 2023 at 3:21 PM Michael S. Tsirkin > > > > > > wrote: > > > > > > > > > > > > > > On Fri, Apr 14, 2023 at 01:04:15PM +0800, Jason Wang wrote: > > > > > > > > Forget to cc netdev, adding. > > > > > > > > > > > > > > > > On Fri, Apr 14, 2023 at 12:25 AM Michael S. Tsirkin > > > > > > > > wrote: > > > > > > > > > > > > > > > > > > On Thu, Apr 13, 2023 at 02:40:26PM +0800, Jason Wang wrote: > > > > > > > > > > This patch convert rx mode setting to be done in a > > > > > > > > > > workqueue, this is > > > > > > > > > > a must for allow to sleep when waiting for the cvq command > > > > > > > > > > to > > > > > > > > > > response since current code is executed under addr spin > > > > > > > > > > lock. > > > > > > > > > > > > > > > > > > > > Signed-off-by: Jason Wang > > > > > > > > > > > > > > > > > > I don't like this frankly. This means that setting RX mode > > > > > > > > > which would > > > > > > > > > previously be reliable, now becomes unreliable. > > > > > > > > > > > > > > > > It is "unreliable" by design: > > > > > > > > > > > > > > > > void(*ndo_set_rx_mode)(struct > > > > > > > > net_device *dev); > > > > > > > > > > > > > > > > > - first of all configuration is no longer immediate > > > > > > > > > > > > > > > > Is immediate a hard requirement? I can see a workqueue is used > > > > > > > > at least: > > > > > > > > > > > > > > > > mlx5e, ipoib, efx, ... > > > > > > > > > > > > > > > > > and there is no way for driver to find out when > > > > > > > > > it actually took effect > > > > > > > > > > > > > > > > But we know rx mode is best effort e.g it doesn't support vhost > > > > > > > > and we > > > > > > > > survive from this for years. > > > > > > > > > > > > > > > > > - second, if device fails command, this is also not > > > > > > > > > propagated to driver, again no way for driver to find out > > > > > > > > > > > > > > > > > > VDUSE needs to be fixed to do tricks to fix this > > > > > > > > > without breaking normal drivers. > > > > > > > > > > > > > > > > It's not specific to VDUSE. For example, when using virtio-net > > > > > > > > in the > > > > > > > > UP environment with any software cvq (like mlx5 via vDPA or cma > > > > > > > > transport). > > > > > > > > > > > > > > > > Thanks > > > > > > > > > > > > > > Hmm. Can we differentiate between these use-cases? > > > > > > > > > > > > It doesn't look easy since we are drivers for virtio bus. Underlayer > > > > > > details were hidden from virtio-net. > > > > > > > > > > > > Or do you have any ideas on this? > > > > > > > > > > > > Thanks > > > > > > > > > > I don't know, pass some kind of flag in struct virtqueue? > > > > > "bool slow; /* This vq can be very slow sometimes. Don't wait > > > > > for it! */" > > > > > > > > > > ? > > > > > > > > > > > > > So if it's slow, sleep, otherwise poll? > > > > > > > > I feel setting this flag might be tricky, since the driver doesn't > > > > know whether or not it's really slow. E.g smartNIC vendor may allow > > > > virtio-net emulation over PCI. 
> > > > > > > > Thanks > > > > > > > driver will have the choice, depending on whether > > > vq is deterministic or not. > > > > Ok, but the problem is, such booleans are only useful for virtio ring > > codes. But in this case, virtio-net knows what to do for cvq. So I'm > > not sure who the user is. > > > > Thanks > > Circling back, what exactly does the architecture you are trying > to fix look like? Who is going to introduce unbounded latency? > The hypervisor? The hypervisor is one of the possible reasons; we have more: a hardware device that provides virtio-pci emulation, or userspace devices like VDUSE. > If so do we not maybe want a new feature bit > that documents this? Hypervisor then can detect old guests > that spin and decide what to do, e.g. prioritise cvq more, > or fail FEATURES_OK. We suffer from this on bare metal as well. But the question is: what's wrong with the approach used in this patch? I've answered that set_rx_mode is not reliable, so it should be fine to use a workqueue. Apart from this, is there anything else that worries you? Thanks > > > > > > > > > > > > -- > > > > > MST > > > > > > > > >
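For concreteness, the per-virtqueue hint Michael floated earlier in this thread would look something like the sketch below. This is purely illustrative: no such field exists in the tree, and Jason's objection is precisely that the transport often cannot know whether the backend is "slow".

	/* Purely illustrative sketch of the proposed hint, not existing
	 * API: the transport marks a vq whose completions may take
	 * unbounded time, and the driver picks sleep vs. busy-poll.
	 */
	struct virtqueue {
		/* ... existing fields ... */
		bool slow;	/* completions may take very long; don't spin */
	};

	static bool virtnet_wait_cvq(struct virtnet_info *vi)
	{
		unsigned int len;

		if (vi->cvq->slow)
			return false;	/* defer to process context instead */

		/* Deterministic backend: busy-poll as the driver does today. */
		while (!virtqueue_get_buf(vi->cvq, &len) &&
		       !virtqueue_is_broken(vi->cvq))
			cpu_relax();

		return true;
	}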
Re: [PATCH v6 virtio 11/11] pds_vdpa: pds_vdps.rst and Kconfig
On Tue, May 16, 2023 at 10:56 AM Shannon Nelson wrote: > > Add the documentation and Kconfig entry for pds_vdpa driver. > > Signed-off-by: Shannon Nelson Acked-by: Jason Wang Thanks > --- > .../device_drivers/ethernet/amd/pds_vdpa.rst | 85 +++ > .../device_drivers/ethernet/index.rst | 1 + > MAINTAINERS | 4 + > drivers/vdpa/Kconfig | 10 +++ > 4 files changed, 100 insertions(+) > create mode 100644 > Documentation/networking/device_drivers/ethernet/amd/pds_vdpa.rst > > diff --git > a/Documentation/networking/device_drivers/ethernet/amd/pds_vdpa.rst > b/Documentation/networking/device_drivers/ethernet/amd/pds_vdpa.rst > new file mode 100644 > index ..587927d3de92 > --- /dev/null > +++ b/Documentation/networking/device_drivers/ethernet/amd/pds_vdpa.rst > @@ -0,0 +1,85 @@ > +.. SPDX-License-Identifier: GPL-2.0+ > +.. note: can be edited and viewed with /usr/bin/formiko-vim > + > +== > +PCI vDPA driver for the AMD/Pensando(R) DSC adapter family > +== > + > +AMD/Pensando vDPA VF Device Driver > + > +Copyright(c) 2023 Advanced Micro Devices, Inc > + > +Overview > + > + > +The ``pds_vdpa`` driver is an auxiliary bus driver that supplies > +a vDPA device for use by the virtio network stack. It is used with > +the Pensando Virtual Function devices that offer vDPA and virtio queue > +services. It depends on the ``pds_core`` driver and hardware for the PF > +and VF PCI handling as well as for device configuration services. > + > +Using the device > + > + > +The ``pds_vdpa`` device is enabled via multiple configuration steps and > +depends on the ``pds_core`` driver to create and enable SR-IOV Virtual > +Function devices. After the VFs are enabled, we enable the vDPA service > +in the ``pds_core`` device to create the auxiliary devices used by pds_vdpa. > + > +Example steps: > + > +.. code-block:: bash > + > + #!/bin/bash > + > + modprobe pds_core > + modprobe vdpa > + modprobe pds_vdpa > + > + PF_BDF=`ls /sys/module/pds_core/drivers/pci\:pds_core/*/sriov_numvfs | awk > -F / '{print $7}'` > + > + # Enable vDPA VF auxiliary device(s) in the PF > + devlink dev param set pci/$PF_BDF name enable_vnet cmode runtime value true > + > + # Create a VF for vDPA use > + echo 1 > /sys/bus/pci/drivers/pds_core/$PF_BDF/sriov_numvfs > + > + # Find the vDPA services/devices available > + PDS_VDPA_MGMT=`vdpa mgmtdev show | grep vDPA | head -1 | cut -d: -f1` > + > + # Create a vDPA device for use in virtio network configurations > + vdpa dev add name vdpa1 mgmtdev $PDS_VDPA_MGMT mac 00:11:22:33:44:55 > + > + # Set up an ethernet interface on the vdpa device > + modprobe virtio_vdpa > + > + > + > +Enabling the driver > +=== > + > +The driver is enabled via the standard kernel configuration system, > +using the make command:: > + > + make oldconfig/menuconfig/etc. 
> + > +The driver is located in the menu structure at: > + > + -> Device Drivers > +-> Network device support (NETDEVICES [=y]) > + -> Ethernet driver support > +-> Pensando devices > + -> Pensando Ethernet PDS_VDPA Support > + > +Support > +=== > + > +For general Linux networking support, please use the netdev mailing > +list, which is monitored by Pensando personnel:: > + > + net...@vger.kernel.org > + > +For more specific support needs, please use the Pensando driver support > +email:: > + > + driv...@pensando.io > diff --git a/Documentation/networking/device_drivers/ethernet/index.rst > b/Documentation/networking/device_drivers/ethernet/index.rst > index 417ca514a4d0..94ecb67c0885 100644 > --- a/Documentation/networking/device_drivers/ethernet/index.rst > +++ b/Documentation/networking/device_drivers/ethernet/index.rst > @@ -15,6 +15,7 @@ Contents: > amazon/ena > altera/altera_tse > amd/pds_core > + amd/pds_vdpa > aquantia/atlantic > chelsio/cxgb > cirrus/cs89x0 > diff --git a/MAINTAINERS b/MAINTAINERS > index e2fd64c2ebdc..c3f509eeaf1d 100644 > --- a/MAINTAINERS > +++ b/MAINTAINERS > @@ -22296,6 +22296,10 @@ F: include/linux/vringh.h > F: include/uapi/linux/virtio_*.h > F: tools/virtio/ > > +PDS DSC VIRTIO DATA PATH ACCELERATOR > +R: Shannon Nelson > +F: drivers/vdpa/pds/ > + > VIRTIO CRYPTO DRIVER > M: Gonglei > L: virtualization@lists.linux-foundation.org > diff --
Re: [PATCH v6 virtio 10/11] pds_vdpa: subscribe to the pds_core events
On Tue, May 16, 2023 at 10:56 AM Shannon Nelson wrote: > > Register for the pds_core's notification events, primarily to > find out when the FW has been reset so we can pass this on > back up the chain. > > Signed-off-by: Shannon Nelson Acked-by: Jason Wang Thanks > --- > drivers/vdpa/pds/vdpa_dev.c | 59 - > drivers/vdpa/pds/vdpa_dev.h | 1 + > 2 files changed, 59 insertions(+), 1 deletion(-) > > diff --git a/drivers/vdpa/pds/vdpa_dev.c b/drivers/vdpa/pds/vdpa_dev.c > index 07b98dff5701..9afa803c4f21 100644 > --- a/drivers/vdpa/pds/vdpa_dev.c > +++ b/drivers/vdpa/pds/vdpa_dev.c > @@ -23,6 +23,52 @@ static struct pds_vdpa_device *vdpa_to_pdsv(struct > vdpa_device *vdpa_dev) > return container_of(vdpa_dev, struct pds_vdpa_device, vdpa_dev); > } > > +static int pds_vdpa_notify_handler(struct notifier_block *nb, > + unsigned long ecode, > + void *data) > +{ > + struct pds_vdpa_device *pdsv = container_of(nb, struct > pds_vdpa_device, nb); > + struct device *dev = &pdsv->vdpa_aux->padev->aux_dev.dev; > + > + dev_dbg(dev, "%s: event code %lu\n", __func__, ecode); > + > + if (ecode == PDS_EVENT_RESET || ecode == PDS_EVENT_LINK_CHANGE) { > + if (pdsv->config_cb.callback) > + pdsv->config_cb.callback(pdsv->config_cb.private); > + } > + > + return 0; > +} > + > +static int pds_vdpa_register_event_handler(struct pds_vdpa_device *pdsv) > +{ > + struct device *dev = &pdsv->vdpa_aux->padev->aux_dev.dev; > + struct notifier_block *nb = &pdsv->nb; > + int err; > + > + if (!nb->notifier_call) { > + nb->notifier_call = pds_vdpa_notify_handler; > + err = pdsc_register_notify(nb); > + if (err) { > + nb->notifier_call = NULL; > + dev_err(dev, "failed to register pds event handler: > %ps\n", > + ERR_PTR(err)); > + return -EINVAL; > + } > + dev_dbg(dev, "pds event handler registered\n"); > + } > + > + return 0; > +} > + > +static void pds_vdpa_unregister_event_handler(struct pds_vdpa_device *pdsv) > +{ > + if (pdsv->nb.notifier_call) { > + pdsc_unregister_notify(&pdsv->nb); > + pdsv->nb.notifier_call = NULL; > + } > +} > + > static int pds_vdpa_set_vq_address(struct vdpa_device *vdpa_dev, u16 qid, >u64 desc_addr, u64 driver_addr, u64 > device_addr) > { > @@ -594,6 +640,12 @@ static int pds_vdpa_dev_add(struct vdpa_mgmt_dev *mdev, > const char *name, > > pdsv->vdpa_dev.mdev = &vdpa_aux->vdpa_mdev; > > + err = pds_vdpa_register_event_handler(pdsv); > + if (err) { > + dev_err(dev, "Failed to register for PDS events: %pe\n", > ERR_PTR(err)); > + goto err_unmap; > + } > + > /* We use the _vdpa_register_device() call rather than the > * vdpa_register_device() to avoid a deadlock because our > * dev_add() is called with the vdpa_dev_lock already set > @@ -602,13 +654,15 @@ static int pds_vdpa_dev_add(struct vdpa_mgmt_dev *mdev, > const char *name, > err = _vdpa_register_device(&pdsv->vdpa_dev, pdsv->num_vqs); > if (err) { > dev_err(dev, "Failed to register to vDPA bus: %pe\n", > ERR_PTR(err)); > - goto err_unmap; > + goto err_unevent; > } > > pds_vdpa_debugfs_add_vdpadev(vdpa_aux); > > return 0; > > +err_unevent: > + pds_vdpa_unregister_event_handler(pdsv); > err_unmap: > put_device(&pdsv->vdpa_dev.dev); > vdpa_aux->pdsv = NULL; > @@ -618,8 +672,11 @@ static int pds_vdpa_dev_add(struct vdpa_mgmt_dev *mdev, > const char *name, > static void pds_vdpa_dev_del(struct vdpa_mgmt_dev *mdev, > struct vdpa_device *vdpa_dev) > { > + struct pds_vdpa_device *pdsv = vdpa_to_pdsv(vdpa_dev); > struct pds_vdpa_aux *vdpa_aux; > > + pds_vdpa_unregister_event_handler(pdsv); > + > vdpa_aux = container_of(mdev, struct pds_vdpa_aux, 
vdpa_mdev); > _vdpa_unregister_device(vdpa_dev); > > diff --git a/drivers/vdpa/pds/vdpa_dev.h b/drivers/vdpa/pds/vdpa_dev.h > index 25c1d192f0ef..a1bc37de9537 100644 > --- a/drivers/vd
Re: [PATCH net-next V2 1/2] virtio-net: convert rx mode setting to use workqueue
On Tue, May 16, 2023 at 12:13 PM Michael S. Tsirkin wrote: > > On Tue, May 16, 2023 at 10:44:45AM +0800, Jason Wang wrote: > > On Mon, May 15, 2023 at 6:17 PM Michael S. Tsirkin wrote: > > > > > > On Mon, May 15, 2023 at 01:13:33PM +0800, Jason Wang wrote: > > > > On Mon, May 15, 2023 at 12:45 PM Michael S. Tsirkin > > > > wrote: > > > > > > > > > > On Mon, May 15, 2023 at 09:05:54AM +0800, Jason Wang wrote: > > > > > > On Wed, May 10, 2023 at 1:33 PM Michael S. Tsirkin > > > > > > wrote: > > > > > > > > > > > > > > On Mon, Apr 17, 2023 at 11:40:58AM +0800, Jason Wang wrote: > > > > > > > > On Fri, Apr 14, 2023 at 3:21 PM Michael S. Tsirkin > > > > > > > > wrote: > > > > > > > > > > > > > > > > > > On Fri, Apr 14, 2023 at 01:04:15PM +0800, Jason Wang wrote: > > > > > > > > > > Forget to cc netdev, adding. > > > > > > > > > > > > > > > > > > > > On Fri, Apr 14, 2023 at 12:25 AM Michael S. Tsirkin > > > > > > > > > > wrote: > > > > > > > > > > > > > > > > > > > > > > On Thu, Apr 13, 2023 at 02:40:26PM +0800, Jason Wang > > > > > > > > > > > wrote: > > > > > > > > > > > > This patch convert rx mode setting to be done in a > > > > > > > > > > > > workqueue, this is > > > > > > > > > > > > a must for allow to sleep when waiting for the cvq > > > > > > > > > > > > command to > > > > > > > > > > > > response since current code is executed under addr spin > > > > > > > > > > > > lock. > > > > > > > > > > > > > > > > > > > > > > > > Signed-off-by: Jason Wang > > > > > > > > > > > > > > > > > > > > > > I don't like this frankly. This means that setting RX > > > > > > > > > > > mode which would > > > > > > > > > > > previously be reliable, now becomes unreliable. > > > > > > > > > > > > > > > > > > > > It is "unreliable" by design: > > > > > > > > > > > > > > > > > > > > void(*ndo_set_rx_mode)(struct > > > > > > > > > > net_device *dev); > > > > > > > > > > > > > > > > > > > > > - first of all configuration is no longer immediate > > > > > > > > > > > > > > > > > > > > Is immediate a hard requirement? I can see a workqueue is > > > > > > > > > > used at least: > > > > > > > > > > > > > > > > > > > > mlx5e, ipoib, efx, ... > > > > > > > > > > > > > > > > > > > > > and there is no way for driver to find out when > > > > > > > > > > > it actually took effect > > > > > > > > > > > > > > > > > > > > But we know rx mode is best effort e.g it doesn't support > > > > > > > > > > vhost and we > > > > > > > > > > survive from this for years. > > > > > > > > > > > > > > > > > > > > > - second, if device fails command, this is also not > > > > > > > > > > > propagated to driver, again no way for driver to find > > > > > > > > > > > out > > > > > > > > > > > > > > > > > > > > > > VDUSE needs to be fixed to do tricks to fix this > > > > > > > > > > > without breaking normal drivers. > > > > > > > > > > > > > > > > > > > > It's not specific to VDUSE. For example, when using > > > > > > > > > > virtio-net in the > > > > > > > > > > UP environment with any software cvq (like mlx5 via vDPA or > > > > > > > > > > cma > >
Re: [PATCH net-next V2 2/2] virtio-net: sleep instead of busy waiting for cvq command
On Wed, May 17, 2023 at 4:54 AM Michael S. Tsirkin wrote: > > On Thu, Apr 13, 2023 at 02:40:27PM +0800, Jason Wang wrote: > > We used to busy waiting on the cvq command this tends to be > > problematic since there no way for to schedule another process which > > may serve for the control virtqueue. This might be the case when the > > control virtqueue is emulated by software. This patch switches to use > > completion to allow the CPU to sleep instead of busy waiting for the > > cvq command. > > > > Signed-off-by: Jason Wang > > --- > > Changes since V1: > > - use completion for simplicity > > - don't try to harden the CVQ command which requires more thought > > Changes since RFC: > > - break the device when timeout > > - get buffer manually since the virtio core check more_used() instead > > --- > > drivers/net/virtio_net.c | 21 ++--- > > 1 file changed, 14 insertions(+), 7 deletions(-) > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c > > index 2e56bbf86894..d3eb8fd6c9dc 100644 > > --- a/drivers/net/virtio_net.c > > +++ b/drivers/net/virtio_net.c > > @@ -19,6 +19,7 @@ > > #include > > #include > > #include > > +#include > > #include > > #include > > #include > > @@ -295,6 +296,8 @@ struct virtnet_info { > > > > /* failover when STANDBY feature enabled */ > > struct failover *failover; > > + > > + struct completion completion; > > }; > > > > struct padded_vnet_hdr { > > @@ -1709,6 +1712,13 @@ static bool try_fill_recv(struct virtnet_info *vi, > > struct receive_queue *rq, > > return !oom; > > } > > > > +static void virtnet_cvq_done(struct virtqueue *cvq) > > +{ > > + struct virtnet_info *vi = cvq->vdev->priv; > > + > > + complete(&vi->completion); > > +} > > + > > static void skb_recv_done(struct virtqueue *rvq) > > { > > struct virtnet_info *vi = rvq->vdev->priv; > > @@ -2169,12 +2179,8 @@ static bool virtnet_send_command(struct virtnet_info > > *vi, u8 class, u8 cmd, > > if (unlikely(!virtqueue_kick(vi->cvq))) > > return vi->ctrl->status == VIRTIO_NET_OK; > > > > - /* Spin for a response, the kick causes an ioport write, trapping > > - * into the hypervisor, so the request should be handled immediately. > > - */ > > - while (!virtqueue_get_buf(vi->cvq, &tmp) && > > -!virtqueue_is_broken(vi->cvq)) > > - cpu_relax(); > > + wait_for_completion(&vi->completion); > > + virtqueue_get_buf(vi->cvq, &tmp); > > > > return vi->ctrl->status == VIRTIO_NET_OK; > > This seems to break surprise removal and other > situations where vq gets broken since callbacks > aren't usually invoked then. Yes, so I think I can go back to the original idea by simply adding cond_resched() here. > > > > } > > @@ -3672,7 +3678,7 @@ static int virtnet_find_vqs(struct virtnet_info *vi) > > > > /* Parameters for control virtqueue, if any */ > > if (vi->has_cvq) { > > - callbacks[total_vqs - 1] = NULL; > > + callbacks[total_vqs - 1] = virtnet_cvq_done; > > names[total_vqs - 1] = "control"; > > } > > > > There is a cost to this, in that we are burning an extra MSI vector > for the slow path cvq. if device has 3 vectors, suddenly we can't > allocate vectors for rx and tx, big problem. > > So I'm afraid we need to pass a new flag that will share > the config changed interrupt and cvq. See above, it looks to me a simple cond_resched() is sufficient, then we don't need a new vector. 
Thanks > > > > > @@ -4122,6 +4128,7 @@ static int virtnet_probe(struct virtio_device *vdev) > > if (vi->has_rss || vi->has_rss_hash_report) > > virtnet_init_default_rss(vi); > > > > + init_completion(&vi->completion); > > enable_rx_mode_work(vi); > > > > /* serialize netdev register + virtio_device_ready() with ndo_open() > > */ > > -- > > 2.25.1
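To make the cond_resched() alternative concrete: it would keep the existing poll loop in virtnet_send_command() but yield the CPU on each iteration, roughly like this (the exact placement is illustrative only):

/* Sketch: poll for the cvq response as before, but let other tasks run
 * (e.g. a software backend serving the control virtqueue) instead of
 * pure busy-waiting on cpu_relax(). */
while (!virtqueue_get_buf(vi->cvq, &tmp) &&
       !virtqueue_is_broken(vi->cvq)) {
	cond_resched();
	cpu_relax();
}

This keeps the callback-free cvq (so no extra MSI vector is burned) while still allowing a slow backend to make progress.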
Re: [PATCH 13/14] vhost: allow userspace to create workers
On Sat, Apr 29, 2023 at 12:32 AM Mike Christie wrote: > > For vhost-scsi with 3 vqs or more and a workload that tries to use > them in parallel like: > > fio --filename=/dev/sdb --direct=1 --rw=randrw --bs=4k \ > --ioengine=libaio --iodepth=128 --numjobs=3 > > the single vhost worker thread will become a bottlneck and we are stuck > at around 500K IOPs no matter how many jobs, virtqueues, and CPUs are > used. > > To better utilize virtqueues and available CPUs, this patch allows > userspace to create workers and bind them to vqs. You can have N workers > per dev and also share N workers with M vqs on that dev. > > This patch adds the interface related code and the next patch will hook > vhost-scsi into it. The patches do not try to hook net and vsock into > the interface because: > > 1. multiple workers don't seem to help vsock. The problem is that with > only 2 virtqueues we never fully use the existing worker when doing > bidirectional tests. This seems to match vhost-scsi where we don't see > the worker as a bottleneck until 3 virtqueues are used. > > 2. net already has a way to use multiple workers. > > Signed-off-by: Mike Christie > --- > drivers/vhost/vhost.c| 145 ++- > drivers/vhost/vhost.h| 3 + > include/uapi/linux/vhost.h | 33 +++ > include/uapi/linux/vhost_types.h | 16 > 4 files changed, 196 insertions(+), 1 deletion(-) > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c > index 4b0b82292379..e8f829f35814 100644 > --- a/drivers/vhost/vhost.c > +++ b/drivers/vhost/vhost.c > @@ -630,6 +630,80 @@ static struct vhost_worker *vhost_worker_create(struct > vhost_dev *dev) > return NULL; > } > > +/* Caller must have device mutex */ > +static void __vhost_vq_attach_worker(struct vhost_virtqueue *vq, > +struct vhost_worker *worker) > +{ > + if (vq->worker) > + vq->worker->attachment_cnt--; > + worker->attachment_cnt++; > + vq->worker = worker; > +} > + > +/** > + * vhost_vq_attach_worker - set a virtqueue's worker from an ioctl command > + * @vq: the virtqueue we will set the worker for > + * @info: the worker userspace has requested us to use > + * > + * We only allow userspace to set a virtqueue's worker if it's not active and > + * polling is not enabled. I wonder if we can mandate this in the code like check the vq backend in vhost_vq_work_queue(). We also assume drivers supporting this will not be > + * internally queueing works directly or via calls like vhost_dev_flush at > + * this time. > + * > + * Caller must have device and virtqueue mutex. > + */ > +static int vhost_vq_attach_worker(struct vhost_virtqueue *vq, > + struct vhost_vring_worker *info) > +{ > + unsigned long index = info->worker_id; > + struct vhost_dev *dev = vq->dev; > + struct vhost_worker *worker; > + > + if (!dev->use_worker) > + return -EINVAL; > + > + if (vhost_vq_get_backend(vq) || vq->kick) It might be worthwhile to have a comment to explain why we need to check vq->kick here. This also means the device should not queue work when the backend is NULL. But I found it is probably not the case for vsock, it calls vhost_poll_queue() in vhost_transport_cancel_pkt() but vhost_vsock_stop() doesn't wait before doing vhost_vq_set_backend(vq, NULL); Net seems to be fine since it waits for ubufs to be completed in vhost_net_set_backend(). Can we make things easier by migrating the work_list? I also worry if there are other corner cases which makes me think how hard it is if we can just support those ioctls after the backend is set? 
> + return -EBUSY; > + > + worker = xa_find(&dev->worker_xa, &index, UINT_MAX, XA_PRESENT); > + if (!worker || worker->id != info->worker_id) > + return -ENODEV; > + > + __vhost_vq_attach_worker(vq, worker); > + return 0; > +} > + > +/* Caller must have device mutex */ > +static int vhost_new_worker(struct vhost_dev *dev, > + struct vhost_worker_state *info) > +{ > + struct vhost_worker *worker; > + > + worker = vhost_worker_create(dev); > + if (!worker) > + return -ENOMEM; > + > + info->worker_id = worker->id; > + return 0; > +} > + > +static int vhost_free_worker(struct vhost_dev *dev, > +struct vhost_worker_state *info) > +{ > + unsigned long index = info->worker_id; > + struct vhost_worker *worker; > + > + worker = xa_find(&dev->worker_xa, &index, UINT_MAX, XA_PRESENT); > + if (!worker || worker->id != info->worker_id) > + return -ENODEV; > + > + if (worker->attachment_cnt) > + return -EBUSY; > + > + vhost_worker_destroy(dev, worker); > + return 0; > +} > + > static int vhost_get_vq_from_user(struct vhost_dev *dev, void __user *argp, >
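To picture the proposed uAPI from userspace: a VMM would create a worker and bind a virtqueue to it with something like the following (a sketch based on the structures in this series; ioctl names and fields may still change before merge):

#include <sys/ioctl.h>
#include <linux/vhost.h>

/* Sketch: allocate an extra worker on a vhost device fd and attach
 * virtqueue vq_idx to it, assuming the series' proposed
 * VHOST_NEW_WORKER / VHOST_ATTACH_VRING_WORKER ioctls. */
static int bind_vq_to_new_worker(int vhost_fd, unsigned int vq_idx)
{
	struct vhost_worker_state state = { 0 };
	struct vhost_vring_worker w = { 0 };

	if (ioctl(vhost_fd, VHOST_NEW_WORKER, &state) < 0)
		return -1;

	w.index = vq_idx;		/* which vring */
	w.worker_id = state.worker_id;	/* which worker runs it */
	return ioctl(vhost_fd, VHOST_ATTACH_VRING_WORKER, &w);
}

Per the fio example in the commit message, a vhost-scsi setup would repeat this for each of its three or more virtqueues to spread the load over several workers.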
Re: [PATCH v1] virtio_pci: Optimize virtio_pci_device structure size
On Tue, May 16, 2023 at 9:55 PM Feng Liu wrote: > > Improve the size of the virtio_pci_device structure, which is commonly > used to represent a virtio PCI device. A given virtio PCI device can > be either of legacy type or modern type, with the > struct virtio_pci_legacy_device occupying 32 bytes and the > struct virtio_pci_modern_device occupying 88 bytes. Make them a union, > thereby saving 32 bytes of memory as shown by the pahole tool. This > improvement is particularly beneficial when dealing with numerous > devices, as it helps conserve memory resources. > > Before the modification, the pahole tool reported the following: > struct virtio_pci_device { > [...] > struct virtio_pci_legacy_device ldev; /* 824 32 */ > /* --- cacheline 13 boundary (832 bytes) was 24 bytes ago --- */ > struct virtio_pci_modern_device mdev; /* 856 88 */ > > /* XXX last struct has 4 bytes of padding */ > [...] > /* size: 1056, cachelines: 17, members: 19 */ > [...] > }; > > After the modification, the pahole tool reported the following: > struct virtio_pci_device { > [...] > union { > struct virtio_pci_legacy_device ldev; /* 824 32 */ > struct virtio_pci_modern_device mdev; /* 824 88 */ > }; /* 824 88 */ > [...] > /* size: 1024, cachelines: 16, members: 18 */ > [...] > }; > > Signed-off-by: Feng Liu > Reviewed-by: Jiri Pirko Acked-by: Jason Wang Thanks > --- > drivers/virtio/virtio_pci_common.h | 7 ++++--- > 1 file changed, 4 insertions(+), 3 deletions(-) > > diff --git a/drivers/virtio/virtio_pci_common.h > b/drivers/virtio/virtio_pci_common.h > index 23112d84218f..4b773bd7c58c 100644 > --- a/drivers/virtio/virtio_pci_common.h > +++ b/drivers/virtio/virtio_pci_common.h > @@ -45,9 +45,10 @@ struct virtio_pci_vq_info { > struct virtio_pci_device { > struct virtio_device vdev; > struct pci_dev *pci_dev; > - struct virtio_pci_legacy_device ldev; > - struct virtio_pci_modern_device mdev; > - > + union { > + struct virtio_pci_legacy_device ldev; > + struct virtio_pci_modern_device mdev; > + }; > bool is_legacy; > > /* Where to read and clear interrupt */ > -- > 2.37.1 (Apple Git-137.1)
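The saving works because a device is either legacy or modern, never both at once, so the two sub-structures can share storage while a flag records which member is live; a self-contained toy illustration of the pattern (generic types, not the driver's):

#include <stdio.h>

struct legacy_cfg { unsigned long ioaddr; };
struct modern_cfg { void *common, *notify, *device; };

struct demo_dev {
	union {			/* mutually exclusive configurations */
		struct legacy_cfg ldev;
		struct modern_cfg mdev;
	};
	int is_legacy;		/* discriminant: which member is valid */
};

int main(void)
{
	/* the union costs max(member sizes), not their sum */
	printf("legacy=%zu modern=%zu combined=%zu\n",
	       sizeof(struct legacy_cfg), sizeof(struct modern_cfg),
	       sizeof(struct demo_dev));
	return 0;
}

Every access must go through the is_legacy check first, which the driver already does everywhere, so the union is safe.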
Re: [PATCH] vdpa/mlx5: Fix hang when cvq commands are triggered during device unregister
2263.129355] ? process_one_work+0x3c0/0x3c0 > [ 2263.129766] worker_thread+0x4d/0x3c0 > [ 2263.130140] ? process_one_work+0x3c0/0x3c0 > [ 2263.130548] kthread+0xb9/0xe0 > [ 2263.130895] ? kthread_complete_and_exit+0x20/0x20 > [ 2263.131349] ret_from_fork+0x1f/0x30 > [ 2263.131717] > > The fix is to disable and destroy the workqueue after the device > unregister. It is expected that vhost will not trigger kicks after > the unregister. But even if it would, the wq is disabled already by > setting the pointer to NULL (done so in the referenced commit). > > Fixes: ad6dc1daaf29 ("vdpa/mlx5: Avoid processing works if workqueue was > destroyed") > Signed-off-by: Dragos Tatulea Acked-by: Jason Wang Thanks > --- > drivers/vdpa/mlx5/net/mlx5_vnet.c | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c > b/drivers/vdpa/mlx5/net/mlx5_vnet.c > index e29e32b306ad..279ac6a558d2 100644 > --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c > +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c > @@ -3349,10 +3349,10 @@ static void mlx5_vdpa_dev_del(struct vdpa_mgmt_dev > *v_mdev, struct vdpa_device * > mlx5_vdpa_remove_debugfs(ndev->debugfs); > ndev->debugfs = NULL; > unregister_link_notifier(ndev); > + _vdpa_unregister_device(dev); > wq = mvdev->wq; > mvdev->wq = NULL; > destroy_workqueue(wq); > - _vdpa_unregister_device(dev); > mgtdev->ndev = NULL; > } > > -- > 2.40.1
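The essence of the fix is ordering: stop the source of new work before tearing down the workqueue, and clear the pointer that submitters test. Schematically (simplified with hypothetical struct names, not the driver verbatim):

/* Simplified shape of the corrected teardown. Submission paths check
 * mvdev->wq for NULL before queue_work(), so the pointer is cleared
 * before the queue is destroyed, and the device is unregistered first
 * so vhost is not expected to kick afterwards. */
static void demo_dev_del(struct demo_mgmt_dev *mgtdev, struct demo_vdpa *mvdev)
{
	struct workqueue_struct *wq;

	_vdpa_unregister_device(&mvdev->vdev);	/* no new kicks expected */
	wq = mvdev->wq;
	mvdev->wq = NULL;			/* submitters now see NULL */
	destroy_workqueue(wq);			/* drains remaining work */
}

With the old order, destroy_workqueue() could wait forever on a work item that itself waited on state the unregister path was about to tear down, producing the hung-task trace above.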
Re: [PATCH v2 2/3] vhost: support PACKED when setting-getting vring_base
On Wed, May 17, 2023 at 2:26 AM Shannon Nelson wrote: > > On 5/16/23 12:49 AM, Stefano Garzarella wrote: > > On Mon, May 15, 2023 at 01:41:12PM -0700, Shannon Nelson wrote: > >> On 5/9/23 1:46 AM, Stefano Garzarella wrote: > >>> On Mon, Apr 24, 2023 at 03:50:30PM -0700, Shannon Nelson via > >>> Virtualization wrote: > Use the right structs for PACKED or split vqs when setting and > getting the vring base. > > Signed-off-by: Shannon Nelson > --- > drivers/vhost/vhost.c | 18 +- > drivers/vhost/vhost.h | 8 ++-- > 2 files changed, 19 insertions(+), 7 deletions(-) > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c > index f11bdbe4c2c5..f64efda48f21 100644 > --- a/drivers/vhost/vhost.c > +++ b/drivers/vhost/vhost.c > @@ -1633,17 +1633,25 @@ long vhost_vring_ioctl(struct vhost_dev > *d, unsigned int ioctl, void __user *arg > r = -EFAULT; > break; > } > - if (s.num > 0xffff) { > - r = -EINVAL; > - break; > + if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED)) { > + vq->last_avail_idx = s.num & 0xffff; > + vq->last_used_idx = (s.num >> 16) & 0xffff; > + } else { > + if (s.num > 0xffff) { > + r = -EINVAL; > + break; > + } > + vq->last_avail_idx = s.num; > } > - vq->last_avail_idx = s.num; > /* Forget the cached index value. */ > vq->avail_idx = vq->last_avail_idx; > break; > case VHOST_GET_VRING_BASE: > s.index = idx; > - s.num = vq->last_avail_idx; > + if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED)) > + s.num = (u32)vq->last_avail_idx | > ((u32)vq->last_used_idx << 16); > + else > + s.num = vq->last_avail_idx; > >>> > >>> The changes LGTM, but since we are changing the UAPI, should we > >>> update the documentation of VHOST_SET_VRING_BASE and > >>> VHOST_GET_VRING_BASE in include/uapi/linux/vhost.h? > >> > >> Correct me if I'm wrong, but I don't think we're changing anything in > >> the UAPI here, just fixing code to work correctly with what is already > >> happening. > > > > IIUC before this patch VHOST_GET_VRING_BASE and VHOST_SET_VRING_BASE > > never worked with packed virtqueue, since we were only handling > > last_avail_idx. Now we are supporting packed virtqueue, handling > > in vhost_vring_state.num both last_avail_idx and last_used_idx (with > > wrap counters). > > > > For example for VHOST_GET_VRING_BASE where is documented that the first > > 15 bits are last_avail_idx, the 16th the avail_wrap_counter, and the > > others are last_used_idx and used_wrap_counter? > > > > Maybe I missed something, but since this is UAPI, IMHO we should > > document the parameters of ioctls at least in > > include/uapi/linux/vhost.h. > > Perhaps Jason already has something written up that could be put in here > from when he first added the wrap_counter a couple of years ago? If you meant the virtio driver support for packed, I think it's different from the context which is vhost here. I agree with Stefano that we need to update the comments around GET_VRING_BASE and SET_VRING_BASE, then we are fine. Thanks > > sln > > > > Thanks, > > Stefano
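For reference while reading this sub-thread: with VIRTIO_F_RING_PACKED the single 32-bit vhost_vring_state.num now carries both indexes, each 15 bits wide with its wrap counter in the top bit of its half. A small illustration of the encoding (helper names are made up; the bit layout follows the patch):

#include <stdint.h>

/* bits 0..14: last_avail_idx, bit 15: avail wrap counter,
 * bits 16..30: last_used_idx, bit 31: used wrap counter. */
static inline uint32_t packed_base_encode(uint16_t avail_with_wrap,
					  uint16_t used_with_wrap)
{
	return (uint32_t)avail_with_wrap | ((uint32_t)used_with_wrap << 16);
}

static inline void packed_base_decode(uint32_t num, uint16_t *avail,
				      uint16_t *used)
{
	*avail = num & 0xffff;		/* wrap counter rides in bit 15 */
	*used = (num >> 16) & 0xffff;	/* wrap counter rides in bit 15 */
}

This is exactly the layout Stefano is asking to have documented in include/uapi/linux/vhost.h.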
Re: [PATCH] vdpa: consume device_features parameter
On Sat, May 13, 2023 at 12:42 AM Shannon Nelson wrote: > > From: Allen Hubbe > > Consume the parameter to device_features when parsing command line > options. Otherwise the parameter may be used again as an option name. > > # vdpa dev add ... device_features 0xdeadbeef mac 00:11:22:33:44:55 > Unknown option "0xdeadbeef" > > Fixes: a4442ce58ebb ("vdpa: allow provisioning device features") > Signed-off-by: Allen Hubbe > Reviewed-by: Shannon Nelson Acked-by: Jason Wang Thanks > --- > vdpa/vdpa.c | 2 ++ > 1 file changed, 2 insertions(+) > > diff --git a/vdpa/vdpa.c b/vdpa/vdpa.c > index 27647d73d498..8a2fca8647b6 100644 > --- a/vdpa/vdpa.c > +++ b/vdpa/vdpa.c > @@ -353,6 +353,8 @@ static int vdpa_argv_parse(struct vdpa *vdpa, int argc, > char **argv, > &opts->device_features); > if (err) > return err; > + > + NEXT_ARG_FWD(); > o_found |= VDPA_OPT_VDEV_FEATURES; > } else { > fprintf(stderr, "Unknown option \"%s\"\n", *argv); > -- > 2.17.1
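The failure mode is the classic one of reading an option's value without advancing the cursor, so the next iteration re-parses the value as an option name. A standalone mini-parser showing the same shape (hypothetical code, not the iproute2 source):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* An option that takes a value must consume both tokens; skipping the
 * second advance is exactly what the missing NEXT_ARG_FWD() caused. */
static int parse(int argc, char **argv, unsigned long long *features)
{
	while (argc > 0) {
		if (!strcmp(*argv, "device_features") && argc > 1) {
			*features = strtoull(argv[1], NULL, 0);
			argc -= 2;	/* consume option AND its value */
			argv += 2;
		} else {
			fprintf(stderr, "Unknown option \"%s\"\n", *argv);
			return -1;
		}
	}
	return 0;
}

Without the second advance, the loop would come back around with *argv pointing at "0xdeadbeef" and report it as an unknown option, as in the commit message.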
Re: [RESEND PATCH] vdpa: solidrun: constify pointers to hwmon_channel_info
On Fri, May 12, 2023 at 1:54 AM Krzysztof Kozlowski wrote: > > Statically allocated array of pointers to hwmon_channel_info can be made > const for safety. > > Acked-by: Michael S. Tsirkin > Reviewed-by: Alvaro Karsz > Signed-off-by: Krzysztof Kozlowski Acked-by: Jason Wang Thanks > --- > drivers/vdpa/solidrun/snet_hwmon.c | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/drivers/vdpa/solidrun/snet_hwmon.c > b/drivers/vdpa/solidrun/snet_hwmon.c > index 42c87387a0f1..af531a339082 100644 > --- a/drivers/vdpa/solidrun/snet_hwmon.c > +++ b/drivers/vdpa/solidrun/snet_hwmon.c > @@ -159,7 +159,7 @@ static const struct hwmon_ops snet_hwmon_ops = { > .read_string = snet_hwmon_read_string > }; > > -static const struct hwmon_channel_info *snet_hwmon_info[] = { > +static const struct hwmon_channel_info * const snet_hwmon_info[] = { > HWMON_CHANNEL_INFO(temp, HWMON_T_INPUT | HWMON_T_MAX | HWMON_T_CRIT | > HWMON_T_LABEL, >HWMON_T_INPUT | HWMON_T_CRIT | HWMON_T_LABEL), > HWMON_CHANNEL_INFO(power, HWMON_P_INPUT | HWMON_P_LABEL), > -- > 2.34.1
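What the extra const buys, in miniature: the pointed-to descriptors were already immutable, and now the pointer slots are too, so the whole table can be placed in read-only memory (generic stand-in type below, not the hwmon struct):

#include <stddef.h>

struct channel_info;	/* stand-in for hwmon_channel_info */

/* before: const data, but the array of pointers is still writable */
static const struct channel_info *writable_slots[] = { NULL };

/* after: the pointers are const too; the table can live in .rodata */
static const struct channel_info *const ro_slots[] = { NULL };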
Re: [PATCH v2 2/3] vhost: support PACKED when setting-getting vring_base
On Wed, May 17, 2023 at 3:00 PM Stefano Garzarella wrote: > > On Wed, May 17, 2023 at 7:26 AM Jason Wang wrote: > > > > On Wed, May 17, 2023 at 2:26 AM Shannon Nelson > > wrote: > > > > > > On 5/16/23 12:49 AM, Stefano Garzarella wrote: > > > > On Mon, May 15, 2023 at 01:41:12PM -0700, Shannon Nelson wrote: > > > >> On 5/9/23 1:46 AM, Stefano Garzarella wrote: > > > >>> On Mon, Apr 24, 2023 at 03:50:30PM -0700, Shannon Nelson via > > > >>> Virtualization wrote: > > > >>>> Use the right structs for PACKED or split vqs when setting and > > > >>>> getting the vring base. > > > >>>> > > > >>>> Signed-off-by: Shannon Nelson > > > >>>> --- > > > >>>> drivers/vhost/vhost.c | 18 +- > > > >>>> drivers/vhost/vhost.h | 8 ++-- > > > >>>> 2 files changed, 19 insertions(+), 7 deletions(-) > > > >>>> > > > >>>> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c > > > >>>> index f11bdbe4c2c5..f64efda48f21 100644 > > > >>>> --- a/drivers/vhost/vhost.c > > > >>>> +++ b/drivers/vhost/vhost.c > > > >>>> @@ -1633,17 +1633,25 @@ long vhost_vring_ioctl(struct vhost_dev > > > >>>> *d, unsigned int ioctl, void __user *arg > > > >>>> r = -EFAULT; > > > >>>> break; > > > >>>> } > > > >>>> - if (s.num > 0x) { > > > >>>> - r = -EINVAL; > > > >>>> - break; > > > >>>> + if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED)) { > > > >>>> + vq->last_avail_idx = s.num & 0x; > > > >>>> + vq->last_used_idx = (s.num >> 16) & 0x; > > > >>>> + } else { > > > >>>> + if (s.num > 0x) { > > > >>>> + r = -EINVAL; > > > >>>> + break; > > > >>>> + } > > > >>>> + vq->last_avail_idx = s.num; > > > >>>> } > > > >>>> - vq->last_avail_idx = s.num; > > > >>>> /* Forget the cached index value. */ > > > >>>> vq->avail_idx = vq->last_avail_idx; > > > >>>> break; > > > >>>> case VHOST_GET_VRING_BASE: > > > >>>> s.index = idx; > > > >>>> - s.num = vq->last_avail_idx; > > > >>>> + if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED)) > > > >>>> + s.num = (u32)vq->last_avail_idx | > > > >>>> ((u32)vq->last_used_idx << 16); > > > >>>> + else > > > >>>> + s.num = vq->last_avail_idx; > > > >>> > > > >>> The changes LGTM, but since we are changing the UAPI, should we > > > >>> update the documentation of VHOST_SET_VRING_BASE and > > > >>> VHOST_GET_VRING_BASE in include/uapi/linux/vhost.h? > > > >> > > > >> Correct me if I'm wrong, but I don't think we're changing anything in > > > >> the UAPI here, just fixing code to work correctly with what is already > > > >> happening. > > > > > > > > IIUC before this patch VHOST_GET_VRING_BASE and VHOST_SET_VRING_BASE > > > > never worked with packed virtqueue, since we were only handling > > > > last_avail_idx. Now we are supporting packed virtqueue, handling > > > > in vhost_vring_state.num both last_avail_idx and last_used_idx (with > > > > wrap counters). > > > > > > > > For example for VHOST_GET_VRING_BASE where is documented that the first > > > > 15 bits are last_avail_idx, the 16th the avail_wrap_counter, and the > > > > others are last_used_idx and used_wrap_counter? > > > > > > > > Maybe I missed something, but since this is UAPI, IMHO we should > > > > document the parameters of ioctls at least in > > > > include/uapi/linux/vhost.h. > > > > > > Perhaps Jason already has something written up that could be put in here > > > from when he first added the wrap_counter a couple of years ago? > > > > If you meant the virtio driver support for packed, I think it's > > different from the context which is vhost here. 
> > > > I agree with Stefano that we need to update the comments around > > GET_VRING_BASE and SET_VRING_BASE, then we are fine. > > I'm thinking if we should also add a new VHOST_BACKEND_F_RING_PACKED > feature (or something similar) to inform the user space that now we > are able to handle packed virtqueue through vhost IOCTLs, otherwise > how can the userspace know if it is supported or not? I probably understand this, but I think it should be done via VHOST_GET_FEATURES. It would be a burden if we maintained duplicated features. Thanks > > Thanks, > Stefano
Re: [PATCH vhost v9 01/12] virtio_ring: put mapping error check in vring_map_one_sg
On Wed, May 17, 2023 at 10:23 AM Xuan Zhuo wrote: > > This patch put the dma addr error check in vring_map_one_sg(). > > The benefits of doing this: > > 1. make vring_map_one_sg more simple, without calling >vring_mapping_error to check the return value. > 2. reduce one judgment of vq->use_dma_api. Code looks fine but it's better to explain how it relates or simply anything with this series. Thanks > > Signed-off-by: Xuan Zhuo > --- > drivers/virtio/virtio_ring.c | 37 +--- > 1 file changed, 22 insertions(+), 15 deletions(-) > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c > index c5310eaf8b46..c563215be6b9 100644 > --- a/drivers/virtio/virtio_ring.c > +++ b/drivers/virtio/virtio_ring.c > @@ -355,9 +355,8 @@ static struct device *vring_dma_dev(const struct > vring_virtqueue *vq) > } > > /* Map one sg entry. */ > -static dma_addr_t vring_map_one_sg(const struct vring_virtqueue *vq, > - struct scatterlist *sg, > - enum dma_data_direction direction) > +static int vring_map_one_sg(const struct vring_virtqueue *vq, struct > scatterlist *sg, > + enum dma_data_direction direction, static > dma_addr_t *addr) > { > if (!vq->use_dma_api) { > /* > @@ -366,7 +365,8 @@ static dma_addr_t vring_map_one_sg(const struct > vring_virtqueue *vq, > * depending on the direction. > */ > kmsan_handle_dma(sg_page(sg), sg->offset, sg->length, > direction); > - return (dma_addr_t)sg_phys(sg); > + *addr = (dma_addr_t)sg_phys(sg); > + return 0; > } > > /* > @@ -374,9 +374,14 @@ static dma_addr_t vring_map_one_sg(const struct > vring_virtqueue *vq, > * the way it expects (we don't guarantee that the scatterlist > * will exist for the lifetime of the mapping). > */ > - return dma_map_page(vring_dma_dev(vq), > + *addr = dma_map_page(vring_dma_dev(vq), > sg_page(sg), sg->offset, sg->length, > direction); > + > + if (dma_mapping_error(vring_dma_dev(vq), *addr)) > + return -ENOMEM; > + > + return 0; > } > > static dma_addr_t vring_map_single(const struct vring_virtqueue *vq, > @@ -588,8 +593,9 @@ static inline int virtqueue_add_split(struct virtqueue > *_vq, > > for (n = 0; n < out_sgs; n++) { > for (sg = sgs[n]; sg; sg = sg_next(sg)) { > - dma_addr_t addr = vring_map_one_sg(vq, sg, > DMA_TO_DEVICE); > - if (vring_mapping_error(vq, addr)) > + dma_addr_t addr; > + > + if (vring_map_one_sg(vq, sg, DMA_TO_DEVICE, &addr)) > goto unmap_release; > > prev = i; > @@ -603,8 +609,9 @@ static inline int virtqueue_add_split(struct virtqueue > *_vq, > } > for (; n < (out_sgs + in_sgs); n++) { > for (sg = sgs[n]; sg; sg = sg_next(sg)) { > - dma_addr_t addr = vring_map_one_sg(vq, sg, > DMA_FROM_DEVICE); > - if (vring_mapping_error(vq, addr)) > + dma_addr_t addr; > + > + if (vring_map_one_sg(vq, sg, DMA_FROM_DEVICE, &addr)) > goto unmap_release; > > prev = i; > @@ -1279,9 +1286,8 @@ static int virtqueue_add_indirect_packed(struct > vring_virtqueue *vq, > > for (n = 0; n < out_sgs + in_sgs; n++) { > for (sg = sgs[n]; sg; sg = sg_next(sg)) { > - addr = vring_map_one_sg(vq, sg, n < out_sgs ? > - DMA_TO_DEVICE : DMA_FROM_DEVICE); > - if (vring_mapping_error(vq, addr)) > + if (vring_map_one_sg(vq, sg, n < out_sgs ? > +DMA_TO_DEVICE : DMA_FROM_DEVICE, > &addr)) > goto unmap_release; > > desc[i].flags = cpu_to_le16(n < out_sgs ? > @@ -1426,9 +1432,10 @@ static inline int virtqueue_add_packed(struct > virtqueue *_vq, > c = 0; > for (n = 0; n < out_sgs + in_sgs; n++) { > for (sg = sgs[n]; sg; sg = sg_next(sg)) { > - dma_addr_t addr = vring_map_one_sg(vq, sg, n < > out_sgs ? 
> - DMA_TO_DEVICE : DMA_FROM_DEVICE); > - if (vring_mapping_error(vq, addr)) > + dma_addr_t addr; > + > + if (vring_map_one_sg(vq, sg, n < out_sgs ? > +DMA_TO_DEVICE : DMA_FROM_DEVICE, > &addr)) > goto unmap_release; > > flags = cpu_to_le16(vq->packed.av
Re: [PATCH vhost v9 02/12] virtio_ring: simplify the reference of desc state inside detach_buf_split()
On Wed, May 17, 2023 at 10:23 AM Xuan Zhuo wrote: > > The purpose of this is to simplify the reference to state. It is > convenient for subsequent commits. It's better to be verbose, e.g. how it can simplify the following patches. Thanks > > Signed-off-by: Xuan Zhuo > --- > drivers/virtio/virtio_ring.c | 12 +++++++----- > 1 file changed, 7 insertions(+), 5 deletions(-) > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c > index c563215be6b9..479203346c36 100644 > --- a/drivers/virtio/virtio_ring.c > +++ b/drivers/virtio/virtio_ring.c > @@ -744,11 +744,14 @@ static bool virtqueue_kick_prepare_split(struct > virtqueue *_vq) > static void detach_buf_split(struct vring_virtqueue *vq, unsigned int head, > void **ctx) > { > + struct vring_desc_state_split *state; > unsigned int i, j; > __virtio16 nextflag = cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_NEXT); > > + state = &vq->split.desc_state[head]; > + > /* Clear data ptr. */ > - vq->split.desc_state[head].data = NULL; > + state->data = NULL; > > /* Put back on free list: unmap first-level descriptors and find end > */ > i = head; > @@ -767,8 +770,7 @@ static void detach_buf_split(struct vring_virtqueue *vq, > unsigned int head, > vq->vq.num_free++; > > if (vq->indirect) { > - struct vring_desc *indir_desc = > - vq->split.desc_state[head].indir_desc; > + struct vring_desc *indir_desc = state->indir_desc; > u32 len; > > /* Free the indirect table, if any, now that it's unmapped. */ > @@ -785,9 +787,9 @@ static void detach_buf_split(struct vring_virtqueue *vq, > unsigned int head, > vring_unmap_one_split_indirect(vq, &indir_desc[j]); > > kfree(indir_desc); > - vq->split.desc_state[head].indir_desc = NULL; > + state->indir_desc = NULL; > } else if (ctx) { > - *ctx = vq->split.desc_state[head].indir_desc; > + *ctx = state->indir_desc; > } > } > > -- > 2.32.0.3.g01195cf9f
Re: [PATCH vhost v9 03/12] virtio_ring: check use_dma_api before unmap desc for indirect
On Wed, May 17, 2023 at 10:23 AM Xuan Zhuo wrote: > > Inside detach_buf_split(), if use_dma_api is false, > vring_unmap_one_split_indirect will be called many times, but actually > nothing is done. So this patch checks use_dma_api first. > > Signed-off-by: Xuan Zhuo Acked-by: Jason Wang Thanks > --- > drivers/virtio/virtio_ring.c | 6 ++++-- > 1 file changed, 4 insertions(+), 2 deletions(-) > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c > index 479203346c36..1ffab1eb40c0 100644 > --- a/drivers/virtio/virtio_ring.c > +++ b/drivers/virtio/virtio_ring.c > @@ -783,8 +783,10 @@ static void detach_buf_split(struct vring_virtqueue *vq, > unsigned int head, > VRING_DESC_F_INDIRECT)); > BUG_ON(len == 0 || len % sizeof(struct vring_desc)); > > - for (j = 0; j < len / sizeof(struct vring_desc); j++) > - vring_unmap_one_split_indirect(vq, &indir_desc[j]); > + if (vq->use_dma_api) { > + for (j = 0; j < len / sizeof(struct vring_desc); j++) > + vring_unmap_one_split_indirect(vq, > &indir_desc[j]); > + } > > kfree(indir_desc); > state->indir_desc = NULL; > -- > 2.32.0.3.g01195cf9f
Re: [PATCH vhost v9 04/12] virtio_ring: virtqueue_add() support premapped
On Wed, May 17, 2023 at 10:23 AM Xuan Zhuo wrote: > > virtqueue_add() adds a premapped parameter. I wonder if this patch is oversimplified. Maybe it can be squashed with the patch that implements the premapped logic. Thanks > > Signed-off-by: Xuan Zhuo > --- > drivers/virtio/virtio_ring.c | 9 +++++---- > 1 file changed, 5 insertions(+), 4 deletions(-) > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c > index 1ffab1eb40c0..e2fc50c05bec 100644 > --- a/drivers/virtio/virtio_ring.c > +++ b/drivers/virtio/virtio_ring.c > @@ -2135,6 +2135,7 @@ static inline int virtqueue_add(struct virtqueue *_vq, > unsigned int in_sgs, > void *data, > void *ctx, > + bool premapped, > gfp_t gfp) > { > struct vring_virtqueue *vq = to_vvq(_vq); > @@ -2176,7 +2177,7 @@ int virtqueue_add_sgs(struct virtqueue *_vq, > total_sg++; > } > return virtqueue_add(_vq, sgs, total_sg, out_sgs, in_sgs, > -data, NULL, gfp); > +data, NULL, false, gfp); > } > EXPORT_SYMBOL_GPL(virtqueue_add_sgs); > > @@ -2198,7 +2199,7 @@ int virtqueue_add_outbuf(struct virtqueue *vq, > void *data, > gfp_t gfp) > { > - return virtqueue_add(vq, &sg, num, 1, 0, data, NULL, gfp); > + return virtqueue_add(vq, &sg, num, 1, 0, data, NULL, false, gfp); > } > EXPORT_SYMBOL_GPL(virtqueue_add_outbuf); > > @@ -2220,7 +2221,7 @@ int virtqueue_add_inbuf(struct virtqueue *vq, > void *data, > gfp_t gfp) > { > - return virtqueue_add(vq, &sg, num, 0, 1, data, NULL, gfp); > + return virtqueue_add(vq, &sg, num, 0, 1, data, NULL, false, gfp); > } > EXPORT_SYMBOL_GPL(virtqueue_add_inbuf); > > @@ -2244,7 +2245,7 @@ int virtqueue_add_inbuf_ctx(struct virtqueue *vq, > void *ctx, > gfp_t gfp) > { > - return virtqueue_add(vq, &sg, num, 0, 1, data, ctx, gfp); > + return virtqueue_add(vq, &sg, num, 0, 1, data, ctx, false, gfp); > } > EXPORT_SYMBOL_GPL(virtqueue_add_inbuf_ctx); > > -- > 2.32.0.3.g01195cf9f
Re: [PATCH vhost v9 05/12] virtio_ring: split: virtqueue_add_split() support premapped
On Wed, May 17, 2023 at 10:23 AM Xuan Zhuo wrote: > > virtqueue_add_split() only supports virtual addresses, dma is completed > in virtqueue_add_split(). > > In some scenarios (such as the AF_XDP scenario), the memory is allocated > and DMA is completed in advance, so it is necessary for us to support > passing the DMA address to virtqueue_add_split(). > > Record this information in desc_state, we can skip unmap based on this > when executing dma unmap. I would also suggest documenting why a per descriptor metadata is needed instead of a per virtqueue one. > > Signed-off-by: Xuan Zhuo > --- > drivers/virtio/virtio_ring.c | 38 +++- > 1 file changed, 29 insertions(+), 9 deletions(-) > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c > index e2fc50c05bec..bd5e84afab37 100644 > --- a/drivers/virtio/virtio_ring.c > +++ b/drivers/virtio/virtio_ring.c > @@ -70,6 +70,7 @@ > struct vring_desc_state_split { > void *data; /* Data for callback. */ > struct vring_desc *indir_desc; /* Indirect descriptor, if any. */ > + bool premapped; /* DMA mapping is done by driver. */ Going back to the original discussion around where this should be placed. I wonder if we can find a common place to store this since it has nothing related to virtqueue layout. Maybe desc_extra? And it would be even better if we can avoid stressing the cache like above. > }; > > struct vring_desc_state_packed { > @@ -356,8 +357,14 @@ static struct device *vring_dma_dev(const struct > vring_virtqueue *vq) > > /* Map one sg entry. */ > static int vring_map_one_sg(const struct vring_virtqueue *vq, struct > scatterlist *sg, > - enum dma_data_direction direction, static > dma_addr_t *addr) > + enum dma_data_direction direction, > + bool premapped, dma_addr_t *addr) having things like: int func(bool do) { if (!do) return; } is a hint that the check needs to be done by the caller? And this change should work for both packed and split. I think we need to squash the packed changes here. Looking at how packed virtqueue uses this in this patch, I don't think this patch can even be built. I will wait for a new version and continue the review from there. Thanks > { > + if (premapped) { > + *addr = sg_dma_address(sg); > + return 0; > + } > + > if (!vq->use_dma_api) { > /* > * If DMA is not used, KMSAN doesn't know that the scatterlist > @@ -445,7 +452,7 @@ static void vring_unmap_one_split_indirect(const struct > vring_virtqueue *vq, > } > > static unsigned int vring_unmap_one_split(const struct vring_virtqueue *vq, > - unsigned int i) > + unsigned int i, bool premapped) > { > struct vring_desc_extra *extra = vq->split.desc_extra; > u16 flags; > @@ -462,6 +469,9 @@ static unsigned int vring_unmap_one_split(const struct > vring_virtqueue *vq, > (flags & VRING_DESC_F_WRITE) ? 
> DMA_FROM_DEVICE : DMA_TO_DEVICE); > } else { > + if (premapped) > + goto out; > + > dma_unmap_page(vring_dma_dev(vq), >extra[i].addr, >extra[i].len, > @@ -532,6 +542,7 @@ static inline int virtqueue_add_split(struct virtqueue > *_vq, > unsigned int in_sgs, > void *data, > void *ctx, > + bool premapped, > gfp_t gfp) > { > struct vring_virtqueue *vq = to_vvq(_vq); > @@ -595,7 +606,7 @@ static inline int virtqueue_add_split(struct virtqueue > *_vq, > for (sg = sgs[n]; sg; sg = sg_next(sg)) { > dma_addr_t addr; > > - if (vring_map_one_sg(vq, sg, DMA_TO_DEVICE, &addr)) > + if (vring_map_one_sg(vq, sg, DMA_TO_DEVICE, > premapped, &addr)) > goto unmap_release; > > prev = i; > @@ -611,7 +622,7 @@ static inline int virtqueue_add_split(struct virtqueue > *_vq, > for (sg = sgs[n]; sg; sg = sg_next(sg)) { > dma_addr_t addr; > > - if (vring_map_one_sg(vq, sg, DMA_FROM_DEVICE, &addr)) > + if (vring_map_one_sg(vq, sg, DMA_FROM_DEVICE, > premapped, &addr)) > goto unmap_release; > > prev = i; > @@ -657,6 +668,7 @@ static inline int virtqueue_add_split(struct virtqueue > *_vq, > > /* Store token and indirect buffer state. */ > vq->split.
Re: [PATCH v2 2/3] vhost: support PACKED when setting-getting vring_base
On Thu, May 18, 2023 at 3:34 PM Stefano Garzarella wrote: > > On Thu, May 18, 2023 at 7:24 AM Jason Wang wrote: > > > > On Wed, May 17, 2023 at 3:00 PM Stefano Garzarella > > wrote: > > > > > > On Wed, May 17, 2023 at 7:26 AM Jason Wang wrote: > > > > > > > > On Wed, May 17, 2023 at 2:26 AM Shannon Nelson > > > > wrote: > > > > > > > > > > On 5/16/23 12:49 AM, Stefano Garzarella wrote: > > > > > > On Mon, May 15, 2023 at 01:41:12PM -0700, Shannon Nelson wrote: > > > > > >> On 5/9/23 1:46 AM, Stefano Garzarella wrote: > > > > > >>> On Mon, Apr 24, 2023 at 03:50:30PM -0700, Shannon Nelson via > > > > > >>> Virtualization wrote: > > > > > >>>> Use the right structs for PACKED or split vqs when setting and > > > > > >>>> getting the vring base. > > > > > >>>> > > > > > >>>> Signed-off-by: Shannon Nelson > > > > > >>>> --- > > > > > >>>> drivers/vhost/vhost.c | 18 +- > > > > > >>>> drivers/vhost/vhost.h | 8 ++-- > > > > > >>>> 2 files changed, 19 insertions(+), 7 deletions(-) > > > > > >>>> > > > > > >>>> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c > > > > > >>>> index f11bdbe4c2c5..f64efda48f21 100644 > > > > > >>>> --- a/drivers/vhost/vhost.c > > > > > >>>> +++ b/drivers/vhost/vhost.c > > > > > >>>> @@ -1633,17 +1633,25 @@ long vhost_vring_ioctl(struct vhost_dev > > > > > >>>> *d, unsigned int ioctl, void __user *arg > > > > > >>>> r = -EFAULT; > > > > > >>>> break; > > > > > >>>> } > > > > > >>>> - if (s.num > 0x) { > > > > > >>>> - r = -EINVAL; > > > > > >>>> - break; > > > > > >>>> + if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED)) { > > > > > >>>> + vq->last_avail_idx = s.num & 0x; > > > > > >>>> + vq->last_used_idx = (s.num >> 16) & > > > > > >>>> 0x; > > > > > >>>> + } else { > > > > > >>>> + if (s.num > 0x) { > > > > > >>>> + r = -EINVAL; > > > > > >>>> + break; > > > > > >>>> + } > > > > > >>>> + vq->last_avail_idx = s.num; > > > > > >>>> } > > > > > >>>> - vq->last_avail_idx = s.num; > > > > > >>>> /* Forget the cached index value. */ > > > > > >>>> vq->avail_idx = vq->last_avail_idx; > > > > > >>>> break; > > > > > >>>> case VHOST_GET_VRING_BASE: > > > > > >>>> s.index = idx; > > > > > >>>> - s.num = vq->last_avail_idx; > > > > > >>>> + if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED)) > > > > > >>>> + s.num = (u32)vq->last_avail_idx | > > > > > >>>> ((u32)vq->last_used_idx << 16); > > > > > >>>> + else > > > > > >>>> + s.num = vq->last_avail_idx; > > > > > >>> > > > > > >>> The changes LGTM, but since we are changing the UAPI, should we > > > > > >>> update the documentation of VHOST_SET_VRING_BASE and > > > > > >>> VHOST_GET_VRING_BASE in include/uapi/linux/vhost.h? > > > > > >> > > > > > >> Correct me if I'm wrong, but I don't think we're changing anything > > > > > >> in > > > > > >> the UAPI here, just fixing code to work correctly with what is > > > &
Re: [PATCH vhost v9 05/12] virtio_ring: split: virtqueue_add_split() support premapped
On Thu, May 18, 2023 at 3:41 PM Xuan Zhuo wrote: > > On Thu, 18 May 2023 03:11:25 -0400, "Michael S. Tsirkin" > wrote: > > On Thu, May 18, 2023 at 02:51:57PM +0800, Jason Wang wrote: > > > On Wed, May 17, 2023 at 10:23 AM Xuan Zhuo > > > wrote: > > > > > > > > virtqueue_add_split() only supports virtual addresses, dma is completed > > > > in virtqueue_add_split(). > > > > > > > > In some scenarios (such as the AF_XDP scenario), the memory is allocated > > > > and DMA is completed in advance, so it is necessary for us to support > > > > passing the DMA address to virtqueue_add_split(). > > > > > > > > Record this information in desc_state, we can skip unmap based on this > > > > when executing dma unmap. > > > > > > I would also suggest documenting why a per descriptor metadata is > > > needed instead of a per virtqueue one. > > > > I think we could make it per virtqueue. That would mean all code in > > virtio net would have to change to do dma mapping itself instead of > > relying on virtio core though. Which is maybe a good idea? Definitely a > > very intrusive change though, will need a lot of performance testing > > to make sure we don't break anything. > > In fact, we have tried this idea. > > The problem is the detach and unmap. > > We need to get all DMA Addresses from virtio-ring to unmap. Currently, it does > not support to return the DMA Address, I'm not sure I got here, but we've already stored the DMA address in desc_extra? > and for SKB, we need to get multiple DMA > Addresses at one time. Could you elaborate on this? Thanks > > This need to modify the logic of Virtio-Ring detach. Besides this, I also > agree > with this idea. > > Thanks. > > > > > > > > > > > > > > > > > > Signed-off-by: Xuan Zhuo > > > > --- > > > > drivers/virtio/virtio_ring.c | 38 +++- > > > > 1 file changed, 29 insertions(+), 9 deletions(-) > > > > > > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c > > > > index e2fc50c05bec..bd5e84afab37 100644 > > > > --- a/drivers/virtio/virtio_ring.c > > > > +++ b/drivers/virtio/virtio_ring.c > > > > @@ -70,6 +70,7 @@ > > > > struct vring_desc_state_split { > > > > void *data; /* Data for callback. */ > > > > struct vring_desc *indir_desc; /* Indirect descriptor, if any. > > > > */ > > > > + bool premapped; /* DMA mapping is done by > > > > driver. */ > > > > > > Going back to the original discussion around where this should be > > > placed. I wonder if we can find a common place to store this since it > > > has nothing related to virtqueue layout. Maybe desc_extra? And it > > > would be even better if we can avoid stressing the cache like above. > > > > > > > }; > > > > > > > > struct vring_desc_state_packed { > > > > @@ -356,8 +357,14 @@ static struct device *vring_dma_dev(const struct > > > > vring_virtqueue *vq) > > > > > > > > /* Map one sg entry. */ > > > > static int vring_map_one_sg(const struct vring_virtqueue *vq, struct > > > > scatterlist *sg, > > > > - enum dma_data_direction direction, static > > > > dma_addr_t *addr) > > > > + enum dma_data_direction direction, > > > > + bool premapped, dma_addr_t *addr) > > > > > > having things like: > > > > > > int func(bool do) > > > { > > > if (!do) > > > return; > > > } > > > > > > is a hint that the check needs to be done by the caller? > > > > > > And this change should work for both packed and split. I think we need > > > to squash the packed changes here. > > > > > > Looking at how packed virtqueue uses this in this patch, I don't think > > > this patch can even be built. 
I will wait for a new version and > > > continue the review from there. > > > > > > Thanks > > > > > > > > > > > > > { > > > > + if (premapped) { > > > > + *addr = sg_dma_address(sg); > > > > + return 0; > > > > +
Re: [PATCH vhost v9 05/12] virtio_ring: split: virtqueue_add_split() support premapped
On Thu, May 18, 2023 at 4:29 PM Michael S. Tsirkin wrote: > > On Thu, May 18, 2023 at 03:33:52PM +0800, Xuan Zhuo wrote: > > On Thu, 18 May 2023 03:11:25 -0400, "Michael S. Tsirkin" > > wrote: > > > On Thu, May 18, 2023 at 02:51:57PM +0800, Jason Wang wrote: > > > > On Wed, May 17, 2023 at 10:23 AM Xuan Zhuo > > > > wrote: > > > > > > > > > > virtqueue_add_split() only supports virtual addresses, dma is > > > > > completed > > > > > in virtqueue_add_split(). > > > > > > > > > > In some scenarios (such as the AF_XDP scenario), the memory is > > > > > allocated > > > > > and DMA is completed in advance, so it is necessary for us to support > > > > > passing the DMA address to virtqueue_add_split(). > > > > > > > > > > Record this information in desc_state, we can skip unmap based on this > > > > > when executing dma unmap. > > > > > > > > I would also suggest documenting why a per descriptor metadata is > > > > needed instead of a per virtqueue one. > > > > > > I think we could make it per virtqueue. That would mean all code in > > > virtio net would have to change to do dma mapping itself instead of > > > relying on virtio core though. Which is maybe a good idea? Definitely a > > > very intrusive change though, will need a lot of performance testing > > > to make sure we don't break anything. > > > > In fact, we have tried this idea. > > > > The problem is the detach and unmap. > > > > We need to get all DMA Addresses from virtio-ring to unmap. Currently, it > > does > > not support to return the DMA Address, and for SKB, we need to get multiple > > DMA > > Addresses at one time. > > > > This need to modify the logic of Virtio-Ring detach. Besides this, I also > > agree > > with this idea. > > > > Thanks. > > Well you can have a version of get_buf that returns them ... but > it is not clear to me all this is worth it unless you want > to do unsafe tricks like leaving them mapped. Some high speed NIC drivers use this trick for better performance. > I'd leave that > for another day maybe. > > For marking desc as premapped I think we can use a bit from > desc_extra->flags, either reusing one of NEXT,AVAIL,USED, or stealing > another one. Probably. Thanks > > > > > > > > > > > > > > > > > > > > > > > > > > > Signed-off-by: Xuan Zhuo > > > > > --- > > > > > drivers/virtio/virtio_ring.c | 38 > > > > > +++- > > > > > 1 file changed, 29 insertions(+), 9 deletions(-) > > > > > > > > > > diff --git a/drivers/virtio/virtio_ring.c > > > > > b/drivers/virtio/virtio_ring.c > > > > > index e2fc50c05bec..bd5e84afab37 100644 > > > > > --- a/drivers/virtio/virtio_ring.c > > > > > +++ b/drivers/virtio/virtio_ring.c > > > > > @@ -70,6 +70,7 @@ > > > > > struct vring_desc_state_split { > > > > > void *data; /* Data for callback. */ > > > > > struct vring_desc *indir_desc; /* Indirect descriptor, if > > > > > any. */ > > > > > + bool premapped; /* DMA mapping is done by > > > > > driver. */ > > > > > > > > Going back to the original discussion around where this should be > > > > placed. I wonder if we can find a common place to store this since it > > > > has nothing related to virtqueue layout. Maybe desc_extra? And it > > > > would be even better if we can avoid stressing the cache like above. > > > > > > > > > }; > > > > > > > > > > struct vring_desc_state_packed { > > > > > @@ -356,8 +357,14 @@ static struct device *vring_dma_dev(const struct > > > > > vring_virtqueue *vq) > > > > > > > > > > /* Map one sg entry. 
*/ > > > > > static int vring_map_one_sg(const struct vring_virtqueue *vq, struct > > > > > scatterlist *sg, > > > > > - enum dma_data_direction direction, static > > > > > dma_addr_t *addr) > > > > > + enum dma_data_direction direction, > > > > > +
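A sketch of the flag-bit idea floated above, purely illustrative since no such bit exists upstream: desc_extra->flags holds the 16-bit descriptor flags, where NEXT, WRITE, and INDIRECT occupy bits 0-2, leaving room for a software-only marker.

/* Hypothetical software-only flag stored in desc_extra->flags to mark
 * descriptors whose DMA mapping was done by the driver; bit chosen to
 * avoid NEXT(1<<0)/WRITE(1<<1)/INDIRECT(1<<2). Not upstream code. */
#define VRING_DESC_F_PREMAPPED_SW	(1 << 3)

static bool vring_desc_premapped(const struct vring_desc_extra *extra)
{
	return extra->flags & VRING_DESC_F_PREMAPPED_SW;
}

This would avoid growing desc_state and keep the per-descriptor metadata on the cache lines the unmap path already touches.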
Re: [PATCH vhost v9 05/12] virtio_ring: split: virtqueue_add_split() support premapped
On Thu, May 18, 2023 at 3:57 PM Xuan Zhuo wrote: > > On Thu, 18 May 2023 15:54:09 +0800, Jason Wang wrote: > > On Thu, May 18, 2023 at 3:41 PM Xuan Zhuo > > wrote: > > > > > > On Thu, 18 May 2023 03:11:25 -0400, "Michael S. Tsirkin" > > > wrote: > > > > On Thu, May 18, 2023 at 02:51:57PM +0800, Jason Wang wrote: > > > > > On Wed, May 17, 2023 at 10:23 AM Xuan Zhuo > > > > > wrote: > > > > > > > > > > > > virtqueue_add_split() only supports virtual addresses, dma is > > > > > > completed > > > > > > in virtqueue_add_split(). > > > > > > > > > > > > In some scenarios (such as the AF_XDP scenario), the memory is > > > > > > allocated > > > > > > and DMA is completed in advance, so it is necessary for us to > > > > > > support > > > > > > passing the DMA address to virtqueue_add_split(). > > > > > > > > > > > > Record this information in desc_state, we can skip unmap based on > > > > > > this > > > > > > when executing dma unmap. > > > > > > > > > > I would also suggest documenting why a per descriptor metadata is > > > > > needed instead of a per virtqueue one. > > > > > > > > I think we could make it per virtqueue. That would mean all code in > > > > virtio net would have to change to do dma mapping itself instead of > > > > relying on virtio core though. Which is maybe a good idea? Definitely a > > > > very intrusive change though, will need a lot of performance testing > > > > to make sure we don't break anything. > > > > > > In fact, we have tried this idea. > > > > > > The problem is the detach and unmap. > > > > > > We need to get all DMA Addresses from virtio-ring to unmap. Currently, it > > > does > > > not support to return the DMA Address, > > > > I'm not sure I got here, but we've already stored the DMA address in > > desc_extra? > > > I mean we need to get the dma address from the virtio-core to virtio-net. > It probably just requires a new helper. Thanks > Thanks. > > > > > > > and for SKB, we need to get multiple DMA > > > Addresses at one time. > > > > Could you elaborate on this? > > > > Thanks > > > > > > > > This need to modify the logic of Virtio-Ring detach. Besides this, I also > > > agree > > > with this idea. > > > > > > Thanks. > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > Signed-off-by: Xuan Zhuo > > > > > > --- > > > > > > drivers/virtio/virtio_ring.c | 38 > > > > > > +++- > > > > > > 1 file changed, 29 insertions(+), 9 deletions(-) > > > > > > > > > > > > diff --git a/drivers/virtio/virtio_ring.c > > > > > > b/drivers/virtio/virtio_ring.c > > > > > > index e2fc50c05bec..bd5e84afab37 100644 > > > > > > --- a/drivers/virtio/virtio_ring.c > > > > > > +++ b/drivers/virtio/virtio_ring.c > > > > > > @@ -70,6 +70,7 @@ > > > > > > struct vring_desc_state_split { > > > > > > void *data; /* Data for callback. */ > > > > > > struct vring_desc *indir_desc; /* Indirect descriptor, if > > > > > > any. */ > > > > > > + bool premapped; /* DMA mapping is done by > > > > > > driver. */ > > > > > > > > > > Going back to the original discussion around where this should be > > > > > placed. I wonder if we can find a common place to store this since it > > > > > has nothing related to virtqueue layout. Maybe desc_extra? And it > > > > > would be even better if we can avoid stressing the cache like above. > > > > > > > > > > > }; > > > > > > > > > > > > struct vring_desc_state_packed { > > > > > > @@ -356,8 +357,14 @@ static struct device *vring_dma_dev(const > > > > > > struct vring_virtqueue *vq) > > > > > > > > > > > > /* Map one sg entry. */ >
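Since the core already records every mapping in desc_extra, the "new helper" mentioned here could be as small as the following (a sketch of the idea only; no such API exists in virtio_ring today):

/* Hypothetical accessor: let a driver that premapped its buffers read
 * back the address the core stored at add time, so it can unmap the
 * buffer itself after detach. Sketch only, not an existing interface. */
static dma_addr_t virtqueue_get_desc_addr_split(const struct vring_virtqueue *vq,
						unsigned int i)
{
	return vq->split.desc_extra[i].addr;
}

For the skb case raised above, the caller would walk the descriptor chain and call this once per descriptor, which is the "multiple DMA addresses at one time" problem in the thread.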
Re: [PATCH v2 2/3] vhost: support PACKED when setting-getting vring_base
On Thu, May 18, 2023 at 4:38 PM Michael S. Tsirkin wrote: > > On Thu, May 18, 2023 at 03:52:10PM +0800, Jason Wang wrote: > > On Thu, May 18, 2023 at 3:34 PM Stefano Garzarella > > wrote: > > > > > > On Thu, May 18, 2023 at 7:24 AM Jason Wang wrote: > > > > > > > > On Wed, May 17, 2023 at 3:00 PM Stefano Garzarella > > > > wrote: > > > > > > > > > > On Wed, May 17, 2023 at 7:26 AM Jason Wang > > > > > wrote: > > > > > > > > > > > > On Wed, May 17, 2023 at 2:26 AM Shannon Nelson > > > > > > wrote: > > > > > > > > > > > > > > On 5/16/23 12:49 AM, Stefano Garzarella wrote: > > > > > > > > On Mon, May 15, 2023 at 01:41:12PM -0700, Shannon Nelson wrote: > > > > > > > >> On 5/9/23 1:46 AM, Stefano Garzarella wrote: > > > > > > > >>> On Mon, Apr 24, 2023 at 03:50:30PM -0700, Shannon Nelson via > > > > > > > >>> Virtualization wrote: > > > > > > > >>>> Use the right structs for PACKED or split vqs when setting > > > > > > > >>>> and > > > > > > > >>>> getting the vring base. > > > > > > > >>>> > > > > > > > >>>> Signed-off-by: Shannon Nelson > > > > > > > >>>> --- > > > > > > > >>>> drivers/vhost/vhost.c | 18 +- > > > > > > > >>>> drivers/vhost/vhost.h | 8 ++-- > > > > > > > >>>> 2 files changed, 19 insertions(+), 7 deletions(-) > > > > > > > >>>> > > > > > > > >>>> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c > > > > > > > >>>> index f11bdbe4c2c5..f64efda48f21 100644 > > > > > > > >>>> --- a/drivers/vhost/vhost.c > > > > > > > >>>> +++ b/drivers/vhost/vhost.c > > > > > > > >>>> @@ -1633,17 +1633,25 @@ long vhost_vring_ioctl(struct > > > > > > > >>>> vhost_dev > > > > > > > >>>> *d, unsigned int ioctl, void __user *arg > > > > > > > >>>> r = -EFAULT; > > > > > > > >>>> break; > > > > > > > >>>> } > > > > > > > >>>> - if (s.num > 0xffff) { > > > > > > > >>>> - r = -EINVAL; > > > > > > > >>>> - break; > > > > > > > >>>> + if (vhost_has_feature(vq, > > > > > > > >>>> VIRTIO_F_RING_PACKED)) { > > > > > > > >>>> + vq->last_avail_idx = s.num & 0xffff; > > > > > > > >>>> + vq->last_used_idx = (s.num >> 16) & > > > > > > > >>>> 0xffff; > > > > > > > >>>> + } else { > > > > > > > >>>> + if (s.num > 0xffff) { > > > > > > > >>>> + r = -EINVAL; > > > > > > > >>>> + break; > > > > > > > >>>> + } > > > > > > > >>>> + vq->last_avail_idx = s.num; > > > > > > > >>>> } > > > > > > > >>>> - vq->last_avail_idx = s.num; > > > > > > > >>>> /* Forget the cached index value. */ > > > > > > > >>>> vq->avail_idx = vq->last_avail_idx; > > > > > > > >>>> break; > > > > > > > >>>> case VHOST_GET_VRING_BASE: > > > > > > > >>>> s.index = idx; > > > > > > > >>>> - s.num = vq->last_avail_idx; > > > > > > > >>>> + if (vhost_has_feature(vq, > > > > > > > >>>> VIRTIO_F_RING_PACKED)) > > > > &
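The encoding in the hunk above is compact enough to restate as a standalone sketch: VHOST_SET/GET_VRING_BASE keeps using the single 32-bit num field of struct vhost_vring_state, with the low 16 bits carrying last_avail_idx and the high 16 bits carrying last_used_idx (on packed rings each index conventionally carries its wrap counter in bit 15):

#include <stdint.h>

/* Pack/unpack sketch matching the quoted vhost_vring_ioctl() change. */
static inline uint32_t packed_vring_base_encode(uint16_t last_avail_idx,
						uint16_t last_used_idx)
{
	return ((uint32_t)last_used_idx << 16) | last_avail_idx;
}

static inline void packed_vring_base_decode(uint32_t num,
					    uint16_t *last_avail_idx,
					    uint16_t *last_used_idx)
{
	*last_avail_idx = num & 0xffff;
	*last_used_idx = (num >> 16) & 0xffff;
}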
Re: [RFC PATCH v2 3/3] PCI: endpoint: Add EP function driver to provide virtio-console functionality
On Thu, May 18, 2023 at 5:54 PM Shunsuke Mie wrote: > > Gentle ping ... > > > Thanks, > > Shunsuke. > > On 2023/05/10 12:17, Shunsuke Mie wrote: > > Hi Jason, > > On May 8, 2023 (Mon) 13:03, Jason Wang wrote: > >> On Thu, Apr 27, 2023 at 6:44 PM Shunsuke Mie wrote: > >>> Add a new PCIe endpoint function driver that works as a pci virtio-console > >>> device. The console connects to the endpoint-side console, enabling > >>> communication between the PCIe host and endpoint. > >>> > >>> The architecture is as follows: > >>> > >>> ┌┐ ┌──┬┐ > >>> │virtio │ │ │virtio │ > >>> │console drv │ ├───┐ │console drv │ > >>> ├┤ │(virtio console│ ├┤ > >>> │ virtio bus │ │ device) │◄►│ virtio bus │ > >>> ├┤ ├---┤ └┤ > >>> ││ │ pci ep virtio │ │ > >>> │ pci bus │ │ console drv │ │ > >>> ││ pcie ├───┤ │ > >>> ││ ◄─► │ pci ep Bus │ │ > >>> └┘ └───┴───┘ > >>> PCIe Root PCIe Endpoint > >>> > >> I think it might only work for peer devices like: > >> net, console or vsock. > > Could you tell me what "peer devices" means? I meant, for example we know in the case of virtio-net, TX can talk with RX belonging to another device directly. But this is not the case for other devices like virtio-blk. > > > >> So there're many choices here, I'd like to know what's the reason for > >> you to implement a mediation. > >> > >> An alternative is to implement a dedicated net, console and vsock > >> driver for vringh (CAIF somehow works like this). This would have > >> better performance. > > Does it mean that the driver also functions as a network driver directly? I meant, e.g. in the case of networking, you can have a dedicated driver with two vringh in the endpoint side. The benefit is the performance, no need for the (datapath) mediation. But if we don't care about the performance, this proposal seems to be fine. Thanks > >> > >>> This driver has two roles. The first is as a PCIe endpoint virtio console > >>> function, which is implemented using the PCIe endpoint framework and PCIe > >>> EP virtio helpers. The second is as a virtual virtio console device > >>> connected to the virtio bus on PCIe endpoint Linux. > >>> > >>> Communication between the two is achieved by copying the virtqueue data > >>> between the PCIe root and endpoint. > >>> > >>> This is a simple implementation and does not include features of > >>> virtio-console such as MULTIPORT, EMERG_WRITE, etc. As a result, each > >>> virtio console driver only displays /dev/hvc0. > >>> > >>> As an example of usage, by setting getty to /dev/hvc0, it is possible to > >>> log in to another host. > >>> > >>> Signed-off-by: Shunsuke Mie > >>> --- > >>> Changes from v2: > >>> - Change to use copy functions between kiovs of pci-epf-virtio. > >>> > >>> drivers/pci/endpoint/functions/Kconfig| 12 + > >>> drivers/pci/endpoint/functions/Makefile | 1 + > >>> drivers/pci/endpoint/functions/pci-epf-vcon.c | 596 ++ > >>> 3 files changed, 609 insertions(+) > >>> create mode 100644 drivers/pci/endpoint/functions/pci-epf-vcon.c > >>> > >>> diff --git a/drivers/pci/endpoint/functions/Kconfig > >>> b/drivers/pci/endpoint/functions/Kconfig > >>> index fa1a6a569a8f..9ce2698b67e1 100644 > >>> --- a/drivers/pci/endpoint/functions/Kconfig > >>> +++ b/drivers/pci/endpoint/functions/Kconfig > >>> @@ -44,3 +44,15 @@ config PCI_EPF_VIRTIO > >>> select VHOST_RING_IOMEM > >>> help > >>>Helpers to implement PCI virtio Endpoint function > >>> + > >>> +config PCI_EPF_VCON > >>> + tristate "PCI Endpoint virtio-console driver" > >>> + depends on PCI_ENDPOINT > >>> + select VHOST_RING >
Re: [PATCH vhost v9 05/12] virtio_ring: split: virtqueue_add_split() support premapped
On 2023/5/18 17:49, Michael S. Tsirkin wrote: On Thu, May 18, 2023 at 05:14:03PM +0800, Xuan Zhuo wrote: On Thu, 18 May 2023 16:57:37 +0800, Jason Wang wrote: On Thu, May 18, 2023 at 4:29 PM Michael S. Tsirkin wrote: On Thu, May 18, 2023 at 03:33:52PM +0800, Xuan Zhuo wrote: On Thu, 18 May 2023 03:11:25 -0400, "Michael S. Tsirkin" wrote: On Thu, May 18, 2023 at 02:51:57PM +0800, Jason Wang wrote: On Wed, May 17, 2023 at 10:23 AM Xuan Zhuo wrote: virtqueue_add_split() only supports virtual addresses, dma is completed in virtqueue_add_split(). In some scenarios (such as the AF_XDP scenario), the memory is allocated and DMA is completed in advance, so it is necessary for us to support passing the DMA address to virtqueue_add_split(). Record this information in desc_state, we can skip unmap based on this when executing dma unmap. I would also suggest documenting why a per descriptor metadata is needed instead of a per virtqueue one. I think we could make it per virtqueue. That would mean all code in virtio net would have to change to do dma mapping itself instead of relying on virtio core though. Which is maybe a good idea? Definitely a very intrusive change though, will need a lot of performance testing to make sure we don't break anything. In fact, we have tried this idea. The problem is the detach and unmap. We need to get all DMA Addresses from virtio-ring to unmap. Currently, it does not support to return the DMA Address, and for SKB, we need to get multiple DMA Addresses at one time. This need to modify the logic of Virtio-Ring detach. Besides this, I also agree with this idea. Thanks. Well you can have a version of get_buf that returns them ... but it is not clear to me all this is worth it unless you want to do unsafe tricks like leaving them mapped. Some high speed NIC drivers use this trick for better performance. Interesting, this is the first time I know this. Is there any problem? depends - if you are relying on the IOMMU then yes - malicious hardware can steal guest secrets or corrupt memory since it's a hack not properly integrated with linux and there's no real control preventing linux from reusing this memory for something unrelated. The pages are pre-allocated/mapped buffers for RX. So it should be fine. Thanks If instead you are using something like bounce buffers then no, but OTOH bounce buffers are already expensive so you might not see a lot of benefit. So, is having virtio-net master the DMA operations by itself the right way? Thanks I am fine with the approach taken for now. And look at reducing cost of dma map/unmap later. I'd leave that for another day maybe. For marking desc as premapped I think we can use a bit from desc_extra->flags, either reusing one of NEXT,AVAIL,USED, or stealing another one. Probably. Thanks Signed-off-by: Xuan Zhuo --- drivers/virtio/virtio_ring.c | 38 +++- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index e2fc50c05bec..bd5e84afab37 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -70,6 +70,7 @@ struct vring_desc_state_split { void *data; /* Data for callback. */ struct vring_desc *indir_desc; /* Indirect descriptor, if any. */ + bool premapped; /* DMA mapping is done by driver. */ Going back to the original discussion around where this should be placed. I wonder if we can find a common place to store this since it has nothing related to virtqueue layout. Maybe desc_extra?
And it would be even better if we can avoid stressing the cache like above. }; struct vring_desc_state_packed { @@ -356,8 +357,14 @@ static struct device *vring_dma_dev(const struct vring_virtqueue *vq) /* Map one sg entry. */ static int vring_map_one_sg(const struct vring_virtqueue *vq, struct scatterlist *sg, - enum dma_data_direction direction, static dma_addr_t *addr) + enum dma_data_direction direction, + bool premapped, dma_addr_t *addr) having things like: int func(bool do) { if (!do) return; } is a hint that the check needs to be done by the caller? And this change should work for both packed and split. I think we need to squash the packed changes here. Looking at how packed virtqueue uses this in this patch, I don't think this patch can even be built. I will wait for a new version and continue the review from there. Thanks { + if (premapped) { + *addr = sg_dma_address(sg); + return 0; + } + if (!vq->use_dma_api) { /* * If DMA is not used, KMSAN doesn't know that the scatterlist @@ -445,7 +452,7 @@ static void vring_unmap_one_split_indirect(const struct vring_virtqueue *vq, } s
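Michael's suggestion to steal a bit from desc_extra->flags instead of growing desc_state could look roughly like the sketch below; VRING_DESC_F_PREMAPPED and the choice of bit 14 are assumptions for illustration, not an existing flag:

#include <stdbool.h>
#include <stdint.h>

#define VRING_DESC_F_PREMAPPED	(1 << 14)	/* hypothetical private bit */

/* Mirrors the layout of the in-kernel struct vring_desc_extra. */
struct vring_desc_extra_sketch {
	uint64_t addr;		/* DMA address recorded at map time */
	uint32_t len;
	uint16_t flags;		/* NEXT/WRITE/INDIRECT plus private bits */
	uint16_t next;
};

static bool need_unmap(const struct vring_desc_extra_sketch *extra)
{
	/* The core unmaps only the mappings it created itself. */
	return !(extra->flags & VRING_DESC_F_PREMAPPED);
}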
Re: [PATCH vhost v9 05/12] virtio_ring: split: virtqueue_add_split() support premapped
On Fri, May 19, 2023 at 11:33 AM Xuan Zhuo wrote: > > On Thu, 18 May 2023 13:12:49 -0400, "Michael S. Tsirkin" > wrote: > > On Thu, May 18, 2023 at 08:22:14PM +0800, Xuan Zhuo wrote: > > > On Thu, 18 May 2023 05:49:46 -0400, "Michael S. Tsirkin" > > > wrote: > > > > On Thu, May 18, 2023 at 05:14:03PM +0800, Xuan Zhuo wrote: > > > > > On Thu, 18 May 2023 16:57:37 +0800, Jason Wang > > > > > wrote: > > > > > > On Thu, May 18, 2023 at 4:29 PM Michael S. Tsirkin > > > > > > wrote: > > > > > > > > > > > > > > On Thu, May 18, 2023 at 03:33:52PM +0800, Xuan Zhuo wrote: > > > > > > > > On Thu, 18 May 2023 03:11:25 -0400, "Michael S. Tsirkin" > > > > > > > > wrote: > > > > > > > > > On Thu, May 18, 2023 at 02:51:57PM +0800, Jason Wang wrote: > > > > > > > > > > On Wed, May 17, 2023 at 10:23 AM Xuan Zhuo > > > > > > > > > > wrote: > > > > > > > > > > > > > > > > > > > > > > virtqueue_add_split() only supports virtual addresses, > > > > > > > > > > > dma is completed > > > > > > > > > > > in virtqueue_add_split(). > > > > > > > > > > > > > > > > > > > > > > In some scenarios (such as the AF_XDP scenario), the > > > > > > > > > > > memory is allocated > > > > > > > > > > > and DMA is completed in advance, so it is necessary for > > > > > > > > > > > us to support > > > > > > > > > > > passing the DMA address to virtqueue_add_split(). > > > > > > > > > > > > > > > > > > > > > > Record this information in desc_state, we can skip unmap > > > > > > > > > > > based on this > > > > > > > > > > > when executing dma unmap. > > > > > > > > > > > > > > > > > > > > I would also suggest documenting why a per descriptor > > > > > > > > > > metadata is > > > > > > > > > > needed instead of a per virtqueue one. > > > > > > > > > > > > > > > > > > I think we could make it per virtqueue. That would mean all > > > > > > > > > code in > > > > > > > > > virtio net would have to change to do dma mapping itself > > > > > > > > > instead of > > > > > > > > > relying on virtio core though. Which is maybe a good idea? > > > > > > > > > Definitely a > > > > > > > > > very intrusive change though, will need a lot of performance > > > > > > > > > testing > > > > > > > > > to make sure we don't break anything. > > > > > > > > > > > > > > > > In fact, we have tried this idea. > > > > > > > > > > > > > > > > The problem is the detach and unmap. > > > > > > > > > > > > > > > > We need to get all DMA Addresses from virtio-ring to unmap. > > > > > > > > Currently, it does > > > > > > > > not support to return the DMA Address, and for SKB, we need to > > > > > > > > get multiple DMA > > > > > > > > Addresses at one time. > > > > > > > > > > > > > > > > This need to modify the logic of Virtio-Ring detach. Besides > > > > > > > > this, I also agree > > > > > > > > with this idea. > > > > > > > > > > > > > > > > Thanks. > > > > > > > > > > > > > > Well you can have a version of get_buf that returns them ... but > > > > > > > it is not clear to me all this is worth it unless you want > > > > > > > to do unsafe tricks like leaving them mapped. > > > > > > > > > > > > Some high speed NIC drivers use this trick for better performance. > > > > > > > > > > > > > > > Interesting, this is the first time I know this. Is there any problem? > &
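Michael's "version of get_buf that returns them" could, purely as a sketch, look like the following; virtqueue_get_buf_dma() is hypothetical (no such core API exists today), and the multi-address case Xuan mentions for SKBs is why it would need an array rather than a single out-parameter:

/* Hypothetical API sketch only: a get_buf variant that also reports the
 * DMA addresses the core recorded in desc_extra, so a driver doing its
 * own unmapping could retrieve the whole descriptor chain at once.
 */
void *virtqueue_get_buf_dma(struct virtqueue *vq, unsigned int *len,
			    dma_addr_t addrs[], unsigned int *num_addrs);

static void *get_and_unmap_sketch(struct virtqueue *vq, struct device *dev,
				  unsigned int *len)
{
	dma_addr_t addrs[MAX_SKB_FRAGS + 2];
	unsigned int i, n = ARRAY_SIZE(addrs);
	void *buf = virtqueue_get_buf_dma(vq, len, addrs, &n);

	/* Illustrative: real code would track the size of each mapping. */
	for (i = 0; buf && i < n; i++)
		dma_unmap_page(dev, addrs[i], PAGE_SIZE, DMA_FROM_DEVICE);

	return buf;
}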
Re: [PATCH V2 1/5] vDPA/ifcvf: virt queue ops take immediate actions
On Mon, May 8, 2023 at 6:05 PM Zhu Lingshan wrote: > > In this commit, virtqueue operations including: > set_vq_num(), set_vq_address(), set_vq_ready() > and get_vq_ready() access PCI registers directly > to take immediate actions. > > Signed-off-by: Zhu Lingshan Acked-by: Jason Wang Thanks > --- > drivers/vdpa/ifcvf/ifcvf_base.c | 58 - > drivers/vdpa/ifcvf/ifcvf_base.h | 10 +++--- > drivers/vdpa/ifcvf/ifcvf_main.c | 16 +++-- > 3 files changed, 45 insertions(+), 39 deletions(-) > > diff --git a/drivers/vdpa/ifcvf/ifcvf_base.c b/drivers/vdpa/ifcvf/ifcvf_base.c > index 5563b3a773c7..6c5650f73007 100644 > --- a/drivers/vdpa/ifcvf/ifcvf_base.c > +++ b/drivers/vdpa/ifcvf/ifcvf_base.c > @@ -329,31 +329,49 @@ int ifcvf_set_vq_state(struct ifcvf_hw *hw, u16 qid, > u16 num) > return 0; > } > > -static int ifcvf_hw_enable(struct ifcvf_hw *hw) > +void ifcvf_set_vq_num(struct ifcvf_hw *hw, u16 qid, u32 num) > { > - struct virtio_pci_common_cfg __iomem *cfg; > - u32 i; > + struct virtio_pci_common_cfg __iomem *cfg = hw->common_cfg; > > - cfg = hw->common_cfg; > - for (i = 0; i < hw->nr_vring; i++) { > - if (!hw->vring[i].ready) > - break; > + vp_iowrite16(qid, &cfg->queue_select); > + vp_iowrite16(num, &cfg->queue_size); > +} > > - vp_iowrite16(i, &cfg->queue_select); > - vp_iowrite64_twopart(hw->vring[i].desc, &cfg->queue_desc_lo, > -&cfg->queue_desc_hi); > - vp_iowrite64_twopart(hw->vring[i].avail, &cfg->queue_avail_lo, > - &cfg->queue_avail_hi); > - vp_iowrite64_twopart(hw->vring[i].used, &cfg->queue_used_lo, > -&cfg->queue_used_hi); > - vp_iowrite16(hw->vring[i].size, &cfg->queue_size); > - ifcvf_set_vq_state(hw, i, hw->vring[i].last_avail_idx); > - vp_iowrite16(1, &cfg->queue_enable); > - } > +int ifcvf_set_vq_address(struct ifcvf_hw *hw, u16 qid, u64 desc_area, > +u64 driver_area, u64 device_area) > +{ > + struct virtio_pci_common_cfg __iomem *cfg = hw->common_cfg; > + > + vp_iowrite16(qid, &cfg->queue_select); > + vp_iowrite64_twopart(desc_area, &cfg->queue_desc_lo, > +&cfg->queue_desc_hi); > + vp_iowrite64_twopart(driver_area, &cfg->queue_avail_lo, > +&cfg->queue_avail_hi); > + vp_iowrite64_twopart(device_area, &cfg->queue_used_lo, > +&cfg->queue_used_hi); > > return 0; > } > > +bool ifcvf_get_vq_ready(struct ifcvf_hw *hw, u16 qid) > +{ > + struct virtio_pci_common_cfg __iomem *cfg = hw->common_cfg; > + u16 queue_enable; > + > + vp_iowrite16(qid, &cfg->queue_select); > + queue_enable = vp_ioread16(&cfg->queue_enable); > + > + return (bool)queue_enable; > +} > + > +void ifcvf_set_vq_ready(struct ifcvf_hw *hw, u16 qid, bool ready) > +{ > + struct virtio_pci_common_cfg __iomem *cfg = hw->common_cfg; > + > + vp_iowrite16(qid, &cfg->queue_select); > + vp_iowrite16(ready, &cfg->queue_enable); > +} > + > static void ifcvf_hw_disable(struct ifcvf_hw *hw) > { > u32 i; > @@ -366,16 +384,12 @@ static void ifcvf_hw_disable(struct ifcvf_hw *hw) > > int ifcvf_start_hw(struct ifcvf_hw *hw) > { > - ifcvf_reset(hw); > ifcvf_add_status(hw, VIRTIO_CONFIG_S_ACKNOWLEDGE); > ifcvf_add_status(hw, VIRTIO_CONFIG_S_DRIVER); > > if (ifcvf_config_features(hw) < 0) > return -EINVAL; > > - if (ifcvf_hw_enable(hw) < 0) > - return -EINVAL; > - > ifcvf_add_status(hw, VIRTIO_CONFIG_S_DRIVER_OK); > > return 0; > diff --git a/drivers/vdpa/ifcvf/ifcvf_base.h b/drivers/vdpa/ifcvf/ifcvf_base.h > index c20d1c40214e..d545a9411143 100644 > --- a/drivers/vdpa/ifcvf/ifcvf_base.h > +++ b/drivers/vdpa/ifcvf/ifcvf_base.h > @@ -47,12 +47,7 @@ > #define MSIX_VECTOR_DEV_SHARED 3 > > struct vring_info { > - u64 desc; > - u64 avail; > - u64 
used; > - u16 size; > u16 last_avail_idx; > - bool ready; > void __iomem *notify_addr; > phys_addr_t notify_pa; > u32 irq; > @@ -137,4 +132,9 @@ int ifcvf_probed_virtio_
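For reference, the vp_iowrite64_twopart() calls in the patch above exist because the virtio-pci common configuration has no 64-bit registers: each queue address is exposed as a lo/hi pair of 32-bit registers. A sketch of the split, with plain stores standing in for the real vp_iowrite32() MMIO accessor:

#include <stdint.h>

/* Program a 64-bit value into a lo/hi pair of 32-bit registers. */
static void write64_twopart_sketch(uint64_t val,
				   volatile uint32_t *lo,
				   volatile uint32_t *hi)
{
	*lo = (uint32_t)val;		/* e.g. &cfg->queue_desc_lo */
	*hi = (uint32_t)(val >> 32);	/* e.g. &cfg->queue_desc_hi */
}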
Re: [PATCH V2 2/5] vDPA/ifcvf: get_driver_features from virtio registers
On Mon, May 8, 2023 at 6:05 PM Zhu Lingshan wrote: > > This commit implements a new function ifcvf_get_driver_feature() > which read driver_features from virtio registers. > > To be less ambiguous, ifcvf_set_features() is renamed to > ifcvf_set_driver_features(), and ifcvf_get_features() > is renamed to ifcvf_get_dev_features() which returns > the provisioned vDPA device features. > > Signed-off-by: Zhu Lingshan Acked-by: Jason Wang Thanks > --- > drivers/vdpa/ifcvf/ifcvf_base.c | 38 + > drivers/vdpa/ifcvf/ifcvf_base.h | 5 +++-- > drivers/vdpa/ifcvf/ifcvf_main.c | 9 +--- > 3 files changed, 29 insertions(+), 23 deletions(-) > > diff --git a/drivers/vdpa/ifcvf/ifcvf_base.c b/drivers/vdpa/ifcvf/ifcvf_base.c > index 6c5650f73007..546e923bcd16 100644 > --- a/drivers/vdpa/ifcvf/ifcvf_base.c > +++ b/drivers/vdpa/ifcvf/ifcvf_base.c > @@ -204,11 +204,29 @@ u64 ifcvf_get_hw_features(struct ifcvf_hw *hw) > return features; > } > > -u64 ifcvf_get_features(struct ifcvf_hw *hw) > +/* return provisioned vDPA dev features */ > +u64 ifcvf_get_dev_features(struct ifcvf_hw *hw) > { > return hw->dev_features; > } > > +u64 ifcvf_get_driver_features(struct ifcvf_hw *hw) > +{ > + struct virtio_pci_common_cfg __iomem *cfg = hw->common_cfg; > + u32 features_lo, features_hi; > + u64 features; > + > + vp_iowrite32(0, &cfg->device_feature_select); > + features_lo = vp_ioread32(&cfg->guest_feature); > + > + vp_iowrite32(1, &cfg->device_feature_select); > + features_hi = vp_ioread32(&cfg->guest_feature); > + > + features = ((u64)features_hi << 32) | features_lo; > + > + return features; > +} > + > int ifcvf_verify_min_features(struct ifcvf_hw *hw, u64 features) > { > if (!(features & BIT_ULL(VIRTIO_F_ACCESS_PLATFORM)) && features) { > @@ -275,7 +293,7 @@ void ifcvf_write_dev_config(struct ifcvf_hw *hw, u64 > offset, > vp_iowrite8(*p++, hw->dev_cfg + offset + i); > } > > -static void ifcvf_set_features(struct ifcvf_hw *hw, u64 features) > +void ifcvf_set_driver_features(struct ifcvf_hw *hw, u64 features) > { > struct virtio_pci_common_cfg __iomem *cfg = hw->common_cfg; > > @@ -286,19 +304,6 @@ static void ifcvf_set_features(struct ifcvf_hw *hw, u64 > features) > vp_iowrite32(features >> 32, &cfg->guest_feature); > } > > -static int ifcvf_config_features(struct ifcvf_hw *hw) > -{ > - ifcvf_set_features(hw, hw->req_features); > - ifcvf_add_status(hw, VIRTIO_CONFIG_S_FEATURES_OK); > - > - if (!(ifcvf_get_status(hw) & VIRTIO_CONFIG_S_FEATURES_OK)) { > - IFCVF_ERR(hw->pdev, "Failed to set FEATURES_OK status\n"); > - return -EIO; > - } > - > - return 0; > -} > - > u16 ifcvf_get_vq_state(struct ifcvf_hw *hw, u16 qid) > { > struct ifcvf_lm_cfg __iomem *ifcvf_lm; > @@ -387,9 +392,6 @@ int ifcvf_start_hw(struct ifcvf_hw *hw) > ifcvf_add_status(hw, VIRTIO_CONFIG_S_ACKNOWLEDGE); > ifcvf_add_status(hw, VIRTIO_CONFIG_S_DRIVER); > > - if (ifcvf_config_features(hw) < 0) > - return -EINVAL; > - > ifcvf_add_status(hw, VIRTIO_CONFIG_S_DRIVER_OK); > > return 0; > diff --git a/drivers/vdpa/ifcvf/ifcvf_base.h b/drivers/vdpa/ifcvf/ifcvf_base.h > index d545a9411143..cb19196c3ece 100644 > --- a/drivers/vdpa/ifcvf/ifcvf_base.h > +++ b/drivers/vdpa/ifcvf/ifcvf_base.h > @@ -69,7 +69,6 @@ struct ifcvf_hw { > phys_addr_t notify_base_pa; > u32 notify_off_multiplier; > u32 dev_type; > - u64 req_features; > u64 hw_features; > /* provisioned device features */ > u64 dev_features; > @@ -122,7 +121,7 @@ u8 ifcvf_get_status(struct ifcvf_hw *hw); > void ifcvf_set_status(struct ifcvf_hw *hw, u8 status); > void io_write64_twopart(u64 val, u32 *lo, u32 *hi); > 
void ifcvf_reset(struct ifcvf_hw *hw); > -u64 ifcvf_get_features(struct ifcvf_hw *hw); > +u64 ifcvf_get_dev_features(struct ifcvf_hw *hw); > u64 ifcvf_get_hw_features(struct ifcvf_hw *hw); > int ifcvf_verify_min_features(struct ifcvf_hw *hw, u64 features); > u16 ifcvf_get_vq_state(struct ifcvf_hw *hw, u16 qid); > @@ -137,4 +136,6 @@ int ifcvf_set_vq_address(struct ifcvf_hw *hw, u16 qid, > u64 desc_area, > u64 driver_area, u64 device_area); > bool ifcvf_get_vq_ready(struct ifcvf_hw *hw, u16 qid); > void ifcvf_set_vq_ready(struct ifcvf_hw *hw, u16 qid, bool ready); > +void ifcvf_set_driver_features(struct ifc
Re: [PATCH V2 4/5] vDPA/ifcvf: synchronize irqs in the reset routine
On Mon, May 8, 2023 at 6:05 PM Zhu Lingshan wrote: > > This commit synchronize irqs of the virtqueues > and config space in the reset routine. > Thus ifcvf_stop_hw() and reset() are refactored as well. > > Signed-off-by: Zhu Lingshan > --- > drivers/vdpa/ifcvf/ifcvf_base.c | 41 + > drivers/vdpa/ifcvf/ifcvf_base.h | 1 + > drivers/vdpa/ifcvf/ifcvf_main.c | 46 + > 3 files changed, 38 insertions(+), 50 deletions(-) > > diff --git a/drivers/vdpa/ifcvf/ifcvf_base.c b/drivers/vdpa/ifcvf/ifcvf_base.c > index 79e313c5e10e..1f39290baa38 100644 > --- a/drivers/vdpa/ifcvf/ifcvf_base.c > +++ b/drivers/vdpa/ifcvf/ifcvf_base.c > @@ -170,12 +170,9 @@ void ifcvf_set_status(struct ifcvf_hw *hw, u8 status) > > void ifcvf_reset(struct ifcvf_hw *hw) > { > - hw->config_cb.callback = NULL; > - hw->config_cb.private = NULL; > - > ifcvf_set_status(hw, 0); > - /* flush set_status, make sure VF is stopped, reset */ > - ifcvf_get_status(hw); > + while (ifcvf_get_status(hw)) > + msleep(1); > } > > u64 ifcvf_get_hw_features(struct ifcvf_hw *hw) > @@ -368,20 +365,42 @@ void ifcvf_set_vq_ready(struct ifcvf_hw *hw, u16 qid, > bool ready) > vp_iowrite16(ready, &cfg->queue_enable); > } > > -static void ifcvf_hw_disable(struct ifcvf_hw *hw) > +static void ifcvf_reset_vring(struct ifcvf_hw *hw) > { > - u32 i; > + u16 qid; > + > + for (qid = 0; qid < hw->nr_vring; qid++) { > + hw->vring[qid].cb.callback = NULL; > + hw->vring[qid].cb.private = NULL; > + ifcvf_set_vq_vector(hw, qid, VIRTIO_MSI_NO_VECTOR); > + } > +} > > +static void ifcvf_reset_config_handler(struct ifcvf_hw *hw) > +{ > + hw->config_cb.callback = NULL; > + hw->config_cb.private = NULL; > ifcvf_set_config_vector(hw, VIRTIO_MSI_NO_VECTOR); > - for (i = 0; i < hw->nr_vring; i++) { > - ifcvf_set_vq_vector(hw, i, VIRTIO_MSI_NO_VECTOR); > +} > + > +static void ifcvf_synchronize_irq(struct ifcvf_hw *hw) > +{ > + u32 nvectors = hw->num_msix_vectors; > + struct pci_dev *pdev = hw->pdev; > + int i, irq; > + > + for (i = 0; i < nvectors; i++) { > + irq = pci_irq_vector(pdev, i); > + if (irq >= 0) > + synchronize_irq(irq); > } > } > > void ifcvf_stop_hw(struct ifcvf_hw *hw) > { > - ifcvf_hw_disable(hw); > - ifcvf_reset(hw); > + ifcvf_synchronize_irq(hw); > + ifcvf_reset_vring(hw); > + ifcvf_reset_config_handler(hw); Nit: So the name of this function is kind of misleading since irq synchronization and virtqueue/config handler are not belong to hardware? Maybe it would be better to call it ifcvf_stop(). 
Thanks > } > > void ifcvf_notify_queue(struct ifcvf_hw *hw, u16 qid) > diff --git a/drivers/vdpa/ifcvf/ifcvf_base.h b/drivers/vdpa/ifcvf/ifcvf_base.h > index d34d3bc0dbf4..7430f80779be 100644 > --- a/drivers/vdpa/ifcvf/ifcvf_base.h > +++ b/drivers/vdpa/ifcvf/ifcvf_base.h > @@ -82,6 +82,7 @@ struct ifcvf_hw { > int vqs_reused_irq; > u16 nr_vring; > /* VIRTIO_PCI_CAP_DEVICE_CFG size */ > + u32 num_msix_vectors; > u32 cap_dev_config_size; > struct pci_dev *pdev; > }; > diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c > index 968687159e44..3401b9901dd2 100644 > --- a/drivers/vdpa/ifcvf/ifcvf_main.c > +++ b/drivers/vdpa/ifcvf/ifcvf_main.c > @@ -125,6 +125,7 @@ static void ifcvf_free_irq(struct ifcvf_hw *vf) > ifcvf_free_vq_irq(vf); > ifcvf_free_config_irq(vf); > ifcvf_free_irq_vectors(pdev); > + vf->num_msix_vectors = 0; > } > > /* ifcvf MSIX vectors allocator, this helper tries to allocate > @@ -343,36 +344,11 @@ static int ifcvf_request_irq(struct ifcvf_hw *vf) > if (ret) > return ret; > > - return 0; > -} > - > -static int ifcvf_stop_datapath(struct ifcvf_adapter *adapter) > -{ > - struct ifcvf_hw *vf = adapter->vf; > - int i; > - > - for (i = 0; i < vf->nr_vring; i++) > - vf->vring[i].cb.callback = NULL; > - > - ifcvf_stop_hw(vf); > + vf->num_msix_vectors = nvectors; > > return 0; > } > > -static void ifcvf_reset_vring(struct ifcvf_adapter *adapter) > -{ > - struct ifcvf_hw *vf = adapter->vf; > - int i; > - > - for (i = 0; i < vf->nr_vring; i++) { > - vf->vring[i].last_avail_idx = 0; > - vf->vring[i].cb.callback = NULL; > - vf->vring[i].cb.private = NULL; > - } > - > - ifcvf_reset(vf); > -} > - > static struct ifcvf_adapter *vdpa_to_adapter(struct vdpa_device *vdpa_dev) > { > return container_of(vdpa_dev, struct ifcvf_adapter, vdpa); > @@ -462,23 +438,15 @@ static void ifcvf_vdpa_set_status(struct vdpa_device > *vdpa_dev, u8 status) > > static int ifcvf_vdpa_reset(stru
[PATCH V3 net-next 0/2] virtio-net: don't busy poll for cvq command
Hi all: The code used to busy poll for the cvq command, which turns out to have several side effects: 1) infinite polling for buggy devices 2) bad interaction with the scheduler So this series tries to use cond_resched() in the waiting loop. Before doing this, we first need to make sure the cvq command is not executed in an atomic environment, so we first convert rx mode handling to a workqueue. Please review. Thanks Changes since V2: - Don't use interrupt but cond_resched() Changes since V1: - use RTNL to synchronize rx mode worker - use completion for simplicity - don't try to harden CVQ command Changes since RFC: - switch to use BAD_RING in virtio_break_device() - check virtqueue_is_broken() after being woken up - use more_used() instead of virtqueue_get_buf() to allow caller to get buffers afterwards - break the virtio-net device when timeout - get buffer manually since the virtio core checks more_used() instead Jason Wang (2): virtio-net: convert rx mode setting to use workqueue virtio-net: add cond_resched() to the command waiting loop drivers/net/virtio_net.c | 59 +--- 1 file changed, 55 insertions(+), 4 deletions(-) -- 2.25.1
[PATCH V3 net-next 1/2] virtio-net: convert rx mode setting to use workqueue
This patch convert rx mode setting to be done in a workqueue, this is a must for allow to sleep when waiting for the cvq command to response since current code is executed under addr spin lock. Signed-off-by: Jason Wang --- Changes since V1: - use RTNL to synchronize rx mode worker --- drivers/net/virtio_net.c | 55 +--- 1 file changed, 52 insertions(+), 3 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 56ca1d270304..5d2f1da4eaa0 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -265,6 +265,12 @@ struct virtnet_info { /* Work struct for config space updates */ struct work_struct config_work; + /* Work struct for config rx mode */ + struct work_struct rx_mode_work; + + /* Is rx mode work enabled? */ + bool rx_mode_work_enabled; + /* Does the affinity hint is set for virtqueues? */ bool affinity_hint_set; @@ -388,6 +394,20 @@ static void disable_delayed_refill(struct virtnet_info *vi) spin_unlock_bh(&vi->refill_lock); } +static void enable_rx_mode_work(struct virtnet_info *vi) +{ + rtnl_lock(); + vi->rx_mode_work_enabled = true; + rtnl_unlock(); +} + +static void disable_rx_mode_work(struct virtnet_info *vi) +{ + rtnl_lock(); + vi->rx_mode_work_enabled = false; + rtnl_unlock(); +} + static void virtqueue_napi_schedule(struct napi_struct *napi, struct virtqueue *vq) { @@ -2341,9 +2361,11 @@ static int virtnet_close(struct net_device *dev) return 0; } -static void virtnet_set_rx_mode(struct net_device *dev) +static void virtnet_rx_mode_work(struct work_struct *work) { - struct virtnet_info *vi = netdev_priv(dev); + struct virtnet_info *vi = + container_of(work, struct virtnet_info, rx_mode_work); + struct net_device *dev = vi->dev; struct scatterlist sg[2]; struct virtio_net_ctrl_mac *mac_data; struct netdev_hw_addr *ha; @@ -2356,6 +2378,8 @@ static void virtnet_set_rx_mode(struct net_device *dev) if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_RX)) return; + rtnl_lock(); + vi->ctrl->promisc = ((dev->flags & IFF_PROMISC) != 0); vi->ctrl->allmulti = ((dev->flags & IFF_ALLMULTI) != 0); @@ -2373,14 +2397,19 @@ static void virtnet_set_rx_mode(struct net_device *dev) dev_warn(&dev->dev, "Failed to %sable allmulti mode.\n", vi->ctrl->allmulti ? 
"en" : "dis"); + netif_addr_lock_bh(dev); + uc_count = netdev_uc_count(dev); mc_count = netdev_mc_count(dev); /* MAC filter - use one buffer for both lists */ buf = kzalloc(((uc_count + mc_count) * ETH_ALEN) + (2 * sizeof(mac_data->entries)), GFP_ATOMIC); mac_data = buf; - if (!buf) + if (!buf) { + netif_addr_unlock_bh(dev); + rtnl_unlock(); return; + } sg_init_table(sg, 2); @@ -2401,6 +2430,8 @@ static void virtnet_set_rx_mode(struct net_device *dev) netdev_for_each_mc_addr(ha, dev) memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN); + netif_addr_unlock_bh(dev); + sg_set_buf(&sg[1], mac_data, sizeof(mac_data->entries) + (mc_count * ETH_ALEN)); @@ -2408,9 +2439,19 @@ static void virtnet_set_rx_mode(struct net_device *dev) VIRTIO_NET_CTRL_MAC_TABLE_SET, sg)) dev_warn(&dev->dev, "Failed to set MAC filter table.\n"); + rtnl_unlock(); + kfree(buf); } +static void virtnet_set_rx_mode(struct net_device *dev) +{ + struct virtnet_info *vi = netdev_priv(dev); + + if (vi->rx_mode_work_enabled) + schedule_work(&vi->rx_mode_work); +} + static int virtnet_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid) { @@ -3181,6 +3222,8 @@ static void virtnet_freeze_down(struct virtio_device *vdev) /* Make sure no work handler is accessing the device */ flush_work(&vi->config_work); + disable_rx_mode_work(vi); + flush_work(&vi->rx_mode_work); netif_tx_lock_bh(vi->dev); netif_device_detach(vi->dev); @@ -3203,6 +3246,7 @@ static int virtnet_restore_up(struct virtio_device *vdev) virtio_device_ready(vdev); enable_delayed_refill(vi); + enable_rx_mode_work(vi); if (netif_running(vi->dev)) { err = virtnet_open(vi->dev); @@ -4002,6 +4046,7 @@ static int virtnet_probe(struct virtio_device *vdev) vdev->priv = vi; INIT_WORK(&vi->config_work, virtnet_config_changed_work); +
[PATCH V3 net-next 2/2] virtio-net: add cond_resched() to the command waiting loop
Adding cond_resched() to the command waiting loop for better cooperation with the scheduler. This gives the CPU a chance to run other tasks (e.g. a workqueue) instead of busy looping when preemption is not allowed on a device whose CVQ might be slow. Signed-off-by: Jason Wang --- drivers/net/virtio_net.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 5d2f1da4eaa0..de498dbbf0d4 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -2207,8 +2207,10 @@ static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd, * into the hypervisor, so the request should be handled immediately. */ while (!virtqueue_get_buf(vi->cvq, &tmp) && - !virtqueue_is_broken(vi->cvq)) + !virtqueue_is_broken(vi->cvq)) { + cond_resched(); cpu_relax(); + } return vi->ctrl->status == VIRTIO_NET_OK; } -- 2.25.1
Re: [PATCH V2 4/5] vDPA/ifcvf: synchronize irqs in the reset routine
On Wed, May 24, 2023 at 4:03 PM Jason Wang wrote: > > On Mon, May 8, 2023 at 6:05 PM Zhu Lingshan wrote: > > > > This commit synchronize irqs of the virtqueues > > and config space in the reset routine. > > Thus ifcvf_stop_hw() and reset() are refactored as well. > > > > Signed-off-by: Zhu Lingshan > > --- > > drivers/vdpa/ifcvf/ifcvf_base.c | 41 + > > drivers/vdpa/ifcvf/ifcvf_base.h | 1 + > > drivers/vdpa/ifcvf/ifcvf_main.c | 46 + > > 3 files changed, 38 insertions(+), 50 deletions(-) > > > > diff --git a/drivers/vdpa/ifcvf/ifcvf_base.c > > b/drivers/vdpa/ifcvf/ifcvf_base.c > > index 79e313c5e10e..1f39290baa38 100644 > > --- a/drivers/vdpa/ifcvf/ifcvf_base.c > > +++ b/drivers/vdpa/ifcvf/ifcvf_base.c > > @@ -170,12 +170,9 @@ void ifcvf_set_status(struct ifcvf_hw *hw, u8 status) > > > > void ifcvf_reset(struct ifcvf_hw *hw) > > { > > - hw->config_cb.callback = NULL; > > - hw->config_cb.private = NULL; > > - > > ifcvf_set_status(hw, 0); > > - /* flush set_status, make sure VF is stopped, reset */ > > - ifcvf_get_status(hw); > > + while (ifcvf_get_status(hw)) > > + msleep(1); > > } > > > > u64 ifcvf_get_hw_features(struct ifcvf_hw *hw) > > @@ -368,20 +365,42 @@ void ifcvf_set_vq_ready(struct ifcvf_hw *hw, u16 qid, > > bool ready) > > vp_iowrite16(ready, &cfg->queue_enable); > > } > > > > -static void ifcvf_hw_disable(struct ifcvf_hw *hw) > > +static void ifcvf_reset_vring(struct ifcvf_hw *hw) > > { > > - u32 i; > > + u16 qid; > > + > > + for (qid = 0; qid < hw->nr_vring; qid++) { > > + hw->vring[qid].cb.callback = NULL; > > + hw->vring[qid].cb.private = NULL; > > + ifcvf_set_vq_vector(hw, qid, VIRTIO_MSI_NO_VECTOR); > > + } > > +} > > > > +static void ifcvf_reset_config_handler(struct ifcvf_hw *hw) > > +{ > > + hw->config_cb.callback = NULL; > > + hw->config_cb.private = NULL; > > ifcvf_set_config_vector(hw, VIRTIO_MSI_NO_VECTOR); > > - for (i = 0; i < hw->nr_vring; i++) { > > - ifcvf_set_vq_vector(hw, i, VIRTIO_MSI_NO_VECTOR); > > +} > > + > > +static void ifcvf_synchronize_irq(struct ifcvf_hw *hw) > > +{ > > + u32 nvectors = hw->num_msix_vectors; > > + struct pci_dev *pdev = hw->pdev; > > + int i, irq; > > + > > + for (i = 0; i < nvectors; i++) { > > + irq = pci_irq_vector(pdev, i); > > + if (irq >= 0) > > + synchronize_irq(irq); > > } > > } > > > > void ifcvf_stop_hw(struct ifcvf_hw *hw) > > { > > - ifcvf_hw_disable(hw); > > - ifcvf_reset(hw); > > + ifcvf_synchronize_irq(hw); > > + ifcvf_reset_vring(hw); > > + ifcvf_reset_config_handler(hw); > > Nit: > > So the name of this function is kind of misleading since irq > synchronization and virtqueue/config handler are not belong to > hardware? > > Maybe it would be better to call it ifcvf_stop(). I think we can tweak this on top. 
So Acked-by: Jason Wang Thanks > > Thanks > > > } > > > > void ifcvf_notify_queue(struct ifcvf_hw *hw, u16 qid) > > diff --git a/drivers/vdpa/ifcvf/ifcvf_base.h > > b/drivers/vdpa/ifcvf/ifcvf_base.h > > index d34d3bc0dbf4..7430f80779be 100644 > > --- a/drivers/vdpa/ifcvf/ifcvf_base.h > > +++ b/drivers/vdpa/ifcvf/ifcvf_base.h > > @@ -82,6 +82,7 @@ struct ifcvf_hw { > > int vqs_reused_irq; > > u16 nr_vring; > > /* VIRTIO_PCI_CAP_DEVICE_CFG size */ > > + u32 num_msix_vectors; > > u32 cap_dev_config_size; > > struct pci_dev *pdev; > > }; > > diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c > > b/drivers/vdpa/ifcvf/ifcvf_main.c > > index 968687159e44..3401b9901dd2 100644 > > --- a/drivers/vdpa/ifcvf/ifcvf_main.c > > +++ b/drivers/vdpa/ifcvf/ifcvf_main.c > > @@ -125,6 +125,7 @@ static void ifcvf_free_irq(struct ifcvf_hw *vf) > > ifcvf_free_vq_irq(vf); > > ifcvf_free_config_irq(vf); > > ifcvf_free_irq_vectors(pdev); > > + vf->num_msix_vectors = 0; > > } > > > > /* ifcvf MSIX vectors allocator, this helper tries to allocate > > @@ -343,36 +344,11 @@ static
Re: [PATCH V3 net-next 1/2] virtio-net: convert rx mode setting to use workqueue
On Wed, May 24, 2023 at 5:15 PM Michael S. Tsirkin wrote: > > On Wed, May 24, 2023 at 04:18:41PM +0800, Jason Wang wrote: > > This patch convert rx mode setting to be done in a workqueue, this is > > a must for allow to sleep when waiting for the cvq command to > > response since current code is executed under addr spin lock. > > > > Signed-off-by: Jason Wang > > --- > > Changes since V1: > > - use RTNL to synchronize rx mode worker > > --- > > drivers/net/virtio_net.c | 55 +--- > > 1 file changed, 52 insertions(+), 3 deletions(-) > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c > > index 56ca1d270304..5d2f1da4eaa0 100644 > > --- a/drivers/net/virtio_net.c > > +++ b/drivers/net/virtio_net.c > > @@ -265,6 +265,12 @@ struct virtnet_info { > > /* Work struct for config space updates */ > > struct work_struct config_work; > > > > + /* Work struct for config rx mode */ > > With a bit less abbreviation maybe? setting rx mode? That's fine. > > > + struct work_struct rx_mode_work; > > + > > + /* Is rx mode work enabled? */ > > Ugh not a great comment. Any suggestions for this. E.g we had: /* Is delayed refill enabled? */ > > > + bool rx_mode_work_enabled; > > + > > > > > /* Does the affinity hint is set for virtqueues? */ > > bool affinity_hint_set; > > > > @@ -388,6 +394,20 @@ static void disable_delayed_refill(struct virtnet_info > > *vi) > > spin_unlock_bh(&vi->refill_lock); > > } > > > > +static void enable_rx_mode_work(struct virtnet_info *vi) > > +{ > > + rtnl_lock(); > > + vi->rx_mode_work_enabled = true; > > + rtnl_unlock(); > > +} > > + > > +static void disable_rx_mode_work(struct virtnet_info *vi) > > +{ > > + rtnl_lock(); > > + vi->rx_mode_work_enabled = false; > > + rtnl_unlock(); > > +} > > + > > static void virtqueue_napi_schedule(struct napi_struct *napi, > > struct virtqueue *vq) > > { > > @@ -2341,9 +2361,11 @@ static int virtnet_close(struct net_device *dev) > > return 0; > > } > > > > -static void virtnet_set_rx_mode(struct net_device *dev) > > +static void virtnet_rx_mode_work(struct work_struct *work) > > { > > - struct virtnet_info *vi = netdev_priv(dev); > > + struct virtnet_info *vi = > > + container_of(work, struct virtnet_info, rx_mode_work); > > + struct net_device *dev = vi->dev; > > struct scatterlist sg[2]; > > struct virtio_net_ctrl_mac *mac_data; > > struct netdev_hw_addr *ha; > > @@ -2356,6 +2378,8 @@ static void virtnet_set_rx_mode(struct net_device > > *dev) > > if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_RX)) > > return; > > > > + rtnl_lock(); > > + > > vi->ctrl->promisc = ((dev->flags & IFF_PROMISC) != 0); > > vi->ctrl->allmulti = ((dev->flags & IFF_ALLMULTI) != 0); > > > > @@ -2373,14 +2397,19 @@ static void virtnet_set_rx_mode(struct net_device > > *dev) > > dev_warn(&dev->dev, "Failed to %sable allmulti mode.\n", > >vi->ctrl->allmulti ? 
"en" : "dis"); > > > > + netif_addr_lock_bh(dev); > > + > > uc_count = netdev_uc_count(dev); > > mc_count = netdev_mc_count(dev); > > /* MAC filter - use one buffer for both lists */ > > buf = kzalloc(((uc_count + mc_count) * ETH_ALEN) + > > (2 * sizeof(mac_data->entries)), GFP_ATOMIC); > > mac_data = buf; > > - if (!buf) > > + if (!buf) { > > + netif_addr_unlock_bh(dev); > > + rtnl_unlock(); > > return; > > + } > > > > sg_init_table(sg, 2); > > > > @@ -2401,6 +2430,8 @@ static void virtnet_set_rx_mode(struct net_device > > *dev) > > netdev_for_each_mc_addr(ha, dev) > > memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN); > > > > + netif_addr_unlock_bh(dev); > > + > > sg_set_buf(&sg[1], mac_data, > > sizeof(mac_data->entries) + (mc_count * ETH_ALEN)); > > > > @@ -2408,9 +2439,19 @@ static void virtnet_set_rx_mode(struct net_device > > *dev) > > V
Re: [PATCH V3 net-next 1/2] virtio-net: convert rx mode setting to use workqueue
On Thu, May 25, 2023 at 3:41 PM Michael S. Tsirkin wrote: > > On Thu, May 25, 2023 at 11:43:34AM +0800, Jason Wang wrote: > > On Wed, May 24, 2023 at 5:15 PM Michael S. Tsirkin wrote: > > > > > > On Wed, May 24, 2023 at 04:18:41PM +0800, Jason Wang wrote: > > > > This patch convert rx mode setting to be done in a workqueue, this is > > > > a must for allow to sleep when waiting for the cvq command to > > > > response since current code is executed under addr spin lock. > > > > > > > > Signed-off-by: Jason Wang > > > > --- > > > > Changes since V1: > > > > - use RTNL to synchronize rx mode worker > > > > --- > > > > drivers/net/virtio_net.c | 55 +--- > > > > 1 file changed, 52 insertions(+), 3 deletions(-) > > > > > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c > > > > index 56ca1d270304..5d2f1da4eaa0 100644 > > > > --- a/drivers/net/virtio_net.c > > > > +++ b/drivers/net/virtio_net.c > > > > @@ -265,6 +265,12 @@ struct virtnet_info { > > > > /* Work struct for config space updates */ > > > > struct work_struct config_work; > > > > > > > > + /* Work struct for config rx mode */ > > > > > > With a bit less abbreviation maybe? setting rx mode? > > > > That's fine. > > > > > > > > > + struct work_struct rx_mode_work; > > > > + > > > > + /* Is rx mode work enabled? */ > > > > > > Ugh not a great comment. > > > > Any suggestions for this. E.g we had: > > > > /* Is delayed refill enabled? */ > > /* OK to queue work setting RX mode? */ Ok. > > > > > > > > > + bool rx_mode_work_enabled; > > > > + > > > > > > > > > > > > > /* Does the affinity hint is set for virtqueues? */ > > > > bool affinity_hint_set; > > > > > > > > @@ -388,6 +394,20 @@ static void disable_delayed_refill(struct > > > > virtnet_info *vi) > > > > spin_unlock_bh(&vi->refill_lock); > > > > } > > > > > > > > +static void enable_rx_mode_work(struct virtnet_info *vi) > > > > +{ > > > > + rtnl_lock(); > > > > + vi->rx_mode_work_enabled = true; > > > > + rtnl_unlock(); > > > > +} > > > > + > > > > +static void disable_rx_mode_work(struct virtnet_info *vi) > > > > +{ > > > > + rtnl_lock(); > > > > + vi->rx_mode_work_enabled = false; > > > > + rtnl_unlock(); > > > > +} > > > > + > > > > static void virtqueue_napi_schedule(struct napi_struct *napi, > > > > struct virtqueue *vq) > > > > { > > > > @@ -2341,9 +2361,11 @@ static int virtnet_close(struct net_device *dev) > > > > return 0; > > > > } > > > > > > > > -static void virtnet_set_rx_mode(struct net_device *dev) > > > > +static void virtnet_rx_mode_work(struct work_struct *work) > > > > { > > > > - struct virtnet_info *vi = netdev_priv(dev); > > > > + struct virtnet_info *vi = > > > > + container_of(work, struct virtnet_info, rx_mode_work); > > > > + struct net_device *dev = vi->dev; > > > > struct scatterlist sg[2]; > > > > struct virtio_net_ctrl_mac *mac_data; > > > > struct netdev_hw_addr *ha; > > > > @@ -2356,6 +2378,8 @@ static void virtnet_set_rx_mode(struct net_device > > > > *dev) > > > > if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_RX)) > > > > return; > > > > > > > > + rtnl_lock(); > > > > + > > > > vi->ctrl->promisc = ((dev->flags & IFF_PROMISC) != 0); > > > > vi->ctrl->allmulti = ((dev->flags & IFF_ALLMULTI) != 0); > > > > > > > > @@ -2373,14 +2397,19 @@ static void virtnet_set_rx_mode(struct > > > > net_device *dev) > > > > dev_warn(&dev->dev, "Failed to %sable allmulti mode.\n", > > > >vi->ctrl->allmulti ? "en" : "dis"); > > > > > > > > + netif_addr_lock_b
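Condensed from the patch under review, the lifecycle the flag protects looks like this: ndo_set_rx_mode runs in atomic context and may only schedule the worker, while freeze/remove must forbid new work before flushing any in-flight instance. A sketch using the patch's own names:

/* Atomic context (ndo_set_rx_mode): may not sleep, so only schedule. */
static void virtnet_set_rx_mode_sketch(struct virtnet_info *vi)
{
	if (vi->rx_mode_work_enabled)	/* flag is flipped under RTNL */
		schedule_work(&vi->rx_mode_work);
}

/* Freeze/remove: forbid new work first, then wait out in-flight work. */
static void virtnet_quiesce_rx_mode_sketch(struct virtnet_info *vi)
{
	disable_rx_mode_work(vi);	/* takes rtnl_lock()/rtnl_unlock() */
	flush_work(&vi->rx_mode_work);	/* the worker may take RTNL and sleep */
}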
Re: [PATCH V2 4/5] vDPA/ifcvf: synchronize irqs in the reset routine
On Thu, May 25, 2023 at 5:38 PM Zhu, Lingshan wrote: > > > > On 5/24/2023 4:03 PM, Jason Wang wrote: > > On Mon, May 8, 2023 at 6:05 PM Zhu Lingshan wrote: > >> This commit synchronize irqs of the virtqueues > >> and config space in the reset routine. > >> Thus ifcvf_stop_hw() and reset() are refactored as well. > >> > >> Signed-off-by: Zhu Lingshan > >> --- > >> drivers/vdpa/ifcvf/ifcvf_base.c | 41 + > >> drivers/vdpa/ifcvf/ifcvf_base.h | 1 + > >> drivers/vdpa/ifcvf/ifcvf_main.c | 46 + > >> 3 files changed, 38 insertions(+), 50 deletions(-) > >> > >> diff --git a/drivers/vdpa/ifcvf/ifcvf_base.c > >> b/drivers/vdpa/ifcvf/ifcvf_base.c > >> index 79e313c5e10e..1f39290baa38 100644 > >> --- a/drivers/vdpa/ifcvf/ifcvf_base.c > >> +++ b/drivers/vdpa/ifcvf/ifcvf_base.c > >> @@ -170,12 +170,9 @@ void ifcvf_set_status(struct ifcvf_hw *hw, u8 status) > >> > >> void ifcvf_reset(struct ifcvf_hw *hw) > >> { > >> - hw->config_cb.callback = NULL; > >> - hw->config_cb.private = NULL; > >> - > >> ifcvf_set_status(hw, 0); > >> - /* flush set_status, make sure VF is stopped, reset */ > >> - ifcvf_get_status(hw); > >> + while (ifcvf_get_status(hw)) > >> + msleep(1); > >> } > >> > >> u64 ifcvf_get_hw_features(struct ifcvf_hw *hw) > >> @@ -368,20 +365,42 @@ void ifcvf_set_vq_ready(struct ifcvf_hw *hw, u16 > >> qid, bool ready) > >> vp_iowrite16(ready, &cfg->queue_enable); > >> } > >> > >> -static void ifcvf_hw_disable(struct ifcvf_hw *hw) > >> +static void ifcvf_reset_vring(struct ifcvf_hw *hw) > >> { > >> - u32 i; > >> + u16 qid; > >> + > >> + for (qid = 0; qid < hw->nr_vring; qid++) { > >> + hw->vring[qid].cb.callback = NULL; > >> + hw->vring[qid].cb.private = NULL; > >> + ifcvf_set_vq_vector(hw, qid, VIRTIO_MSI_NO_VECTOR); > >> + } > >> +} > >> > >> +static void ifcvf_reset_config_handler(struct ifcvf_hw *hw) > >> +{ > >> + hw->config_cb.callback = NULL; > >> + hw->config_cb.private = NULL; > >> ifcvf_set_config_vector(hw, VIRTIO_MSI_NO_VECTOR); > >> - for (i = 0; i < hw->nr_vring; i++) { > >> - ifcvf_set_vq_vector(hw, i, VIRTIO_MSI_NO_VECTOR); > >> +} > >> + > >> +static void ifcvf_synchronize_irq(struct ifcvf_hw *hw) > >> +{ > >> + u32 nvectors = hw->num_msix_vectors; > >> + struct pci_dev *pdev = hw->pdev; > >> + int i, irq; > >> + > >> + for (i = 0; i < nvectors; i++) { > >> + irq = pci_irq_vector(pdev, i); > >> + if (irq >= 0) > >> + synchronize_irq(irq); > >> } > >> } > >> > >> void ifcvf_stop_hw(struct ifcvf_hw *hw) > >> { > >> - ifcvf_hw_disable(hw); > >> - ifcvf_reset(hw); > >> + ifcvf_synchronize_irq(hw); > >> + ifcvf_reset_vring(hw); > >> + ifcvf_reset_config_handler(hw); > > Nit: > > > > So the name of this function is kind of misleading since irq > > synchronization and virtqueue/config handler are not belong to > > hardware? > > > > Maybe it would be better to call it ifcvf_stop(). > Sure, I will send a V3 with this renaming, > do you ack patch 1/5? Yes, I think I've acked to that patch. Thanks > > Thanks > Zhu Lingshan > > > > Thanks > > > >> } > >> > >> void ifcvf_notify_queue(struct ifcvf_hw *hw, u16 qid) > >> diff --git a/drivers/vdpa/ifcvf/ifcvf_base.h > >> b/drivers/vdpa/ifcvf/ifcvf_base.h > >> index d34d3bc0dbf4..7430f80779be 100644 > >> --- a/drivers/vdpa/ifcvf/ifcvf_base.h > >> +++ b/drivers/vdpa/ifcvf/ifcvf_base.h > >> @@ -82,6 +82,7 @@ struct ifcvf_hw { > >> int vqs_reused_irq; > >> u16 nr_vring; > >> /* VIRTIO_PCI_CAP_DEVICE_CFG size */ > >> + u32 num_msix_vectors; > >> u32 cap_dev_config_size; > >> struct pci_dev *pdev; > >> }; > >> diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c
Re: [PATCH V2 4/5] vDPA/ifcvf: synchronize irqs in the reset routine
On Fri, May 26, 2023 at 1:30 PM Zhu, Lingshan wrote: > > > > On 5/26/2023 11:36 AM, Zhu, Lingshan wrote: > > > > > > On 5/26/2023 9:34 AM, Jason Wang wrote: > >> On Thu, May 25, 2023 at 5:38 PM Zhu, Lingshan > >> wrote: > >>> > >>> > >>> On 5/24/2023 4:03 PM, Jason Wang wrote: > >>>> On Mon, May 8, 2023 at 6:05 PM Zhu Lingshan > >>>> wrote: > >>>>> This commit synchronize irqs of the virtqueues > >>>>> and config space in the reset routine. > >>>>> Thus ifcvf_stop_hw() and reset() are refactored as well. > >>>>> > >>>>> Signed-off-by: Zhu Lingshan > >>>>> --- > >>>>>drivers/vdpa/ifcvf/ifcvf_base.c | 41 + > >>>>>drivers/vdpa/ifcvf/ifcvf_base.h | 1 + > >>>>>drivers/vdpa/ifcvf/ifcvf_main.c | 46 > >>>>> + > >>>>>3 files changed, 38 insertions(+), 50 deletions(-) > >>>>> > >>>>> diff --git a/drivers/vdpa/ifcvf/ifcvf_base.c > >>>>> b/drivers/vdpa/ifcvf/ifcvf_base.c > >>>>> index 79e313c5e10e..1f39290baa38 100644 > >>>>> --- a/drivers/vdpa/ifcvf/ifcvf_base.c > >>>>> +++ b/drivers/vdpa/ifcvf/ifcvf_base.c > >>>>> @@ -170,12 +170,9 @@ void ifcvf_set_status(struct ifcvf_hw *hw, u8 > >>>>> status) > >>>>> > >>>>>void ifcvf_reset(struct ifcvf_hw *hw) > >>>>>{ > >>>>> - hw->config_cb.callback = NULL; > >>>>> - hw->config_cb.private = NULL; > >>>>> - > >>>>> ifcvf_set_status(hw, 0); > >>>>> - /* flush set_status, make sure VF is stopped, reset */ > >>>>> - ifcvf_get_status(hw); > >>>>> + while (ifcvf_get_status(hw)) > >>>>> + msleep(1); > >>>>>} > >>>>> > >>>>>u64 ifcvf_get_hw_features(struct ifcvf_hw *hw) > >>>>> @@ -368,20 +365,42 @@ void ifcvf_set_vq_ready(struct ifcvf_hw *hw, > >>>>> u16 qid, bool ready) > >>>>> vp_iowrite16(ready, &cfg->queue_enable); > >>>>>} > >>>>> > >>>>> -static void ifcvf_hw_disable(struct ifcvf_hw *hw) > >>>>> +static void ifcvf_reset_vring(struct ifcvf_hw *hw) > >>>>>{ > >>>>> - u32 i; > >>>>> + u16 qid; > >>>>> + > >>>>> + for (qid = 0; qid < hw->nr_vring; qid++) { > >>>>> + hw->vring[qid].cb.callback = NULL; > >>>>> + hw->vring[qid].cb.private = NULL; > >>>>> + ifcvf_set_vq_vector(hw, qid, VIRTIO_MSI_NO_VECTOR); > >>>>> + } > >>>>> +} > >>>>> > >>>>> +static void ifcvf_reset_config_handler(struct ifcvf_hw *hw) > >>>>> +{ > >>>>> + hw->config_cb.callback = NULL; > >>>>> + hw->config_cb.private = NULL; > >>>>> ifcvf_set_config_vector(hw, VIRTIO_MSI_NO_VECTOR); > >>>>> - for (i = 0; i < hw->nr_vring; i++) { > >>>>> - ifcvf_set_vq_vector(hw, i, VIRTIO_MSI_NO_VECTOR); > >>>>> +} > >>>>> + > >>>>> +static void ifcvf_synchronize_irq(struct ifcvf_hw *hw) > >>>>> +{ > >>>>> + u32 nvectors = hw->num_msix_vectors; > >>>>> + struct pci_dev *pdev = hw->pdev; > >>>>> + int i, irq; > >>>>> + > >>>>> + for (i = 0; i < nvectors; i++) { > >>>>> + irq = pci_irq_vector(pdev, i); > >>>>> + if (irq >= 0) > >>>>> + synchronize_irq(irq); > >>>>> } > >>>>>} > >>>>> > >>>>>void ifcvf_stop_hw(struct ifcvf_hw *hw) > >>>>>{ > >>>>> - ifcvf_hw_disable(hw); > >>>>> - ifcvf_reset(hw); > >>>>> + ifcvf_synchronize_irq(hw); > >>>>> + ifcvf_reset_vring(hw); > >>&
[PATCH] virtio_ring: validate used buffer length
This patch validates the used buffer length provided by the device before trying to use it. This is done by remembering the in buffer length in a dedicated array during virtqueue_add(), then we can fail virtqueue_get_buf() when we find the device trying to give us a used buffer length greater than what we stored before. This validation is disabled by default via a module parameter, to unbreak some existing devices, since some legacy devices are known to report buggy used lengths. Signed-off-by: Jason Wang --- Changes since V4: - drop the flag for driver to suppress the check - validation is disabled by default - don't do validation for legacy device - rebase and support virtqueue resize --- drivers/virtio/virtio_ring.c | 75 1 file changed, 75 insertions(+) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 143f380baa1c..5b151605aaf8 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -15,6 +15,9 @@ #include #include +static bool force_used_validation = false; +module_param(force_used_validation, bool, 0444); + #ifdef DEBUG /* For development, we want to crash whenever the ring is screwed. */ #define BAD_RING(_vq, fmt, args...)\ @@ -105,6 +108,9 @@ struct vring_virtqueue_split { struct vring_desc_state_split *desc_state; struct vring_desc_extra *desc_extra; + /* Maximum in buffer length, NULL means no used validation */ + u32 *buflen; + /* DMA address and size information */ dma_addr_t queue_dma_addr; size_t queue_size_in_bytes; @@ -145,6 +151,9 @@ struct vring_virtqueue_packed { struct vring_desc_state_packed *desc_state; struct vring_desc_extra *desc_extra; + /* Maximum in buffer length, NULL means no used validation */ + u32 *buflen; + /* DMA address and size information */ dma_addr_t ring_dma_addr; dma_addr_t driver_event_dma_addr; @@ -552,6 +561,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq, unsigned int i, n, avail, descs_used, prev, err_idx; int head; bool indirect; + u32 buflen = 0; START_USE(vq); @@ -635,6 +645,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq, VRING_DESC_F_NEXT | VRING_DESC_F_WRITE, indirect); + buflen += sg->length; } } /* Last one doesn't continue. */ @@ -675,6 +686,10 @@ static inline int virtqueue_add_split(struct virtqueue *_vq, else vq->split.desc_state[head].indir_desc = ctx; + /* Store in buffer length if necessary */ + if (vq->split.buflen) + vq->split.buflen[head] = buflen; + /* Put entry in available array (but don't update avail->idx until they * do sync). */ avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1); @@ -861,6 +876,11 @@ static void *virtqueue_get_buf_ctx_split(struct virtqueue *_vq, BAD_RING(vq, "id %u is not a head!\n", i); return NULL; } + if (vq->split.buflen && unlikely(*len > vq->split.buflen[i])) { + BAD_RING(vq, "used len %d is larger than max in buffer len %u\n", + *len, vq->split.buflen[i]); + return NULL; + } /* detach_buf_split clears data, so grab it now. */ ret = vq->split.desc_state[i].data; @@ -1085,10 +1105,25 @@ static void vring_free_split(struct vring_virtqueue_split *vring_split, vring_split->queue_dma_addr, dma_dev); + kfree(vring_split->buflen); kfree(vring_split->desc_state); kfree(vring_split->desc_extra); } +static bool vring_needs_used_validation(const struct virtio_device *vdev) +{ + /* +* Several legacy devices are known to produce buggy used +* length. In order to let driver work, we won't validate used +* buffer length in this case. 
+*/ + if (!virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) + return false; + if (force_used_validation) + return true; + return false; +} + static int vring_alloc_queue_split(struct vring_virtqueue_split *vring_split, struct virtio_device *vdev, u32 num, @@ -1137,7 +1172,19 @@ static int vring_alloc_queue_split(struct vring_virtqueue_split *vring_split, vring_split->vring_align = vring_align; vring_split->may_reduce_num = may_reduce_num; + if (vring_needs_used_validation(vdev)) { +
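For reference, the len being validated is the length field of the used-ring element the device writes back; the check simply bounds it by the writable space the driver queued for that head. A self-contained sketch, assuming a split ring:

#include <stdbool.h>
#include <stdint.h>

/* Layout of a split-ring used element (see include/uapi/linux/virtio_ring.h). */
struct vring_used_elem_sketch {
	uint32_t id;	/* head index of the used descriptor chain */
	uint32_t len;	/* device-reported number of bytes written */
};

/* The validation from the patch, reduced: reject device-reported lengths
 * larger than the writable space the driver actually posted for this head.
 */
static bool used_len_ok(const struct vring_used_elem_sketch *used,
			const uint32_t *buflen /* per-head max in buffer len */)
{
	return used->len <= buflen[used->id];
}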
Re: [PATCH net-next 1/5] virtio_net: Fix an unsafe reference to the page chain
On Fri, May 26, 2023 at 1:46 PM Liang Chen wrote: > > "private" of buffer page is currently used for big mode to chain pages. > But in mergeable mode, that offset of page could mean something else, > e.g. when page_pool page is used instead. So exclude mergeable mode to > avoid such a problem. If this issue happens only in the case of page_pool, it would be better to squash it there. Thanks > > Signed-off-by: Liang Chen > --- > drivers/net/virtio_net.c | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c > index 5a7f7a76b920..c5dca0d92e64 100644 > --- a/drivers/net/virtio_net.c > +++ b/drivers/net/virtio_net.c > @@ -497,7 +497,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info > *vi, > return NULL; > > page = (struct page *)page->private; > - if (page) > + if (!vi->mergeable_rx_bufs && page) > give_pages(rq, page); > goto ok; > } > -- > 2.31.1 >
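For context, this is roughly the big-mode chaining that overloading page->private would corrupt: virtio-net treats the private field as a next-pointer, and give_pages() splices a chain back onto the queue's free list (simplified from drivers/net/virtio_net.c):

/* Big mode chains RX pages through page->private, so reusing that
 * field in mergeable mode (e.g. for page_pool) would corrupt this list.
 */
static void give_pages(struct receive_queue *rq, struct page *page)
{
	struct page *end;

	/* Find end of list, sew whole thing into rq->pages. */
	for (end = page; end->private; end = (struct page *)end->private);
	end->private = (unsigned long)rq->pages;
	rq->pages = page;
}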
Re: [PATCH net-next 2/5] virtio_net: Add page_pool support to improve performance
On Fri, May 26, 2023 at 1:46 PM Liang Chen wrote: > > The implementation at the moment uses one page per packet in both the > normal and XDP path. It's better to explain why we need a page pool and how it can help the performance. > In addition, introducing a module parameter to enable > or disable the usage of page pool (disabled by default). If page pool wins for most of the cases, any reason to disable it by default? > > In single-core vm testing environments, it gives a modest performance gain > in the normal path. > Upstream codebase: 47.5 Gbits/sec > Upstream codebase + page_pool support: 50.2 Gbits/sec > > In multi-core vm testing environments, The most significant performance > gain is observed in XDP cpumap: > Upstream codebase: 1.38 Gbits/sec > Upstream codebase + page_pool support: 9.74 Gbits/sec Please show more details on the test. E.g which kinds of tests have you measured? Btw, it would be better to measure PPS as well. > > With this foundation, we can further integrate page pool fragmentation and > DMA map/unmap support. > > Signed-off-by: Liang Chen > --- > drivers/net/virtio_net.c | 188 ++- I believe we should make virtio-net to select CONFIG_PAGE_POOL or do the ifdef tricks at least. > 1 file changed, 146 insertions(+), 42 deletions(-) > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c > index c5dca0d92e64..99c0ca0c1781 100644 > --- a/drivers/net/virtio_net.c > +++ b/drivers/net/virtio_net.c > @@ -31,6 +31,9 @@ module_param(csum, bool, 0444); > module_param(gso, bool, 0444); > module_param(napi_tx, bool, 0644); > > +static bool page_pool_enabled; > +module_param(page_pool_enabled, bool, 0400); > + > /* FIXME: MTU in config. */ > #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN) > #define GOOD_COPY_LEN 128 > @@ -159,6 +162,9 @@ struct receive_queue { > /* Chain pages by the private ptr. */ > struct page *pages; > > + /* Page pool */ > + struct page_pool *page_pool; > + > /* Average packet length for mergeable receive buffers. */ > struct ewma_pkt_len mrg_avg_pkt_len; > > @@ -459,6 +465,14 @@ static struct sk_buff *virtnet_build_skb(void *buf, > unsigned int buflen, > return skb; > } > > +static void virtnet_put_page(struct receive_queue *rq, struct page *page) > +{ > + if (rq->page_pool) > + page_pool_put_full_page(rq->page_pool, page, true); > + else > + put_page(page); > +} > + > /* Called from bottom half context */ > static struct sk_buff *page_to_skb(struct virtnet_info *vi, >struct receive_queue *rq, > @@ -555,7 +569,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info > *vi, > hdr = skb_vnet_hdr(skb); > memcpy(hdr, hdr_p, hdr_len); > if (page_to_free) > - put_page(page_to_free); > + virtnet_put_page(rq, page_to_free); > > return skb; > } > @@ -802,7 +816,7 @@ static int virtnet_xdp_xmit(struct net_device *dev, > return ret; > } > > -static void put_xdp_frags(struct xdp_buff *xdp) > +static void put_xdp_frags(struct xdp_buff *xdp, struct receive_queue *rq) > { rq could be fetched from xdp_rxq_info? 
> struct skb_shared_info *shinfo; > struct page *xdp_page; > @@ -812,7 +826,7 @@ static void put_xdp_frags(struct xdp_buff *xdp) > shinfo = xdp_get_shared_info_from_buff(xdp); > for (i = 0; i < shinfo->nr_frags; i++) { > xdp_page = skb_frag_page(&shinfo->frags[i]); > - put_page(xdp_page); > + virtnet_put_page(rq, xdp_page); > } > } > } > @@ -903,7 +917,11 @@ static struct page *xdp_linearize_page(struct > receive_queue *rq, > if (page_off + *len + tailroom > PAGE_SIZE) > return NULL; > > - page = alloc_page(GFP_ATOMIC); > + if (rq->page_pool) > + page = page_pool_dev_alloc_pages(rq->page_pool); > + else > + page = alloc_page(GFP_ATOMIC); > + > if (!page) > return NULL; > > @@ -926,21 +944,24 @@ static struct page *xdp_linearize_page(struct > receive_queue *rq, > * is sending packet larger than the MTU. > */ > if ((page_off + buflen + tailroom) > PAGE_SIZE) { > - put_page(p); > + virtnet_put_page(rq, p); > goto err_buf; > } > > memcpy(page_address(page) + page_off, >page_address(p) + off, buflen); > page_off += buflen; > - put_page(p); > + virtnet_put_page(rq, p); > } > > /* Headroom does not contribute to packet length */ > *len = page_off - VIRTIO_XDP_HEADROOM; > return page; > err_buf: > - __free_pages
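On the reviewer's question above about fetching rq from xdp_rxq_info: struct receive_queue embeds the xdp_rxq_info that virtnet_open() registers, and xdp->rxq points at it, so the queue can be recovered without the extra parameter. A minimal sketch; xdp_rxq_to_rq() is a hypothetical helper name:

    /* Hypothetical helper: recover the owning receive_queue from the
     * xdp_rxq_info embedded in it, so put_xdp_frags(xdp) could keep
     * its original one-argument signature. */
    static struct receive_queue *xdp_rxq_to_rq(struct xdp_rxq_info *rxq)
    {
            return container_of(rxq, struct receive_queue, xdp_rxq);
    }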
Re: [PATCH net-next 4/5] virtio_ring: Introduce DMA pre-handler
On Fri, May 26, 2023 at 1:47 PM Liang Chen wrote: > > Currently, DMA operations of virtio devices' data buffer are encapsulated > within the underlying virtqueue implementation. DMA map/unmap operations > are performed for each data buffer attached to/detached from the virtqueue, > which is transparent and invisible to the higher-level virtio device > drivers. This encapsulation makes it not viable for device drivers to > introduce certain mechanisms, such as page pool, that require explicit > management of DMA map/unmap. Therefore, by inserting a pre-handler before > the generic DMA map/unmap operations, virtio device drivers have the > opportunity to participate in DMA operations. > > Signed-off-by: Liang Chen So Xuan is doing AF_XDP for the virtio-net that allows the DMA to be mapped at least by the virtio-net. It looks like a way to allow virtio-net to map and unmap the DMA buffer by itself, but this patch goes into another way which seems to query the address from the virtio core. Personally, I think map and sync by the virtio-net driver seems clean. But we can see. Thanks
Re: [PATCH net-next 5/5] virtio_net: Implement DMA pre-handler
On Fri, May 26, 2023 at 1:47 PM Liang Chen wrote: > > Adding a DMA pre-handler that utilizes page pool for managing DMA mappings. > When IOMMU is enabled, turning on the page_pool_dma_map module parameter to > select page pool for DMA mapping management gives a significant reduction > in the overhead caused by DMA mappings. > > In testing environments with a single core vm and qemu emulated IOMMU, > significant performance improvements can be observed: > Upstream codebase: 1.76 Gbits/sec > Upstream codebase with page pool fragmentation support: 1.81 Gbits/sec > Upstream codebase with page pool fragmentation and DMA support: 19.3 > Gbits/sec > > Signed-off-by: Liang Chen > --- > drivers/net/virtio_net.c | 55 > 1 file changed, 55 insertions(+) > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c > index ac40b8c66c59..73cc4f9fe4fa 100644 > --- a/drivers/net/virtio_net.c > +++ b/drivers/net/virtio_net.c > @@ -22,6 +22,7 @@ > #include > #include > #include > +#include > > static int napi_weight = NAPI_POLL_WEIGHT; > module_param(napi_weight, int, 0444); > @@ -33,8 +34,10 @@ module_param(napi_tx, bool, 0644); > > static bool page_pool_enabled; > static bool page_pool_frag; > +static bool page_pool_dma_map; > module_param(page_pool_enabled, bool, 0400); > module_param(page_pool_frag, bool, 0400); > +module_param(page_pool_dma_map, bool, 0400); > > /* FIXME: MTU in config. */ > #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN) > @@ -3830,6 +3833,49 @@ static void virtnet_del_vqs(struct virtnet_info *vi) > virtnet_free_queues(vi); > } > > +static dma_addr_t virtnet_pp_dma_map_page(struct device *dev, struct page > *page, > + unsigned long offset, size_t size, > + enum dma_data_direction dir, > unsigned long attrs) > +{ > + struct page *head_page; > + > + if (dir != DMA_FROM_DEVICE) > + return 0; > + > + head_page = compound_head(page); > + return page_pool_get_dma_addr(head_page) > + + (page - head_page) * PAGE_SIZE > + + offset; So it's not a map, it is just a query from the dma address from the pool. > +} > + > +static bool virtnet_pp_dma_unmap_page(struct device *dev, dma_addr_t > dma_handle, > + size_t size, enum dma_data_direction > dir, > + unsigned long attrs) > +{ > + phys_addr_t phys; > + > + /* Handle only the RX direction, and sync the DMA memory only if it's > not > +* a DMA coherent architecture. > +*/ > + if (dir != DMA_FROM_DEVICE) > + return false; > + > + if (dev_is_dma_coherent(dev)) > + return true; > + > + phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle); This would be somehow slow. If we track the mapping by driver, it would be much faster. More could be seen here: https://lists.linuxfoundation.org/pipermail/virtualization/2023-May/066778.html Thanks > + if (WARN_ON(!phys)) > + return false; > + > + arch_sync_dma_for_cpu(phys, size, dir); > + return true; > +} > + > +static struct virtqueue_pre_dma_ops virtnet_pp_pre_dma_ops = { > + .map_page = virtnet_pp_dma_map_page, > + .unmap_page = virtnet_pp_dma_unmap_page, > +}; > + > static void virtnet_alloc_page_pool(struct receive_queue *rq) > { > struct virtio_device *vdev = rq->vq->vdev; > @@ -3845,6 +3891,15 @@ static void virtnet_alloc_page_pool(struct > receive_queue *rq) > if (page_pool_frag) > pp_params.flags |= PP_FLAG_PAGE_FRAG; > > + /* Consider using page pool DMA support only when DMA API is used. 
*/ > + if (virtio_has_feature(vdev, VIRTIO_F_ACCESS_PLATFORM) && > + page_pool_dma_map) { > + pp_params.flags |= PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV; > + pp_params.dma_dir = DMA_FROM_DEVICE; > + pp_params.max_len = PAGE_SIZE << pp_params.order; > + virtqueue_register_pre_dma_ops(rq->vq, > &virtnet_pp_pre_dma_ops); > + } > + > rq->page_pool = page_pool_create(&pp_params); > if (IS_ERR(rq->page_pool)) { > dev_warn(&vdev->dev, "page pool creation failed: %ld\n", > -- > 2.31.1
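For context on the "it is just a query" remark above: with PP_FLAG_DMA_MAP set at pool creation, the page_pool core maps each page once when it enters the pool and caches the address, so the consumer only reads it back. A sketch under that assumption; virtnet_pp_alloc_mapped() is a hypothetical helper:

    /* Hypothetical helper: allocate a pool page that is already DMA
     * mapped and read back the cached address, instead of calling
     * dma_map_page() per buffer. */
    static int virtnet_pp_alloc_mapped(struct receive_queue *rq,
                                       struct page **pagep, dma_addr_t *dmap)
    {
            struct page *page = page_pool_dev_alloc_pages(rq->page_pool);

            if (!page)
                    return -ENOMEM;
            *pagep = page;
            *dmap = page_pool_get_dma_addr(page); /* cached by the pool */
            return 0;
    }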
Re: [PATCH] virtio_ring: validate used buffer length
On Sun, May 28, 2023 at 3:57 PM Michael S. Tsirkin wrote: > > On Fri, May 26, 2023 at 02:30:41PM +0800, Jason Wang wrote: > > This patch validate > > validates > > > the used buffer length provided by the device > > before trying to use it. > > before returning it to caller > > > This is done by remembering the in buffer > > length in a dedicated array during virtqueue_add(), then we can fail > > the virtqueue_get_buf() when we find the device is trying to give us a > > used buffer length which is greater than we stored before. > > than what we stored > > > > > This validation is disable > > disabled > > > by default via module parameter to unbreak > > some existing devices since some legacy devices are known to report > > buggy used length. > > > > Signed-off-by: Jason Wang > > First I'm not merging this without more data about > what is known to be broken and what is known to work well > in the commit log. And how exactly do things work if used length > is wrong? Assuming the device is malicious, it would be very hard to answer. Auditing and fuzzing won't cover every case. Instead of trying to seek the answer, we can simply make sure the used in buffer length is validated then we know we're fine or not. > Second what's wrong with dma_desc_extra that we already maintain? > Third motivation - it's part and parcel of the hardening effort yes? They are different. dma_desc_extra is for a descriptor ring, but this is for a used ring. Technically we can go back to iterate on the descriptor ring for a legal used in buffer length. But it will have worse performance. > I'd like to know the fate of VIRTIO_HARDEN_NOTIFICATION before > we do more hardening. If it's irrevocably broken let's rip it out? So the plan is 1) finish used ring validation (this had been proposed, merged and reverted before notification hardening) 2) do notification hardening on top. So let's leave it as is and I will do a rework after we finalize the used ring validation. Thanks > > > > --- > > Changes since V4: > > - drop the flat for driver to suppress the check > > - validation is disabled by default > > - don't do validation for legacy device > > - rebase and support virtqueue resize > > --- > > drivers/virtio/virtio_ring.c | 75 > > 1 file changed, 75 insertions(+) > > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c > > index 143f380baa1c..5b151605aaf8 100644 > > --- a/drivers/virtio/virtio_ring.c > > +++ b/drivers/virtio/virtio_ring.c > > @@ -15,6 +15,9 @@ > > #include > > #include > > > > +static bool force_used_validation = false; > > +module_param(force_used_validation, bool, 0444); > > + > > #ifdef DEBUG > > /* For development, we want to crash whenever the ring is screwed. */ > > #define BAD_RING(_vq, fmt, args...) 
\ > > @@ -105,6 +108,9 @@ struct vring_virtqueue_split { > > struct vring_desc_state_split *desc_state; > > struct vring_desc_extra *desc_extra; > > > > + /* Maximum in buffer length, NULL means no used validation */ > > + u32 *buflen; > > + > > /* DMA address and size information */ > > dma_addr_t queue_dma_addr; > > size_t queue_size_in_bytes; > > @@ -145,6 +151,9 @@ struct vring_virtqueue_packed { > > struct vring_desc_state_packed *desc_state; > > struct vring_desc_extra *desc_extra; > > > > + /* Maximum in buffer length, NULL means no used validation */ > > + u32 *buflen; > > + > > /* DMA address and size information */ > > dma_addr_t ring_dma_addr; > > dma_addr_t driver_event_dma_addr; > > @@ -552,6 +561,7 @@ static inline int virtqueue_add_split(struct virtqueue > > *_vq, > > unsigned int i, n, avail, descs_used, prev, err_idx; > > int head; > > bool indirect; > > + u32 buflen = 0; > > > > START_USE(vq); > > > > @@ -635,6 +645,7 @@ static inline int virtqueue_add_split(struct virtqueue > > *_vq, > >VRING_DESC_F_NEXT | > >VRING_DESC_F_WRITE, > >indirect); > > + buflen += sg->length; > > } > > } > > /* Last one doesn't continue. */ > > @@ -675,6 +686,10 @@ static inline int virtqueue_add_split(struct virtqueue > >
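The quoted diff breaks off above, but the consume-side check it builds toward would look roughly like this (a sketch based on the commit description, not the verbatim hunk; it assumes the buflen array added to vring_virtqueue_split above):

    /* sketch: in virtqueue_get_buf_ctx_split(), after reading the used
     * element (i is the used id, *len the used length reported by the
     * device) */
    if (vq->split.buflen && unlikely(*len > vq->split.buflen[i])) {
            BAD_RING(vq, "used len %u exceeds in buflen %u\n",
                     *len, vq->split.buflen[i]);
            return NULL;
    }

A NULL buflen array is how "no used validation" is encoded, matching the struct comments in the diff.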
Re: [PATCH V3 net-next 1/2] virtio-net: convert rx mode setting to use workqueue
On Sun, May 28, 2023 at 7:39 PM Michael S. Tsirkin wrote: > > On Fri, May 26, 2023 at 09:31:34AM +0800, Jason Wang wrote: > > On Thu, May 25, 2023 at 3:41 PM Michael S. Tsirkin wrote: > > > > > > On Thu, May 25, 2023 at 11:43:34AM +0800, Jason Wang wrote: > > > > On Wed, May 24, 2023 at 5:15 PM Michael S. Tsirkin > > > > wrote: > > > > > > > > > > On Wed, May 24, 2023 at 04:18:41PM +0800, Jason Wang wrote: > > > > > > This patch convert rx mode setting to be done in a workqueue, this > > > > > > is > > > > > > a must for allow to sleep when waiting for the cvq command to > > > > > > response since current code is executed under addr spin lock. > > > > > > > > > > > > Signed-off-by: Jason Wang > > > > > > --- > > > > > > Changes since V1: > > > > > > - use RTNL to synchronize rx mode worker > > > > > > --- > > > > > > drivers/net/virtio_net.c | 55 > > > > > > +--- > > > > > > 1 file changed, 52 insertions(+), 3 deletions(-) > > > > > > > > > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c > > > > > > index 56ca1d270304..5d2f1da4eaa0 100644 > > > > > > --- a/drivers/net/virtio_net.c > > > > > > +++ b/drivers/net/virtio_net.c > > > > > > @@ -265,6 +265,12 @@ struct virtnet_info { > > > > > > /* Work struct for config space updates */ > > > > > > struct work_struct config_work; > > > > > > > > > > > > + /* Work struct for config rx mode */ > > > > > > > > > > With a bit less abbreviation maybe? setting rx mode? > > > > > > > > That's fine. > > > > > > > > > > > > > > > + struct work_struct rx_mode_work; > > > > > > + > > > > > > + /* Is rx mode work enabled? */ > > > > > > > > > > Ugh not a great comment. > > > > > > > > Any suggestions for this. E.g we had: > > > > > > > > /* Is delayed refill enabled? */ > > > > > > /* OK to queue work setting RX mode? */ > > > > Ok. > > > > > > > > > > > > > > > > > > > + bool rx_mode_work_enabled; > > > > > > + > > > > > > > > > > > > > > > > > > > > > /* Does the affinity hint is set for virtqueues? */ > > > > > > bool affinity_hint_set; > > > > > > > > > > > > @@ -388,6 +394,20 @@ static void disable_delayed_refill(struct > > > > > > virtnet_info *vi) > > > > > > spin_unlock_bh(&vi->refill_lock); > > > > > > } > > > > > > > > > > > > +static void enable_rx_mode_work(struct virtnet_info *vi) > > > > > > +{ > > > > > > + rtnl_lock(); > > > > > > + vi->rx_mode_work_enabled = true; > > > > > > + rtnl_unlock(); > > > > > > +} > > > > > > + > > > > > > +static void disable_rx_mode_work(struct virtnet_info *vi) > > > > > > +{ > > > > > > + rtnl_lock(); > > > > > > + vi->rx_mode_work_enabled = false; > > > > > > + rtnl_unlock(); > > > > > > +} > > > > > > + > > > > > > static void virtqueue_napi_schedule(struct napi_struct *napi, > > > > > > struct virtqueue *vq) > > > > > > { > > > > > > @@ -2341,9 +2361,11 @@ static int virtnet_close(struct net_device > > > > > > *dev) > > > > > > return 0; > > > > > > } > > > > > > > > > > > > -static void virtnet_set_rx_mode(struct net_device *dev) > > > > > > +static void virtnet_rx_mode_work(struct work_struct *work) > > > > > > { > > > > > > - struct virtnet_info *vi = netdev_priv(dev); > > > > > > + struct virtnet_info *vi = > > > > > > + container_of(work, struct vir
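The quoted hunk is cut off above; the shape of the deferral it implements is roughly the following (a sketch based on the patch description, not the verbatim code):

    /* sketch: .ndo_set_rx_mode runs under the netdev addr spin lock and
     * must not sleep, so it only queues the work; the worker then sends
     * the cvq commands from a context that may sleep */
    static void virtnet_set_rx_mode(struct net_device *dev)
    {
            struct virtnet_info *vi = netdev_priv(dev);

            /* the flag flips under RTNL in enable/disable_rx_mode_work() */
            if (vi->rx_mode_work_enabled)
                    schedule_work(&vi->rx_mode_work);
    }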
Re: [PATCH] virtio_ring: validate used buffer length
On Mon, May 29, 2023 at 6:03 PM Michael S. Tsirkin wrote: > > On Mon, May 29, 2023 at 09:18:10AM +0800, Jason Wang wrote: > > On Sun, May 28, 2023 at 3:57 PM Michael S. Tsirkin wrote: > > > > > > On Fri, May 26, 2023 at 02:30:41PM +0800, Jason Wang wrote: > > > > This patch validate > > > > > > validates > > > > > > > the used buffer length provided by the device > > > > before trying to use it. > > > > > > before returning it to caller > > > > > > > This is done by remembering the in buffer > > > > length in a dedicated array during virtqueue_add(), then we can fail > > > > the virtqueue_get_buf() when we find the device is trying to give us a > > > > used buffer length which is greater than we stored before. > > > > > > than what we stored > > > > > > > > > > > This validation is disable > > > > > > disabled > > > > > > > by default via module parameter to unbreak > > > > some existing devices since some legacy devices are known to report > > > > buggy used length. > > > > > > > > Signed-off-by: Jason Wang > > > > > > First I'm not merging this without more data about > > > what is known to be broken and what is known to work well > > > in the commit log. And how exactly do things work if used length > > > is wrong? > > > > Assuming the device is malicious, it would be very hard to answer. > > Auditing and fuzzing won't cover every case. Instead of trying to seek > > the answer, we can simply make sure the used in buffer length is > > validated then we know we're fine or not. > > To restate the question, you said above "some legacy devices are known > to report buggy used length". If they report buggy length then how > can things work? The validation is disabled for legacy device (as stated in the changelog): static bool vring_needs_used_validation(const struct virtio_device *vdev) { /* * Several legacy devices are known to produce buggy used * length. In order to let driver work, we won't validate used * buffer length in this case. */ if (!virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) return false; if (force_used_validation) return true; return false; } This seems to be what we've agreed in last version: https://lore.kernel.org/all/canlsykxfhamuu0bb4j7y6n4_g9odkxlcjxxgxex4sj6_kf+...@mail.gmail.com/T/#m31f3b06f9032beec175c312dfa2532cb08b15c56 Thanks > > > > Second what's wrong with dma_desc_extra that we already maintain? > > > Third motivation - it's part and parcel of the hardening effort yes? > > > > They are different. dma_desc_extra is for a descriptor ring, but this > > is for a used ring. Technically we can go back to iterate on the > > descriptor ring for a legal used in buffer length. But it will have > > worse performance. > > I don't really understand. We already iterate when we unmap - > all that is necessary is to subtract it from used length, if at > the end of the process it is >0 then we know used length is too > large. Yes, but it is the job that is done in the driver level not the virtio core. Validation in virtio core is still necessary since they're working at different levels and it's hard to force the validation in all drivers by codes. Last version introduces a suppress_driver_validation to allow the driver to suppress the core validation which seems not good, we need a way to force the virtio_ring code to do validation before. Or such stuff could be added on top since the validation is by default anyway. Thanks > > > > > I'd like to know the fate of VIRTIO_HARDEN_NOTIFICATION before > > > we do more hardening. If it's irrevocably broken let's rip it out? 
> > > > So the plan is > > > > 1) finish used ring validation (this had been proposed, merged and > > reverted before notification hardening) > > 2) do notification hardening on top. > > > > So let's leave it as is and I will do a rework after we finalize the > > used ring validation. > > > > Thanks > > > > > > > > > > > > --- > > > > Changes since V4: > > > > - drop the flat for driver to suppress the check > > > > - validation is disabled by default > > > > - don't do validation for legacy device > > > > - rebase and support virtqueue resize > > > > --- > > > >
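For completeness: since the parameter is 0444 it cannot be toggled at runtime, and virtio_ring is typically built in, so opting in to the check usually means booting with "virtio_ring.force_used_validation=1" on the kernel command line (or "modprobe virtio_ring force_used_validation=1" for a modular build).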
Re: [PATCH V3 net-next 1/2] virtio-net: convert rx mode setting to use workqueue
On Mon, May 29, 2023 at 9:21 AM Jason Wang wrote: > > On Sun, May 28, 2023 at 7:39 PM Michael S. Tsirkin wrote: > > > > On Fri, May 26, 2023 at 09:31:34AM +0800, Jason Wang wrote: > > > On Thu, May 25, 2023 at 3:41 PM Michael S. Tsirkin > > > wrote: > > > > > > > > On Thu, May 25, 2023 at 11:43:34AM +0800, Jason Wang wrote: > > > > > On Wed, May 24, 2023 at 5:15 PM Michael S. Tsirkin > > > > > wrote: > > > > > > > > > > > > On Wed, May 24, 2023 at 04:18:41PM +0800, Jason Wang wrote: > > > > > > > This patch convert rx mode setting to be done in a workqueue, > > > > > > > this is > > > > > > > a must for allow to sleep when waiting for the cvq command to > > > > > > > response since current code is executed under addr spin lock. > > > > > > > > > > > > > > Signed-off-by: Jason Wang > > > > > > > --- > > > > > > > Changes since V1: > > > > > > > - use RTNL to synchronize rx mode worker > > > > > > > --- > > > > > > > drivers/net/virtio_net.c | 55 > > > > > > > +--- > > > > > > > 1 file changed, 52 insertions(+), 3 deletions(-) > > > > > > > > > > > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c > > > > > > > index 56ca1d270304..5d2f1da4eaa0 100644 > > > > > > > --- a/drivers/net/virtio_net.c > > > > > > > +++ b/drivers/net/virtio_net.c > > > > > > > @@ -265,6 +265,12 @@ struct virtnet_info { > > > > > > > /* Work struct for config space updates */ > > > > > > > struct work_struct config_work; > > > > > > > > > > > > > > + /* Work struct for config rx mode */ > > > > > > > > > > > > With a bit less abbreviation maybe? setting rx mode? > > > > > > > > > > That's fine. > > > > > > > > > > > > > > > > > > + struct work_struct rx_mode_work; > > > > > > > + > > > > > > > + /* Is rx mode work enabled? */ > > > > > > > > > > > > Ugh not a great comment. > > > > > > > > > > Any suggestions for this. E.g we had: > > > > > > > > > > /* Is delayed refill enabled? */ > > > > > > > > /* OK to queue work setting RX mode? */ > > > > > > Ok. > > > > > > > > > > > > > > > > > > > > > > > > + bool rx_mode_work_enabled; > > > > > > > + > > > > > > > > > > > > > > > > > > > > > > > > > /* Does the affinity hint is set for virtqueues? */ > > > > > > > bool affinity_hint_set; > > > > > > > > > > > > > > @@ -388,6 +394,20 @@ static void disable_delayed_refill(struct > > > > > > > virtnet_info *vi) > > > > > > > spin_unlock_bh(&vi->refill_lock); > > > > > > > } > > > > > > > > > > > > > > +static void enable_rx_mode_work(struct virtnet_info *vi) > > > > > > > +{ > > > > > > > + rtnl_lock(); > > > > > > > + vi->rx_mode_work_enabled = true; > > > > > > > + rtnl_unlock(); > > > > > > > +} > > > > > > > + > > > > > > > +static void disable_rx_mode_work(struct virtnet_info *vi) > > > > > > > +{ > > > > > > > + rtnl_lock(); > > > > > > > + vi->rx_mode_work_enabled = false; > > > > > > > + rtnl_unlock(); > > > > > > > +} > > > > > > > + > > > > > > > static void virtqueue_napi_schedule(struct napi_struct *napi, > > > > > > > struct virtqueue *vq) > > > > > > > { > > > > > > > @@ -2341,9 +2361,11 @@ static int virtnet_close(struct net_device > > > > > > > *dev) > > > > > > > return 0; > > > > > > &g
Re: [PATCH v7 virtio 09/11] pds_vdpa: add support for vdpa and vdpamgmt interfaces
On Sat, May 20, 2023 at 5:57 AM Shannon Nelson wrote: > > This is the vDPA device support, where we advertise that we can > support the virtio queues and deal with the configuration work > through the pds_core's adminq. > > Signed-off-by: Shannon Nelson > --- > > Note: this had previously been Acked-by Jason Wang, but changed enough > in v6 that I felt it needs a new Ack. Acked-by: Jason Wang Thanks > > drivers/vdpa/pds/aux_drv.c | 15 + > drivers/vdpa/pds/aux_drv.h | 1 + > drivers/vdpa/pds/debugfs.c | 263 > drivers/vdpa/pds/debugfs.h | 5 + > drivers/vdpa/pds/vdpa_dev.c | 606 +++- > drivers/vdpa/pds/vdpa_dev.h | 4 +- > 6 files changed, 892 insertions(+), 2 deletions(-) > > diff --git a/drivers/vdpa/pds/aux_drv.c b/drivers/vdpa/pds/aux_drv.c > index 0c4a135b1484..186e9ee22eb1 100644 > --- a/drivers/vdpa/pds/aux_drv.c > +++ b/drivers/vdpa/pds/aux_drv.c > @@ -63,8 +63,21 @@ static int pds_vdpa_probe(struct auxiliary_device *aux_dev, > goto err_free_mgmt_info; > } > > + /* Let vdpa know that we can provide devices */ > + err = vdpa_mgmtdev_register(&vdpa_aux->vdpa_mdev); > + if (err) { > + dev_err(dev, "%s: Failed to initialize vdpa_mgmt interface: > %pe\n", > + __func__, ERR_PTR(err)); > + goto err_free_virtio; > + } > + > + pds_vdpa_debugfs_add_pcidev(vdpa_aux); > + pds_vdpa_debugfs_add_ident(vdpa_aux); > + > return 0; > > +err_free_virtio: > + vp_modern_remove(&vdpa_aux->vd_mdev); > err_free_mgmt_info: > pci_free_irq_vectors(padev->vf_pdev); > err_free_mem: > @@ -79,9 +92,11 @@ static void pds_vdpa_remove(struct auxiliary_device > *aux_dev) > struct pds_vdpa_aux *vdpa_aux = auxiliary_get_drvdata(aux_dev); > struct device *dev = &aux_dev->dev; > > + vdpa_mgmtdev_unregister(&vdpa_aux->vdpa_mdev); > vp_modern_remove(&vdpa_aux->vd_mdev); > pci_free_irq_vectors(vdpa_aux->padev->vf_pdev); > > + pds_vdpa_debugfs_del_vdpadev(vdpa_aux); > kfree(vdpa_aux); > auxiliary_set_drvdata(aux_dev, NULL); > > diff --git a/drivers/vdpa/pds/aux_drv.h b/drivers/vdpa/pds/aux_drv.h > index 99e0ff340bfa..26b75344156e 100644 > --- a/drivers/vdpa/pds/aux_drv.h > +++ b/drivers/vdpa/pds/aux_drv.h > @@ -13,6 +13,7 @@ struct pds_vdpa_aux { > struct pds_auxiliary_dev *padev; > > struct vdpa_mgmt_dev vdpa_mdev; > + struct pds_vdpa_device *pdsv; > > struct pds_vdpa_ident ident; > > diff --git a/drivers/vdpa/pds/debugfs.c b/drivers/vdpa/pds/debugfs.c > index d91dceb07380..21a0dc0cb607 100644 > --- a/drivers/vdpa/pds/debugfs.c > +++ b/drivers/vdpa/pds/debugfs.c > @@ -10,6 +10,7 @@ > #include > > #include "aux_drv.h" > +#include "vdpa_dev.h" > #include "debugfs.h" > > static struct dentry *dbfs_dir; > @@ -24,3 +25,265 @@ void pds_vdpa_debugfs_destroy(void) > debugfs_remove_recursive(dbfs_dir); > dbfs_dir = NULL; > } > + > +#define PRINT_SBIT_NAME(__seq, __f, __name) \ > + do {\ > + if ((__f) & (__name)) \ > + seq_printf(__seq, " %s", &#__name[16]); \ > + } while (0) > + > +static void print_status_bits(struct seq_file *seq, u8 status) > +{ > + seq_puts(seq, "status:"); > + PRINT_SBIT_NAME(seq, status, VIRTIO_CONFIG_S_ACKNOWLEDGE); > + PRINT_SBIT_NAME(seq, status, VIRTIO_CONFIG_S_DRIVER); > + PRINT_SBIT_NAME(seq, status, VIRTIO_CONFIG_S_DRIVER_OK); > + PRINT_SBIT_NAME(seq, status, VIRTIO_CONFIG_S_FEATURES_OK); > + PRINT_SBIT_NAME(seq, status, VIRTIO_CONFIG_S_NEEDS_RESET); > + PRINT_SBIT_NAME(seq, status, VIRTIO_CONFIG_S_FAILED); > + seq_puts(seq, "\n"); > +} > + > +static void print_feature_bits_all(struct seq_file *seq, u64 features) > +{ > + int i; > + > + seq_puts(seq, "features:"); > + > + for (i = 0; i < (sizeof(u64) * 8); 
i++) { > + u64 mask = BIT_ULL(i); > + > + switch (features & mask) { > + case BIT_ULL(VIRTIO_NET_F_CSUM): > + seq_puts(seq, " VIRTIO_NET_F_CSUM"); > + break; > + case BIT_ULL(VIRTIO_NET_F_GUEST_CSUM): > + seq_puts(seq, " VIRTIO_NET_F_GUEST_CSUM"); >
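A note on the PRINT_SBIT_NAME macro above: #__name stringizes the macro argument, and the [16] indexes past the 16-character "VIRTIO_CONFIG_S_" prefix, so only the interesting suffix is printed. For example:

    /* "VIRTIO_CONFIG_S_DRIVER_OK"[16] starts at "DRIVER_OK", so this
     * appends " DRIVER_OK" to the seq_file when the bit is set */
    PRINT_SBIT_NAME(seq, status, VIRTIO_CONFIG_S_DRIVER_OK);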
Re: [PATCH] vduse: avoid empty string for dev name
On Tue, May 30, 2023 at 11:37 AM Sheng Zhao wrote: > > Syzkaller hits a kernel WARN when the first character of the dev name > provided is NULL. Solution is to add a NULL check before calling > cdev_device_add() in vduse_create_dev(). > > kobject: (72042169): attempted to be registered with empty name! > WARNING: CPU: 0 PID: 112695 at lib/kobject.c:236 > Call Trace: > kobject_add_varg linux/src/lib/kobject.c:390 [inline] > kobject_add+0xf6/0x150 linux/src/lib/kobject.c:442 > device_add+0x28f/0xc20 linux/src/drivers/base/core.c:2167 > cdev_device_add+0x83/0xc0 linux/src/fs/char_dev.c:546 > vduse_create_dev linux/src/drivers/vdpa/vdpa_user/vduse_dev.c:2254 [inline] > vduse_ioctl+0x7b5/0xf30 linux/src/drivers/vdpa/vdpa_user/vduse_dev.c:2316 > vfs_ioctl linux/src/fs/ioctl.c:47 [inline] > file_ioctl linux/src/fs/ioctl.c:510 [inline] > do_vfs_ioctl+0x14b/0xa80 linux/src/fs/ioctl.c:697 > ksys_ioctl+0x7c/0xa0 linux/src/fs/ioctl.c:714 > __do_sys_ioctl linux/src/fs/ioctl.c:721 [inline] > __se_sys_ioctl linux/src/fs/ioctl.c:719 [inline] > __x64_sys_ioctl+0x42/0x50 linux/src/fs/ioctl.c:719 > do_syscall_64+0x94/0x330 linux/src/arch/x86/entry/common.c:291 > entry_SYSCALL_64_after_hwframe+0x44/0xa9 > > Reported-by: Xianjun Zeng > Signed-off-by: Sheng Zhao Acked-by: Jason Wang Thanks > --- > drivers/vdpa/vdpa_user/vduse_dev.c | 3 +++ > 1 file changed, 3 insertions(+) > > diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c > b/drivers/vdpa/vdpa_user/vduse_dev.c > index de97e38c3b82..5f5c21674fdc 100644 > --- a/drivers/vdpa/vdpa_user/vduse_dev.c > +++ b/drivers/vdpa/vdpa_user/vduse_dev.c > @@ -1685,6 +1685,9 @@ static bool vduse_validate_config(struct > vduse_dev_config *config) > if (config->vq_num > 0x) > return false; > > + if (!config->name[0]) > + return false; > + > if (!device_is_allowed(config->device_id)) > return false; > > -- > 2.20.1
Re: [PATCH 3/3] fork, vhost: Use CLONE_THREAD to fix freezer/ps regression
On 2023/5/23 23:57, Eric W. Biederman wrote: Oleg Nesterov writes: On 05/22, Oleg Nesterov wrote: Right now I think that "int dead" should die, No, probably we shouldn't call get_signal() if we have already dequeued SIGKILL. Very much agreed. It is one thing to add a patch to move do_exit out of get_signal. It is another to keep calling get_signal after that. Nothing tests that case, and so we get some weird behaviors. but let me think tomorrow. Maybe something like this... I don't like it but I can't suggest anything better right now. bool killed = false; for (;;) { ... node = llist_del_all(&worker->work_list); if (!node) { schedule(); /* * When we get a SIGKILL our release function will * be called. That will stop new IOs from being queued * and check for outstanding cmd responses. It will then * call vhost_task_stop to tell us to return and exit. */ if (signal_pending(current)) { struct ksignal ksig; if (!killed) killed = get_signal(&ksig); clear_thread_flag(TIF_SIGPENDING); } continue; } I want to point out that we need to consider not just SIGKILL, but SIGABRT that causes a coredump, as well as the process performing an ordinary exit(2). All of which will cause get_signal to return SIGKILL in this context. --- But let me ask a couple of questions. I share most of these questions. Let's forget this patch, let's look at the current code: node = llist_del_all(&worker->work_list); if (!node) schedule(); node = llist_reverse_order(node); ... process works ... To me this looks a bit confusing. Shouldn't we do if (!node) { schedule(); continue; } just to make the code a bit more clear? If node == NULL then llist_reverse_order() and llist_for_each_entry_safe() will do nothing. But this is minor. /* make sure flag is seen after deletion */ smp_wmb(); llist_for_each_entry_safe(work, work_next, node, node) { clear_bit(VHOST_WORK_QUEUED, &work->flags); I am not sure about smp_wmb + clear_bit. Once we clear VHOST_WORK_QUEUED, vhost_work_queue() can add this work again and change work->node->next. That is why we use _safe, but we need to ensure that llist_for_each_safe() completes LOAD(work->node->next) before VHOST_WORK_QUEUED is cleared. So it seems that smp_wmb() can't help and should be removed, instead we need llist_for_each_entry_safe(...) { smp_mb__before_atomic(); clear_bit(VHOST_WORK_QUEUED, &work->flags); Also, if the work->fn pointer is not stable, we should read it before smp_mb__before_atomic() as well. No? __set_current_state(TASK_RUNNING); Why do we set TASK_RUNNING inside the loop? Does this mean that work->fn() can return with current->state != RUNNING ? work->fn(work); Now the main question. Whatever we do, SIGKILL/SIGSTOP/etc can come right before we call work->fn(). Is it "safe" to run this callback with signal_pending() or fatal_signal_pending() ? Finally. I never looked into drivers/vhost/ before so I don't understand this code at all, but let me ask anyway... Can we change vhost_dev_flush() to run the pending callbacks rather than wait for vhost_worker() ? I guess we can't, ->mm won't be correct, but can you confirm? In a conversation long ago I remember hearing that vhost does not support file descriptor passing. Which means all of the file descriptors should be in the same process. It's not. Actually, passing a vhost fd is pretty common since QEMU usually runs without privilege, so it is the management layer's job to open the vhost fd and pass it to QEMU.
Looking at the vhost code what I am seeing happening is that the vhost_worker persists until vhost_dev_cleanup is called from one of the vhost_???_release() functions. The release functions are only called after the last flush function completes. See __fput if you want to trace the details. On one hand this all seems reasonable. On the other hand I am not seeing the code that prevents file descriptor passing. Yes. It is probably not the worst thing in the world, but what this means is now if you pass a copy of the vhost file descriptor to another process the vhost_worker will persis
Re: [PATCH 3/3] fork, vhost: Use CLONE_THREAD to fix freezer/ps regression
On 2023/5/23 20:15, Oleg Nesterov wrote: On 05/22, Oleg Nesterov wrote: Right now I think that "int dead" should die, No, probably we shouldn't call get_signal() if we have already dequeued SIGKILL. but let me think tomorrow. Maybe something like this... I don't like it but I can't suggest anything better right now. bool killed = false; for (;;) { ... node = llist_del_all(&worker->work_list); if (!node) { schedule(); /* * When we get a SIGKILL our release function will * be called. That will stop new IOs from being queued * and check for outstanding cmd responses. It will then * call vhost_task_stop to tell us to return and exit. */ if (signal_pending(current)) { struct ksignal ksig; if (!killed) killed = get_signal(&ksig); clear_thread_flag(TIF_SIGPENDING); } continue; } --- But let me ask a couple of questions. Let's forget this patch, let's look at the current code: node = llist_del_all(&worker->work_list); if (!node) schedule(); node = llist_reverse_order(node); ... process works ... To me this looks a bit confusing. Shouldn't we do if (!node) { schedule(); continue; } just to make the code a bit more clear? If node == NULL then llist_reverse_order() and llist_for_each_entry_safe() will do nothing. But this is minor. Yes. /* make sure flag is seen after deletion */ smp_wmb(); llist_for_each_entry_safe(work, work_next, node, node) { clear_bit(VHOST_WORK_QUEUED, &work->flags); I am not sure about smp_wmb + clear_bit. Once we clear VHOST_WORK_QUEUED, vhost_work_queue() can add this work again and change work->node->next. That is why we use _safe, but we need to ensure that llist_for_each_safe() completes LOAD(work->node->next) before VHOST_WORK_QUEUED is cleared. This should be fine since the store is not speculated, so work->node->next needs to be loaded before VHOST_WORK_QUEUED is cleared to meet the loop condition. So it seems that smp_wmb() can't help and should be removed, instead we need llist_for_each_entry_safe(...) { smp_mb__before_atomic(); clear_bit(VHOST_WORK_QUEUED, &work->flags); Also, if the work->fn pointer is not stable, we should read it before smp_mb__before_atomic() as well. The fn won't be changed after it is initialized. No? __set_current_state(TASK_RUNNING); Why do we set TASK_RUNNING inside the loop? Does this mean that work->fn() can return with current->state != RUNNING ? It is because the state was set to TASK_INTERRUPTIBLE at the beginning of the loop; otherwise there might be side effects while executing work->fn(). work->fn(work); Now the main question. Whatever we do, SIGKILL/SIGSTOP/etc can come right before we call work->fn(). Is it "safe" to run this callback with signal_pending() or fatal_signal_pending() ? It looks safe since: 1) vhost holds a refcnt of the mm 2) release will sync with the worker Finally. I never looked into drivers/vhost/ before so I don't understand this code at all, but let me ask anyway... Can we change vhost_dev_flush() to run the pending callbacks rather than wait for vhost_worker() ? I guess we can't, ->mm won't be correct, but can you confirm? Yes. Thanks Oleg.
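For readers tracking the ordering question, Oleg's proposed replacement for the smp_wmb() would slot into the loop like this (a sketch of the suggestion from the thread, not a merged patch):

    llist_for_each_entry_safe(work, work_next, node, node) {
            /* order the iterator's load of work->node.next before the
             * clear; once VHOST_WORK_QUEUED is cleared,
             * vhost_work_queue() may re-add the work and rewrite
             * work->node.next */
            smp_mb__before_atomic();
            clear_bit(VHOST_WORK_QUEUED, &work->flags);
            __set_current_state(TASK_RUNNING);
            work->fn(work);
    }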
Re: [PATCH] virtio_ring: validate used buffer length
On Wed, May 31, 2023 at 1:50 PM Michael S. Tsirkin wrote: > > On Wed, May 31, 2023 at 09:05:00AM +0800, Jason Wang wrote: > > On Mon, May 29, 2023 at 6:03 PM Michael S. Tsirkin wrote: > > > > > > On Mon, May 29, 2023 at 09:18:10AM +0800, Jason Wang wrote: > > > > On Sun, May 28, 2023 at 3:57 PM Michael S. Tsirkin > > > > wrote: > > > > > > > > > > On Fri, May 26, 2023 at 02:30:41PM +0800, Jason Wang wrote: > > > > > > This patch validate > > > > > > > > > > validates > > > > > > > > > > > the used buffer length provided by the device > > > > > > before trying to use it. > > > > > > > > > > before returning it to caller > > > > > > > > > > > This is done by remembering the in buffer > > > > > > length in a dedicated array during virtqueue_add(), then we can fail > > > > > > the virtqueue_get_buf() when we find the device is trying to give > > > > > > us a > > > > > > used buffer length which is greater than we stored before. > > > > > > > > > > than what we stored > > > > > > > > > > > > > > > > > This validation is disable > > > > > > > > > > disabled > > > > > > > > > > > by default via module parameter to unbreak > > > > > > some existing devices since some legacy devices are known to report > > > > > > buggy used length. > > > > > > > > > > > > Signed-off-by: Jason Wang > > > > > > > > > > First I'm not merging this without more data about > > > > > what is known to be broken and what is known to work well > > > > > in the commit log. And how exactly do things work if used length > > > > > is wrong? > > > > > > > > Assuming the device is malicious, it would be very hard to answer. > > > > Auditing and fuzzing won't cover every case. Instead of trying to seek > > > > the answer, we can simply make sure the used in buffer length is > > > > validated then we know we're fine or not. > > > > > > To restate the question, you said above "some legacy devices are known > > > to report buggy used length". If they report buggy length then how > > > can things work? > > > > The validation is disabled for legacy device (as stated in the changelog): > > > > static bool vring_needs_used_validation(const struct virtio_device *vdev) > > { > > /* > > * Several legacy devices are known to produce buggy used > > * length. In order to let driver work, we won't validate used > > * buffer length in this case. > > */ > > if (!virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) > > return false; > > if (force_used_validation) > > return true; > > return false; > > } > > > > This seems to be what we've agreed in last version: > > > > https://lore.kernel.org/all/canlsykxfhamuu0bb4j7y6n4_g9odkxlcjxxgxex4sj6_kf+...@mail.gmail.com/T/#m31f3b06f9032beec175c312dfa2532cb08b15c56 > > > > Thanks > > > > I don't get it. You wrote: > > This validation is disable > by default via module parameter to unbreak > some existing devices since some legacy devices are known to report > buggy used length. > > which devices? legacy rpmsg and vsock device (before 49d8c5ffad07) at least. > why do you need a module parameter? If we enable it unconditionally for modern devices, it may break some buggy moden device (vsock without a fix as an example). > > > > > > > > > > Second what's wrong with dma_desc_extra that we already maintain? > > > > > Third motivation - it's part and parcel of the hardening effort yes? > > > > > > > > They are different. dma_desc_extra is for a descriptor ring, but this > > > > is for a used ring. Technically we can go back to iterate on the > > > > descriptor ring for a legal used in buffer length. 
But it will have > > > > worse performance. > > > > > > I don't really understand. We already iterate when we unmap - > > > all that is necessary is to subtract it from used length, if at > > > the end of the process it is >0 then we know used leng
Re: [PATCH 3/3] fork, vhost: Use CLONE_THREAD to fix freezer/ps regression
On Wed, May 31, 2023 at 3:25 PM Oleg Nesterov wrote: > > On 05/31, Jason Wang wrote: > > > > 在 2023/5/23 20:15, Oleg Nesterov 写道: > > > > > > /* make sure flag is seen after deletion */ > > > smp_wmb(); > > > llist_for_each_entry_safe(work, work_next, node, node) { > > > clear_bit(VHOST_WORK_QUEUED, &work->flags); > > > > > >I am not sure about smp_wmb + clear_bit. Once we clear VHOST_WORK_QUEUED, > > >vhost_work_queue() can add this work again and change work->node->next. > > > > > >That is why we use _safe, but we need to ensure that llist_for_each_safe() > > >completes LOAD(work->node->next) before VHOST_WORK_QUEUED is cleared. > > > > This should be fine since store is not speculated, so work->node->next needs > > to be loaded before VHOST_WORK_QUEUED is cleared to meet the loop condition. > > I don't understand you. OK, to simplify, suppose we have 2 global vars > > void *PTR = something_non_null; > unsigned long FLAGS = -1ul; > > Now I think this code > > CPU_0 CPU_1 > > void *ptr = PTR;if (!test_and_set_bit(0, FLAGS)) > clear_bit(0, FLAGS);PTR = NULL; > BUG_ON(!ptr); > > is racy and can hit the BUG_ON(!ptr). This seems different to the above case? And you can hit BUG_ON with the following execution sequence: [cpu 0] clear_bit(0, FLAGS); [cpu 1] if (!test_and_set_bit(0, FLAGS)) [cpu 1] PTR = NULL; [cpu 0] BUG_ON(!ptr) In vhost code, there's a condition before the clear_bit() which sits inside llist_for_each_entry_safe(): #define llist_for_each_entry_safe(pos, n, node, member)\ for (pos = llist_entry((node), typeof(*pos), member); \ member_address_is_nonnull(pos, member) && \ (n = llist_entry(pos->member.next, typeof(*n), member), true); \ pos = n) The clear_bit() is a store which is not speculated, so there's a control dependency, the store can't be executed until the condition expression is evaluated which requires pos->member.next (work->node.next) to be loaded. > > I guess it is fine on x86, but in general you need smp_mb__before_atomic() > before clear_bit(), or clear_bit_unlock(). > > > > __set_current_state(TASK_RUNNING); > > > > > >Why do we set TASK_RUNNING inside the loop? Does this mean that work->fn() > > >can return with current->state != RUNNING ? > > > > It is because the state were set to TASK_INTERRUPTIBLE in the beginning of > > the loop otherwise it might be side effect while executing work->fn(). > > Again, I don't understand you. So let me repeat: can work->fn() return with > current->_state != TASK_RUNNING ? If not (and I'd say it should not), you can > do __set_current_state(TASK_RUNNING) once, before llist_for_each_entry_safe(). > Ok, that should be fine. Thanks > > >Now the main question. Whatever we do, SIGKILL/SIGSTOP/etc can come right > > >before we call work->fn(). Is it "safe" to run this callback with > > >signal_pending() or fatal_signal_pending() ? > > > > It looks safe since: > > > > 1) vhost hold refcnt of the mm > > 2) release will sync with the worker > > Well, that's not what I asked... nevermind, please forget. > > Thanks. > > Oleg. > ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
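As background for the disagreement, the canonical load-store control dependency from Documentation/memory-barriers.txt has this shape; whether the comma expression inside llist_for_each_entry_safe() preserves it is exactly what is being debated:

    q = READ_ONCE(a);
    if (q)
            WRITE_ONCE(b, 1); /* the store cannot be reordered before the load of a */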
Re: [PATCH] virtio_ring: validate used buffer length
On Wed, May 31, 2023 at 3:36 PM Jason Wang wrote: > > On Wed, May 31, 2023 at 1:50 PM Michael S. Tsirkin wrote: > > > > On Wed, May 31, 2023 at 09:05:00AM +0800, Jason Wang wrote: > > > On Mon, May 29, 2023 at 6:03 PM Michael S. Tsirkin > > > wrote: > > > > > > > > On Mon, May 29, 2023 at 09:18:10AM +0800, Jason Wang wrote: > > > > > On Sun, May 28, 2023 at 3:57 PM Michael S. Tsirkin > > > > > wrote: > > > > > > > > > > > > On Fri, May 26, 2023 at 02:30:41PM +0800, Jason Wang wrote: > > > > > > > This patch validate > > > > > > > > > > > > validates > > > > > > > > > > > > > the used buffer length provided by the device > > > > > > > before trying to use it. > > > > > > > > > > > > before returning it to caller > > > > > > > > > > > > > This is done by remembering the in buffer > > > > > > > length in a dedicated array during virtqueue_add(), then we can > > > > > > > fail > > > > > > > the virtqueue_get_buf() when we find the device is trying to give > > > > > > > us a > > > > > > > used buffer length which is greater than we stored before. > > > > > > > > > > > > than what we stored > > > > > > > > > > > > > > > > > > > > This validation is disable > > > > > > > > > > > > disabled > > > > > > > > > > > > > by default via module parameter to unbreak > > > > > > > some existing devices since some legacy devices are known to > > > > > > > report > > > > > > > buggy used length. > > > > > > > > > > > > > > Signed-off-by: Jason Wang > > > > > > > > > > > > First I'm not merging this without more data about > > > > > > what is known to be broken and what is known to work well > > > > > > in the commit log. And how exactly do things work if used length > > > > > > is wrong? > > > > > > > > > > Assuming the device is malicious, it would be very hard to answer. > > > > > Auditing and fuzzing won't cover every case. Instead of trying to seek > > > > > the answer, we can simply make sure the used in buffer length is > > > > > validated then we know we're fine or not. > > > > > > > > To restate the question, you said above "some legacy devices are known > > > > to report buggy used length". If they report buggy length then how > > > > can things work? > > > > > > The validation is disabled for legacy device (as stated in the changelog): > > > > > > static bool vring_needs_used_validation(const struct virtio_device *vdev) > > > { > > > /* > > > * Several legacy devices are known to produce buggy used > > > * length. In order to let driver work, we won't validate used > > > * buffer length in this case. > > > */ > > > if (!virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) > > > return false; > > > if (force_used_validation) > > > return true; > > > return false; > > > } > > > > > > This seems to be what we've agreed in last version: > > > > > > https://lore.kernel.org/all/canlsykxfhamuu0bb4j7y6n4_g9odkxlcjxxgxex4sj6_kf+...@mail.gmail.com/T/#m31f3b06f9032beec175c312dfa2532cb08b15c56 > > > > > > Thanks > > > > > > > I don't get it. You wrote: > > > > This validation is disable > > by default via module parameter to unbreak > > some existing devices since some legacy devices are known to report > > buggy used length. > > > > which devices? > > legacy rpmsg and vsock device (before 49d8c5ffad07) at least. > > > why do you need a module parameter? > > If we enable it unconditionally for modern devices, it may break some > buggy moden device (vsock without a fix as an example). > > > > > > > > > > > > > > > Second what's wrong with dma_desc_extra that we already maintain? > > > > > > Third motivation -
Re: [PATCH] virtio_ring: validate used buffer length
On Wed, May 31, 2023 at 5:55 PM Michael S. Tsirkin wrote: > > On Wed, May 31, 2023 at 03:36:51PM +0800, Jason Wang wrote: > > On Wed, May 31, 2023 at 1:50 PM Michael S. Tsirkin wrote: > > > > > > On Wed, May 31, 2023 at 09:05:00AM +0800, Jason Wang wrote: > > > > On Mon, May 29, 2023 at 6:03 PM Michael S. Tsirkin > > > > wrote: > > > > > > > > > > On Mon, May 29, 2023 at 09:18:10AM +0800, Jason Wang wrote: > > > > > > On Sun, May 28, 2023 at 3:57 PM Michael S. Tsirkin > > > > > > wrote: > > > > > > > > > > > > > > On Fri, May 26, 2023 at 02:30:41PM +0800, Jason Wang wrote: > > > > > > > > This patch validate > > > > > > > > > > > > > > validates > > > > > > > > > > > > > > > the used buffer length provided by the device > > > > > > > > before trying to use it. > > > > > > > > > > > > > > before returning it to caller > > > > > > > > > > > > > > > This is done by remembering the in buffer > > > > > > > > length in a dedicated array during virtqueue_add(), then we can > > > > > > > > fail > > > > > > > > the virtqueue_get_buf() when we find the device is trying to > > > > > > > > give us a > > > > > > > > used buffer length which is greater than we stored before. > > > > > > > > > > > > > > than what we stored > > > > > > > > > > > > > > > > > > > > > > > This validation is disable > > > > > > > > > > > > > > disabled > > > > > > > > > > > > > > > by default via module parameter to unbreak > > > > > > > > some existing devices since some legacy devices are known to > > > > > > > > report > > > > > > > > buggy used length. > > > > > > > > > > > > > > > > Signed-off-by: Jason Wang > > > > > > > > > > > > > > First I'm not merging this without more data about > > > > > > > what is known to be broken and what is known to work well > > > > > > > in the commit log. And how exactly do things work if used length > > > > > > > is wrong? > > > > > > > > > > > > Assuming the device is malicious, it would be very hard to answer. > > > > > > Auditing and fuzzing won't cover every case. Instead of trying to > > > > > > seek > > > > > > the answer, we can simply make sure the used in buffer length is > > > > > > validated then we know we're fine or not. > > > > > > > > > > To restate the question, you said above "some legacy devices are known > > > > > to report buggy used length". If they report buggy length then how > > > > > can things work? > > > > > > > > The validation is disabled for legacy device (as stated in the > > > > changelog): > > > > > > > > static bool vring_needs_used_validation(const struct virtio_device > > > > *vdev) > > > > { > > > > /* > > > > * Several legacy devices are known to produce buggy used > > > > * length. In order to let driver work, we won't validate used > > > > * buffer length in this case. > > > > */ > > > > if (!virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) > > > > return false; > > > > if (force_used_validation) > > > > return true; > > > > return false; > > > > } > > > > > > > > This seems to be what we've agreed in last version: > > > > > > > > https://lore.kernel.org/all/canlsykxfhamuu0bb4j7y6n4_g9odkxlcjxxgxex4sj6_kf+...@mail.gmail.com/T/#m31f3b06f9032beec175c312dfa2532cb08b15c56 > > > > > > > > Thanks > > > > > > > > > > I don't get it. You wrote: > > > > > > This validation is disable > > > by default via module parameter to unbreak > > > some existing devices since so
Re: [PATCH] virtio_ring: validate used buffer length
On Wed, May 31, 2023 at 6:25 PM Michael S. Tsirkin wrote: > > On Wed, May 31, 2023 at 04:26:38PM +0800, Jason Wang wrote: > > On Wed, May 31, 2023 at 3:36 PM Jason Wang wrote: > > > > > > On Wed, May 31, 2023 at 1:50 PM Michael S. Tsirkin > > > wrote: > > > > > > > > On Wed, May 31, 2023 at 09:05:00AM +0800, Jason Wang wrote: > > > > > On Mon, May 29, 2023 at 6:03 PM Michael S. Tsirkin > > > > > wrote: > > > > > > > > > > > > On Mon, May 29, 2023 at 09:18:10AM +0800, Jason Wang wrote: > > > > > > > On Sun, May 28, 2023 at 3:57 PM Michael S. Tsirkin > > > > > > > wrote: > > > > > > > > > > > > > > > > On Fri, May 26, 2023 at 02:30:41PM +0800, Jason Wang wrote: > > > > > > > > > This patch validate > > > > > > > > > > > > > > > > validates > > > > > > > > > > > > > > > > > the used buffer length provided by the device > > > > > > > > > before trying to use it. > > > > > > > > > > > > > > > > before returning it to caller > > > > > > > > > > > > > > > > > This is done by remembering the in buffer > > > > > > > > > length in a dedicated array during virtqueue_add(), then we > > > > > > > > > can fail > > > > > > > > > the virtqueue_get_buf() when we find the device is trying to > > > > > > > > > give us a > > > > > > > > > used buffer length which is greater than we stored before. > > > > > > > > > > > > > > > > than what we stored > > > > > > > > > > > > > > > > > > > > > > > > > > This validation is disable > > > > > > > > > > > > > > > > disabled > > > > > > > > > > > > > > > > > by default via module parameter to unbreak > > > > > > > > > some existing devices since some legacy devices are known to > > > > > > > > > report > > > > > > > > > buggy used length. > > > > > > > > > > > > > > > > > > Signed-off-by: Jason Wang > > > > > > > > > > > > > > > > First I'm not merging this without more data about > > > > > > > > what is known to be broken and what is known to work well > > > > > > > > in the commit log. And how exactly do things work if used length > > > > > > > > is wrong? > > > > > > > > > > > > > > Assuming the device is malicious, it would be very hard to answer. > > > > > > > Auditing and fuzzing won't cover every case. Instead of trying to > > > > > > > seek > > > > > > > the answer, we can simply make sure the used in buffer length is > > > > > > > validated then we know we're fine or not. > > > > > > > > > > > > To restate the question, you said above "some legacy devices are > > > > > > known > > > > > > to report buggy used length". If they report buggy length then how > > > > > > can things work? > > > > > > > > > > The validation is disabled for legacy device (as stated in the > > > > > changelog): > > > > > > > > > > static bool vring_needs_used_validation(const struct virtio_device > > > > > *vdev) > > > > > { > > > > > /* > > > > > * Several legacy devices are known to produce buggy used > > > > > * length. In order to let driver work, we won't validate used > > > > > * buffer length in this case. > > > > > */ > > > > > if (!virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) > > > > > return false; > > > > > if (force_used_validation) > > > > > return true; > > > > > return false; > > > > > } > > > > > > > > > > This seems to be what we've agreed in last version: >
Re: [PATCH 3/3] fork, vhost: Use CLONE_THREAD to fix freezer/ps regression
On Wed, May 31, 2023 at 5:14 PM Oleg Nesterov wrote: > > On 05/31, Jason Wang wrote: > > > > On Wed, May 31, 2023 at 3:25 PM Oleg Nesterov wrote: > > > > > > On 05/31, Jason Wang wrote: > > > > > > > > 在 2023/5/23 20:15, Oleg Nesterov 写道: > > > > > > > > > > /* make sure flag is seen after deletion */ > > > > > smp_wmb(); > > > > > llist_for_each_entry_safe(work, work_next, node, node) { > > > > > clear_bit(VHOST_WORK_QUEUED, &work->flags); > > > > > > > > > >I am not sure about smp_wmb + clear_bit. Once we clear > > > > >VHOST_WORK_QUEUED, > > > > >vhost_work_queue() can add this work again and change work->node->next. > > > > > > > > > >That is why we use _safe, but we need to ensure that > > > > >llist_for_each_safe() > > > > >completes LOAD(work->node->next) before VHOST_WORK_QUEUED is cleared. > > > > > > > > This should be fine since store is not speculated, so work->node->next > > > > needs > > > > to be loaded before VHOST_WORK_QUEUED is cleared to meet the loop > > > > condition. > > > > > > I don't understand you. OK, to simplify, suppose we have 2 global vars > > > > > > void *PTR = something_non_null; > > > unsigned long FLAGS = -1ul; > > > > > > Now I think this code > > > > > > CPU_0 CPU_1 > > > > > > void *ptr = PTR;if (!test_and_set_bit(0, FLAGS)) > > > clear_bit(0, FLAGS);PTR = NULL; > > > BUG_ON(!ptr); > > > > > > is racy and can hit the BUG_ON(!ptr). > > > > This seems different to the above case? > > not sure, > > > And you can hit BUG_ON with > > the following execution sequence: > > > > [cpu 0] clear_bit(0, FLAGS); > > [cpu 1] if (!test_and_set_bit(0, FLAGS)) > > [cpu 1] PTR = NULL; > > [cpu 0] BUG_ON(!ptr) > > I don't understand this part... yes, we can hit this BUG_ON() without mb in > between, this is what I tried to say. I may miss something, but the above is the sequence that is executed by the processor (for each CPU, it's just the program order). So where do you expect to place an mb can help? > > > In vhost code, there's a condition before the clear_bit() which sits > > inside llist_for_each_entry_safe(): > > > > #define llist_for_each_entry_safe(pos, n, node, member) > >\ > > for (pos = llist_entry((node), typeof(*pos), member); > >\ > > member_address_is_nonnull(pos, member) && > >\ > > (n = llist_entry(pos->member.next, typeof(*n), member), > > true); \ > > pos = n) > > > > The clear_bit() is a store which is not speculated, so there's a > > control dependency, the store can't be executed until the condition > > expression is evaluated which requires pos->member.next > > (work->node.next) to be loaded. > > But llist_for_each_entry_safe() doesn't check "n", I mean, it is not that we > have > something like > > n = llist_entry(...); > if (n) > clear_bit(...); > > so I do not see how can we rely on the load-store control dependency. Just to make sure we are on the same page, the condition expression is member_address_is_nonnull(pos, member) && (n = llist_entry(pos->member.next, typeof(*n), member), true) So it's something like: if (work->node && (work_next = work->node->next, true)) clear_bit(&work->flags); So two loads from both work->node and work->node->next, and there's a store which is clear_bit, then it's a load-store control dependencies? Thanks > > Oleg. > ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH 3/3] fork, vhost: Use CLONE_THREAD to fix freezer/ps regression
On Thu, Jun 1, 2023 at 3:43 PM Oleg Nesterov wrote: > > On 06/01, Jason Wang wrote: > > > > On Wed, May 31, 2023 at 5:14 PM Oleg Nesterov wrote: > > > > > > > > I don't understand you. OK, to simplify, suppose we have 2 global vars > > > > > > > > > > void *PTR = something_non_null; > > > > > unsigned long FLAGS = -1ul; > > > > > > > > > > Now I think this code > > > > > > > > > > CPU_0 CPU_1 > > > > > > > > > > void *ptr = PTR;if (!test_and_set_bit(0, > > > > > FLAGS)) > > > > > clear_bit(0, FLAGS);PTR = NULL; > > > > > BUG_ON(!ptr); > > > > > > > > > > is racy and can hit the BUG_ON(!ptr). > > > > > > > > This seems different to the above case? > > > > > > not sure, > > > > > > > And you can hit BUG_ON with > > > > the following execution sequence: > > > > > > > > [cpu 0] clear_bit(0, FLAGS); > > > > [cpu 1] if (!test_and_set_bit(0, FLAGS)) > > > > [cpu 1] PTR = NULL; > > > > [cpu 0] BUG_ON(!ptr) > > > > > > I don't understand this part... yes, we can hit this BUG_ON() without mb > > > in > > > between, this is what I tried to say. > > > > I may miss something, > > Or me... note that CPU_0 loads the global "PTR" into the local "ptr" before > clear_bit. > Since you have mentioned the program order: yes this lacks READ_ONCE() or > barrier(), > but the same is true for the code in vhost_worker(). So I still don't > understand. > > > but the above is the sequence that is executed > > by the processor (for each CPU, it's just the program order). So where > > do you expect to place an mb can help? > > before clear_bit: > > CPU_0 > > void *ptr = PTR; > mb(); // implies compiler barrier as well > clear_bit(0, FLAGS); > BUG_ON(!ptr); > > just in case... mb() in the code above is only for illustration, we can use > smp_mb__before_atomic() + clear_bit(). Or just clear_bit_unlock(), iiuc the > one-way barrier is fine in this case. Ok, but it seems different, in the case of vhost we had a condition above the clear_bit(). > > > > > > In vhost code, there's a condition before the clear_bit() which sits > > > > inside llist_for_each_entry_safe(): > > > > > > > > #define llist_for_each_entry_safe(pos, n, node, member) > > > >\ > > > > for (pos = llist_entry((node), typeof(*pos), member); > > > >\ > > > > member_address_is_nonnull(pos, member) && > > > >\ > > > > (n = llist_entry(pos->member.next, typeof(*n), member), > > > > true); \ > > > > pos = n) > > > > > > > > The clear_bit() is a store which is not speculated, so there's a > > > > control dependency, the store can't be executed until the condition > > > > expression is evaluated which requires pos->member.next > > > > (work->node.next) to be loaded. > > > > > > But llist_for_each_entry_safe() doesn't check "n", I mean, it is not that > > > we have > > > something like > > > > > > n = llist_entry(...); > > > if (n) > > > clear_bit(...); > > > > > > so I do not see how can we rely on the load-store control dependency. > > > > Just to make sure we are on the same page, the condition expression is > > > > member_address_is_nonnull(pos, member) && (n = > > llist_entry(pos->member.next, typeof(*n), member), true) > > > > So it's something like: > > > > if (work->node && (work_next = work->node->next, true)) > > clear_bit(&work->flags); > > > > So two loads from both work->node and work->node->next, and there's a > > store which is clear_bit, then it's a load-store control dependencies? > > I guess you missed the comma expression... 
Probably not, see below: > Let me rewrite your pseudo-code > above, it is equivalent to > > if (work->node) { > if ((work_next = work->node->next, true)) > clear_bit(&work->flags); > }
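The C-semantics point in that rewritten form is worth annotating: the comma expression (work_next = work->node->next, true) always evaluates to true, so whether clear_bit() executes depends only on the null check of work->node, not on the value loaded into work_next; that is why Oleg doubts a control dependency can be relied on to order the second load against the store:

	if (work->node                                /* load #1; the branch depends on it */
	    && (work_next = work->node->next, true))  /* load #2; comma expr is constant true */
		clear_bit(VHOST_WORK_QUEUED, &work->flags);  /* the store */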
Re: [PATCH] vp_vdpa: Check queue number of vdpa device from add_config
On Fri, Jun 2, 2023 at 3:35 PM Angus Chen wrote: > > When add virtio_pci vdpa device,check the vqs number of device cap > and max_vq_pairs from add_config. > > Signed-off-by: Angus Chen > --- > drivers/vdpa/virtio_pci/vp_vdpa.c | 11 +-- > 1 file changed, 9 insertions(+), 2 deletions(-) > > diff --git a/drivers/vdpa/virtio_pci/vp_vdpa.c > b/drivers/vdpa/virtio_pci/vp_vdpa.c > index 281287fae89f..4bf1ab637d32 100644 > --- a/drivers/vdpa/virtio_pci/vp_vdpa.c > +++ b/drivers/vdpa/virtio_pci/vp_vdpa.c > @@ -478,7 +478,7 @@ static int vp_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, > const char *name, > struct device *dev = &pdev->dev; > struct vp_vdpa *vp_vdpa = NULL; > u64 device_features; > - int ret, i; > + int ret, i, queues; > > vp_vdpa = vdpa_alloc_device(struct vp_vdpa, vdpa, > dev, &vp_vdpa_ops, 1, 1, name, false); > @@ -491,7 +491,14 @@ static int vp_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, > const char *name, > vp_vdpa_mgtdev->vp_vdpa = vp_vdpa; > > vp_vdpa->vdpa.dma_dev = &pdev->dev; > - vp_vdpa->queues = vp_modern_get_num_queues(mdev); > + queues = vp_modern_get_num_queues(mdev); > + if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MAX_VQP)) { > + if (add_config->net.max_vq_pairs > queues / 2) > + return -EINVAL; > + queues = min_t(u32, queues, 2 * add_config->net.max_vq_pairs); Looks like you want to mediate the max_vqp here, but what happens: 1) harware have 4 queue paris 2) vp_vdpa cap it into 2 queue pairs 3) guest may still try to enable 4 queue paris For 3), the kernel needs to mediate the control virtqueue which seems not easy. How about simply starting from failing if the provisioned #qp is not equal to the one that hardware has? Thanks > + } > + > + vp_vdpa->queues = queues; > vp_vdpa->mdev = mdev; > > device_features = vp_modern_get_features(mdev); > -- > 2.25.1 > ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
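Spelled out, the alternative Jason suggests would replace the capping logic with a hard check; a minimal sketch, assuming it sits where the hunk above does (variable names hypothetical):

	if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MAX_VQP)) {
		u16 hw_queues = vp_modern_get_num_queues(mdev);

		/* Refuse a provisioned #qp that differs from what the
		 * hardware exposes, instead of capping it and then having
		 * to mediate the control virtqueue. */
		if (add_config->net.max_vq_pairs != hw_queues / 2)
			return -EINVAL;
	}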
Re: [PATCH] vhost-vdpa: filter VIRTIO_F_RING_PACKED feature
On Mon, Jun 5, 2023 at 10:58 PM Stefano Garzarella wrote: > > On Mon, Jun 05, 2023 at 09:54:57AM -0400, Michael S. Tsirkin wrote: > >On Mon, Jun 05, 2023 at 03:30:35PM +0200, Stefano Garzarella wrote: > >> On Mon, Jun 05, 2023 at 09:00:25AM -0400, Michael S. Tsirkin wrote: > >> > On Mon, Jun 05, 2023 at 02:54:20PM +0200, Stefano Garzarella wrote: > >> > > On Mon, Jun 05, 2023 at 08:41:54AM -0400, Michael S. Tsirkin wrote: > >> > > > On Mon, Jun 05, 2023 at 01:06:44PM +0200, Stefano Garzarella wrote: > >> > > > > vhost-vdpa IOCTLs (eg. VHOST_GET_VRING_BASE, VHOST_SET_VRING_BASE) > >> > > > > don't support packed virtqueue well yet, so let's filter the > >> > > > > VIRTIO_F_RING_PACKED feature for now in vhost_vdpa_get_features(). > >> > > > > > >> > > > > This way, even if the device supports it, we don't risk it being > >> > > > > negotiated, then the VMM is unable to set the vring state properly. > >> > > > > > >> > > > > Fixes: 4c8cf31885f6 ("vhost: introduce vDPA-based backend") > >> > > > > Cc: sta...@vger.kernel.org > >> > > > > Signed-off-by: Stefano Garzarella > >> > > > > --- > >> > > > > > >> > > > > Notes: > >> > > > > This patch should be applied before the "[PATCH v2 0/3] > >> > > > > vhost_vdpa: > >> > > > > better PACKED support" series [1] and backported in stable > >> > > > > branches. > >> > > > > > >> > > > > We can revert it when we are sure that everything is working > >> > > > > with > >> > > > > packed virtqueues. > >> > > > > > >> > > > > Thanks, > >> > > > > Stefano > >> > > > > > >> > > > > [1] > >> > > > > https://lore.kernel.org/virtualization/20230424225031.18947-1-shannon.nel...@amd.com/ > >> > > > > >> > > > I'm a bit lost here. So why am I merging "better PACKED support" > >> > > > then? > >> > > > >> > > To really support packed virtqueue with vhost-vdpa, at that point we > >> > > would > >> > > also have to revert this patch. > >> > > > >> > > I wasn't sure if you wanted to queue the series for this merge window. > >> > > In that case do you think it is better to send this patch only for > >> > > stable > >> > > branches? > >> > > > Does this patch make them a NOP? > >> > > > >> > > Yep, after applying the "better PACKED support" series and being > >> > > sure that > >> > > the IOCTLs of vhost-vdpa support packed virtqueue, we should revert > >> > > this > >> > > patch. > >> > > > >> > > Let me know if you prefer a different approach. > >> > > > >> > > I'm concerned that QEMU uses vhost-vdpa IOCTLs thinking that the kernel > >> > > interprets them the right way, when it does not. > >> > > > >> > > Thanks, > >> > > Stefano > >> > > > >> > > >> > If this fixes a bug can you add Fixes tags to each of them? Then it's ok > >> > to merge in this window. Probably easier than the elaborate > >> > mask/unmask dance. > >> > >> CCing Shannon (the original author of the "better PACKED support" > >> series). > >> > >> IIUC Shannon is going to send a v3 of that series to fix the > >> documentation, so Shannon can you also add the Fixes tags? > >> > >> Thanks, > >> Stefano > > > >Well this is in my tree already. Just reply with > >Fixes: <> > >to each and I will add these tags. > > I tried, but it is not easy since we added the support for packed > virtqueue in vdpa and vhost incrementally. 
> > Initially I was thinking of adding the same tag used here: > > Fixes: 4c8cf31885f6 ("vhost: introduce vDPA-based backend") > > Then I discovered that vq_state wasn't there, so I was thinking of > > Fixes: 530a5678bc00 ("vdpa: support packed virtqueue for set/get_vq_state()") > > So we would have to backport quite a few patches into the stable branches. > I don't know if it's worth it... > > I still think it is better to disable packed in the stable branches, > otherwise I have to make a list of all the patches we need. > > Any other ideas? AFAIK, except for vp_vdpa, pds seems to be the first parent that supports packed virtqueue. Users should not notice anything wrong if they don't use packed virtqueue. And the problem of vp_vdpa + packed virtqueue came since the day0 of vp_vdpa. It seems fine to do nothing I guess. Thanks > > Thanks, > Stefano > > ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
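For context, the patch being discussed is essentially a one-line mask in vhost-vdpa's feature reporting; condensed from the posted patch (not verbatim), the change in vhost_vdpa_get_features() is roughly:

	features = ops->get_device_features(vdpa);
	features &= ~BIT_ULL(VIRTIO_F_RING_PACKED);  /* never offer PACKED */
	if (copy_to_user(featurep, &features, sizeof(features)))
		return -EFAULT;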
Re: [PATCH net] virtio_net: use control_buf for coalesce params
On Tue, Jun 6, 2023 at 3:59 AM Brett Creeley wrote: > > Commit 699b045a8e43 ("net: virtio_net: notifications coalescing > support") added coalescing command support for virtio_net. However, > the coalesce commands are using buffers on the stack, which is causing > the device to see DMA errors. There should also be a complaint from > check_for_stack() in debug_dma_map_xyz(). Fix this by adding and using > coalesce params from the control_buf struct, which aligns with other > commands. > > Fixes: 699b045a8e43 ("net: virtio_net: notifications coalescing support") > Reviewed-by: Shannon Nelson > Signed-off-by: Allen Hubbe > Signed-off-by: Brett Creeley > --- > drivers/net/virtio_net.c | 16 ++++---- The patch is needed for -stable I think. Acked-by: Jason Wang Thanks > 1 file changed, 8 insertions(+), 8 deletions(-) > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c > index 56ca1d270304..486b5849033d 100644 > --- a/drivers/net/virtio_net.c > +++ b/drivers/net/virtio_net.c > @@ -205,6 +205,8 @@ struct control_buf { > __virtio16 vid; > __virtio64 offloads; > struct virtio_net_ctrl_rss rss; > + struct virtio_net_ctrl_coal_tx coal_tx; > + struct virtio_net_ctrl_coal_rx coal_rx; > }; > > struct virtnet_info { > @@ -2934,12 +2936,10 @@ static int virtnet_send_notf_coal_cmds(struct > virtnet_info *vi, >struct ethtool_coalesce *ec) > { > struct scatterlist sgs_tx, sgs_rx; > - struct virtio_net_ctrl_coal_tx coal_tx; > - struct virtio_net_ctrl_coal_rx coal_rx; > > - coal_tx.tx_usecs = cpu_to_le32(ec->tx_coalesce_usecs); > - coal_tx.tx_max_packets = cpu_to_le32(ec->tx_max_coalesced_frames); > - sg_init_one(&sgs_tx, &coal_tx, sizeof(coal_tx)); > + vi->ctrl->coal_tx.tx_usecs = cpu_to_le32(ec->tx_coalesce_usecs); > + vi->ctrl->coal_tx.tx_max_packets = > cpu_to_le32(ec->tx_max_coalesced_frames); > + sg_init_one(&sgs_tx, &vi->ctrl->coal_tx, sizeof(vi->ctrl->coal_tx)); > > if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_NOTF_COAL, > VIRTIO_NET_CTRL_NOTF_COAL_TX_SET, > @@ -2950,9 +2950,9 @@ static int virtnet_send_notf_coal_cmds(struct > virtnet_info *vi, > vi->tx_usecs = ec->tx_coalesce_usecs; > vi->tx_max_packets = ec->tx_max_coalesced_frames; > > - coal_rx.rx_usecs = cpu_to_le32(ec->rx_coalesce_usecs); > - coal_rx.rx_max_packets = cpu_to_le32(ec->rx_max_coalesced_frames); > - sg_init_one(&sgs_rx, &coal_rx, sizeof(coal_rx)); > + vi->ctrl->coal_rx.rx_usecs = cpu_to_le32(ec->rx_coalesce_usecs); > + vi->ctrl->coal_rx.rx_max_packets = > cpu_to_le32(ec->rx_max_coalesced_frames); > + sg_init_one(&sgs_rx, &vi->ctrl->coal_rx, sizeof(vi->ctrl->coal_rx)); > > if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_NOTF_COAL, > VIRTIO_NET_CTRL_NOTF_COAL_RX_SET, > -- > 2.17.1 > ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
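The bug class here is worth spelling out: virtqueue buffers are mapped with the DMA API, and stack memory is not DMA-able (with CONFIG_VMAP_STACK the stack is not even guaranteed to be translatable via virt_to_page()), which is exactly what check_for_stack() in the DMA debug code flags. Schematically:

	/* Before (broken): the scatterlist points into the kernel stack. */
	struct virtio_net_ctrl_coal_tx coal_tx;           /* automatic storage */
	sg_init_one(&sgs_tx, &coal_tx, sizeof(coal_tx));  /* device DMAs from stack */

	/* After (fixed): the scatterlist points into the long-lived,
	 * kmalloc'd control buffer, like every other control command. */
	sg_init_one(&sgs_tx, &vi->ctrl->coal_tx, sizeof(vi->ctrl->coal_tx));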
Re: [RFC PATCH net] virtio_net: Prevent napi_weight changes with VIRTIO_NET_F_NOTF_COAL support
On Tue, Jun 6, 2023 at 5:03 AM Brett Creeley wrote: > > Commit 699b045a8e43 ("net: virtio_net: notifications coalescing > support") added support for VIRTIO_NET_F_NOTF_COAL. The get_coalesce > call made changes to report "1" in tx_max_coalesced_frames if > VIRTIO_NET_F_NOTF_COAL is not supported and napi.weight is non-zero. > However, the napi_weight value could still be changed by the > set_coalesce call regardless of whether or not the device supports > VIRTIO_NET_F_NOTF_COAL. > > It seems like the tx_max_coalesced_frames value should not control more > than 1 thing (i.e. napi_weight and the device's tx_max_packets). So, fix > this by only allowing the napi_weight change if VIRTIO_NET_F_NOTF_COAL > is not supported by the virtio device. > > It wasn't clear to me if this was the intended behavior, so that's why > I'm sending this as an RFC patch initially. Based on the feedback, I > will resubmit as an official patch. It seems the current code is fine since: Before tx coalescing, we have two modes for tx interrupt: 1) TX NAPI mode, using NAPI to recycle xmit packets 2) TX no-NAPI mode, depends on the start_xmit() to recycle xmit packets Each has their own use cases. E.g 1) seems to have better buffer interaction with TCP. But 2) seems to behave better if user cares about PPS and it can gives us 2x PPS when using a vhost-user backend. So we leave an option to switch between those two via sq.napi_weight ethtool -C tx-frames-irq 0 // To disable tx interrupts ethtool -C tx-frames-irq 1 // To enable tx interrupts After tx intr coleasing, we want to stick to this API. ethtool -C tx-frames-irq 0 // To disable tx interrupts ethtool -C tx-frames-irq N (N>=1) // To enable tx interrupts Thanks > > Fixes: 699b045a8e43 ("net: virtio_net: notifications coalescing support") > Signed-off-by: Brett Creeley > --- > drivers/net/virtio_net.c | 24 +--- > 1 file changed, 13 insertions(+), 11 deletions(-) > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c > index 486b5849033d..e28387866909 100644 > --- a/drivers/net/virtio_net.c > +++ b/drivers/net/virtio_net.c > @@ -2990,19 +2990,21 @@ static int virtnet_set_coalesce(struct net_device > *dev, > int ret, i, napi_weight; > bool update_napi = false; > > - /* Can't change NAPI weight if the link is up */ > - napi_weight = ec->tx_max_coalesced_frames ? NAPI_POLL_WEIGHT : 0; > - if (napi_weight ^ vi->sq[0].napi.weight) { > - if (dev->flags & IFF_UP) > - return -EBUSY; > - else > - update_napi = true; > - } > - > - if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_NOTF_COAL)) > + if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_NOTF_COAL)) { > ret = virtnet_send_notf_coal_cmds(vi, ec); > - else > + } else { > + /* Can't change NAPI weight if the link is up */ > + napi_weight = ec->tx_max_coalesced_frames ? > + NAPI_POLL_WEIGHT : 0; > + if (napi_weight ^ vi->sq[0].napi.weight) { > + if (dev->flags & IFF_UP) > + return -EBUSY; > + else > + update_napi = true; > + } > + > ret = virtnet_coal_params_supported(ec); > + } > > if (ret) > return ret; > -- > 2.17.1 > ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
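In code, the convention being preserved is the single ternary the RFC relocates (quoted from the hunk above):

	napi_weight = ec->tx_max_coalesced_frames ? NAPI_POLL_WEIGHT : 0;
	/* i.e.  ethtool -C ethX tx-frames-irq 0        -> tx NAPI off
	 *       ethtool -C ethX tx-frames-irq N (N>=1) -> tx NAPI on */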
Re: [RFC PATCH net] virtio_net: Prevent napi_weight changes with VIRTIO_NET_F_NOTF_COAL support
On Tue, Jun 6, 2023 at 9:57 AM Xuan Zhuo wrote: > > On Mon, 5 Jun 2023 14:02:36 -0700, Brett Creeley > wrote: > > Commit 699b045a8e43 ("net: virtio_net: notifications coalescing > > support") added support for VIRTIO_NET_F_NOTF_COAL. The get_coalesce > > call made changes to report "1" in tx_max_coalesced_frames if > > VIRTIO_NET_F_NOTF_COAL is not supported and napi.weight is non-zero. > > However, the napi_weight value could still be changed by the > > set_coalesce call regardless of whether or not the device supports > > VIRTIO_NET_F_NOTF_COAL. > > > > It seems like the tx_max_coalesced_frames value should not control more > > than 1 thing (i.e. napi_weight and the device's tx_max_packets). So, fix > > this by only allowing the napi_weight change if VIRTIO_NET_F_NOTF_COAL > > is not supported by the virtio device. > > > @Jason I wonder should we keep this function to change the napi weight by the > coalesec command. I think so, explained in another thread. Thanks > > Thanks. > > > > > It wasn't clear to me if this was the intended behavior, so that's why > > I'm sending this as an RFC patch initially. Based on the feedback, I > > will resubmit as an official patch. > > > > Fixes: 699b045a8e43 ("net: virtio_net: notifications coalescing support") > > Signed-off-by: Brett Creeley > > --- > > drivers/net/virtio_net.c | 24 +--- > > 1 file changed, 13 insertions(+), 11 deletions(-) > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c > > index 486b5849033d..e28387866909 100644 > > --- a/drivers/net/virtio_net.c > > +++ b/drivers/net/virtio_net.c > > @@ -2990,19 +2990,21 @@ static int virtnet_set_coalesce(struct net_device > > *dev, > > int ret, i, napi_weight; > > bool update_napi = false; > > > > - /* Can't change NAPI weight if the link is up */ > > - napi_weight = ec->tx_max_coalesced_frames ? NAPI_POLL_WEIGHT : 0; > > - if (napi_weight ^ vi->sq[0].napi.weight) { > > - if (dev->flags & IFF_UP) > > - return -EBUSY; > > - else > > - update_napi = true; > > - } > > - > > - if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_NOTF_COAL)) > > + if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_NOTF_COAL)) { > > ret = virtnet_send_notf_coal_cmds(vi, ec); > > - else > > + } else { > > + /* Can't change NAPI weight if the link is up */ > > + napi_weight = ec->tx_max_coalesced_frames ? > > + NAPI_POLL_WEIGHT : 0; > > + if (napi_weight ^ vi->sq[0].napi.weight) { > > + if (dev->flags & IFF_UP) > > + return -EBUSY; > > + else > > + update_napi = true; > > + } > > + > > ret = virtnet_coal_params_supported(ec); > > + } > > > > if (ret) > > return ret; > > -- > > 2.17.1 > > > ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH] vp_vdpa: Check queue number of vdpa device from add_config
On Tue, Jun 6, 2023 at 2:19 PM Angus Chen wrote: > > Hi,Jason. > > > -Original Message- > > From: Jason Wang > > Sent: Monday, June 5, 2023 2:54 PM > > To: Angus Chen > > Cc: m...@redhat.com; virtualization@lists.linux-foundation.org; > > linux-ker...@vger.kernel.org > > Subject: Re: [PATCH] vp_vdpa: Check queue number of vdpa device from > > add_config > > > > On Fri, Jun 2, 2023 at 3:35 PM Angus Chen > > wrote: > > > > > > When add virtio_pci vdpa device,check the vqs number of device cap > > > and max_vq_pairs from add_config. > > > > > > Signed-off-by: Angus Chen > > > --- > > > drivers/vdpa/virtio_pci/vp_vdpa.c | 11 +-- > > > 1 file changed, 9 insertions(+), 2 deletions(-) > > > > > > diff --git a/drivers/vdpa/virtio_pci/vp_vdpa.c > > b/drivers/vdpa/virtio_pci/vp_vdpa.c > > > index 281287fae89f..4bf1ab637d32 100644 > > > --- a/drivers/vdpa/virtio_pci/vp_vdpa.c > > > +++ b/drivers/vdpa/virtio_pci/vp_vdpa.c > > > @@ -478,7 +478,7 @@ static int vp_vdpa_dev_add(struct vdpa_mgmt_dev > > *v_mdev, const char *name, > > > struct device *dev = &pdev->dev; > > > struct vp_vdpa *vp_vdpa = NULL; > > > u64 device_features; > > > - int ret, i; > > > + int ret, i, queues; > > > > > > vp_vdpa = vdpa_alloc_device(struct vp_vdpa, vdpa, > > > dev, &vp_vdpa_ops, 1, 1, name, > > false); > > > @@ -491,7 +491,14 @@ static int vp_vdpa_dev_add(struct vdpa_mgmt_dev > > *v_mdev, const char *name, > > > vp_vdpa_mgtdev->vp_vdpa = vp_vdpa; > > > > > > vp_vdpa->vdpa.dma_dev = &pdev->dev; > > > - vp_vdpa->queues = vp_modern_get_num_queues(mdev); > > > + queues = vp_modern_get_num_queues(mdev); > > > + if (add_config->mask & > > BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MAX_VQP)) { > > > + if (add_config->net.max_vq_pairs > queues / 2) > > > + return -EINVAL; > > > + queues = min_t(u32, queues, 2 * > > add_config->net.max_vq_pairs); > > > > Looks like you want to mediate the max_vqp here, but what happens: > > > > 1) harware have 4 queue paris > > 2) vp_vdpa cap it into 2 queue pairs > > 3) guest may still try to enable 4 queue paris > > > Yes,you are right,this situation can occur. > > For 3), the kernel needs to mediate the control virtqueue which seems not > > easy. > > > > How about simply starting from failing if the provisioned #qp is not > > equal to the one that hardware has? > Ok,You mean we just check it in vp_vdpa or check it in all other vdpa net > drivers? vp_vdpa only, since in some other kind of parents, #qps could be provisioned. Thanks > > > > Thanks > > > > > + } > > > + > > > + vp_vdpa->queues = queues; > > > vp_vdpa->mdev = mdev; > > > > > > device_features = vp_modern_get_features(mdev); > > > -- > > > 2.25.1 > > > > ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH net-next 2/5] virtio_net: Add page_pool support to improve performance
On Thu, Jun 8, 2023 at 4:17 AM Michael S. Tsirkin wrote: > > On Wed, Jun 07, 2023 at 05:08:59PM +0800, Liang Chen wrote: > > On Tue, May 30, 2023 at 9:19 AM Liang Chen > > wrote: > > > > > > On Mon, May 29, 2023 at 5:55 PM Michael S. Tsirkin > > > wrote: > > > > > > > > On Mon, May 29, 2023 at 03:27:56PM +0800, Liang Chen wrote: > > > > > On Sun, May 28, 2023 at 2:20 PM Michael S. Tsirkin > > > > > wrote: > > > > > > > > > > > > On Fri, May 26, 2023 at 01:46:18PM +0800, Liang Chen wrote: > > > > > > > The implementation at the moment uses one page per packet in both > > > > > > > the > > > > > > > normal and XDP path. In addition, introducing a module parameter > > > > > > > to enable > > > > > > > or disable the usage of page pool (disabled by default). > > > > > > > > > > > > > > In single-core vm testing environments, it gives a modest > > > > > > > performance gain > > > > > > > in the normal path. > > > > > > > Upstream codebase: 47.5 Gbits/sec > > > > > > > Upstream codebase + page_pool support: 50.2 Gbits/sec > > > > > > > > > > > > > > In multi-core vm testing environments, The most significant > > > > > > > performance > > > > > > > gain is observed in XDP cpumap: > > > > > > > Upstream codebase: 1.38 Gbits/sec > > > > > > > Upstream codebase + page_pool support: 9.74 Gbits/sec > > > > > > > > > > > > > > With this foundation, we can further integrate page pool > > > > > > > fragmentation and > > > > > > > DMA map/unmap support. > > > > > > > > > > > > > > Signed-off-by: Liang Chen > > > > > > > > > > > > Why off by default? > > > > > > I am guessing it sometimes has performance costs too? > > > > > > > > > > > > > > > > > > What happens if we use page pool for big mode too? > > > > > > The less modes we have the better... > > > > > > > > > > > > > > > > > > > > > > Sure, now I believe it makes sense to enable it by default. When the > > > > > packet size is very small, it reduces the likelihood of skb > > > > > coalescing. But such cases are rare. > > > > > > > > small packets are rare? These workloads are easy to create actually. > > > > Pls try and include benchmark with small packet size. > > > > > > > > > > Sure, Thanks! > > > > Before going ahead and posting v2 patch, I would like to hear more > > advice for the cases of small packets. I have done more performance > > benchmark with small packets since then. Here is a list of iperf > > output, > > > > With PP and PP fragmenting: > > 256K: [ 5] 505.00-510.00 sec 1.34 GBytes 2.31 Gbits/sec0144 > > KBytes > > 1K: [ 5] 30.00-35.00 sec 4.63 GBytes 7.95 Gbits/sec0 > > 223 KBytes > > 2K: [ 5] 65.00-70.00 sec 8.33 GBytes 14.3 Gbits/sec0 > > 324 KBytes > > 4K: [ 5] 30.00-35.00 sec 13.3 GBytes 22.8 Gbits/sec0 > > 1.08 MBytes > > 8K: [ 5] 50.00-55.00 sec 18.9 GBytes 32.4 Gbits/sec0 > > 744 KBytes > > 16K: [ 5] 25.00-30.00 sec 24.6 GBytes 42.3 Gbits/sec0963 > > KBytes > > 32K: [ 5] 45.00-50.00 sec 29.8 GBytes 51.2 Gbits/sec0 1.25 > > MBytes > > 64K: [ 5] 35.00-40.00 sec 34.0 GBytes 58.4 Gbits/sec0 1.70 > > MBytes > > 128K: [ 5] 45.00-50.00 sec 36.7 GBytes 63.1 Gbits/sec0 4.26 > > MBytes > > 256K: [ 5] 30.00-35.00 sec 40.0 GBytes 68.8 Gbits/sec0 3.20 > > MBytes Note that virtio-net driver is lacking things like BQL and others, so it might suffer from buffer bloat for TCP performance. Would you mind to measure with e.g using testpmd on the vhost to see the rx PPS? 
> > > > Without PP: > > 256: [ 5] 680.00-685.00 sec 1.57 GBytes 2.69 Gbits/sec0359 > > KBytes > > 1K: [ 5] 75.00-80.00 sec 5.47 GBytes 9.40 Gbits/sec0730 > > KBytes > > 2K: [ 5] 65.00-70.00 sec 9.46 GBytes 16.2 Gbits/sec0 1.99 > > MBytes > > 4K: [ 5] 30.00-35.00 sec 14.5 GBytes 25.0 Gbits/sec0 1.20 > > MBytes > > 8K: [ 5] 45.00-50.00 sec 19.9 GBytes 34.1 Gbits/sec0 1.72 > > MBytes > > 16K:[ 5] 5.00-10.00 sec 23.8 GBytes 40.9 Gbits/sec0 2.90 > > MBytes > > 32K:[ 5] 15.00-20.00 sec 28.0 GBytes 48.1 Gbits/sec0 3.03 > > MBytes > > 64K:[ 5] 60.00-65.00 sec 31.8 GBytes 54.6 Gbits/sec0 3.05 > > MBytes > > 128K: [ 5] 45.00-50.00 sec 33.0 GBytes 56.6 Gbits/sec1 3.03 > > MBytes > > 256K: [ 5] 25.00-30.00 sec 34.7 GBytes 59.6 Gbits/sec0 3.11 > > MBytes > > > > > > The major factor contributing to the performance drop is the reduction > > of skb coalescing. Additionally, without the page pool, small packets > > can still benefit from the allocation of 8 continuous pages by > > breaking them down into smaller pieces. This effectively reduces the > > frequency of page allocation from the buddy system. For instance, the > > arrival of 32 1K packets only triggers one alloc_page call. Therefore, > > the benefits of using a page pool are limited in such cases. I wonder if we can imp
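For readers unfamiliar with the API being benchmarked here, a minimal page_pool setup of the kind this series adds might look as follows (a sketch only: the helper, pool sizing and device argument are illustrative, not the series' actual integration):

	#include <net/page_pool.h>

	/* 'dev' would be the device used for DMA mapping (hypothetical helper) */
	static struct page_pool *rq_create_page_pool(struct device *dev)
	{
		struct page_pool_params pp_params = {
			.order     = 0,             /* one page per packet, as here */
			.pool_size = 256,           /* roughly ring-sized recycling cache */
			.nid       = NUMA_NO_NODE,
			.dev       = dev,
		};

		return page_pool_create(&pp_params);  /* ERR_PTR() on failure */
	}

	/* RX fill path:  page = page_pool_dev_alloc_pages(pool);
	 * recycle path:  page_pool_put_full_page(pool, page, false); */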
Re: [PATCH] vhost-vdpa: filter VIRTIO_F_RING_PACKED feature
On Wed, Jun 7, 2023 at 5:43 PM Michael S. Tsirkin wrote: > > On Wed, Jun 07, 2023 at 10:39:15AM +0200, Stefano Garzarella wrote: > > On Tue, Jun 6, 2023 at 2:58 PM Michael S. Tsirkin wrote: > > > > > > On Tue, Jun 06, 2023 at 09:29:22AM +0800, Jason Wang wrote: > > > > On Mon, Jun 5, 2023 at 10:58 PM Stefano Garzarella > > > > wrote: > > > > > > > > > > On Mon, Jun 05, 2023 at 09:54:57AM -0400, Michael S. Tsirkin wrote: > > > > > >On Mon, Jun 05, 2023 at 03:30:35PM +0200, Stefano Garzarella wrote: > > > > > >> On Mon, Jun 05, 2023 at 09:00:25AM -0400, Michael S. Tsirkin wrote: > > > > > >> > On Mon, Jun 05, 2023 at 02:54:20PM +0200, Stefano Garzarella > > > > > >> > wrote: > > > > > >> > > On Mon, Jun 05, 2023 at 08:41:54AM -0400, Michael S. Tsirkin > > > > > >> > > wrote: > > > > > >> > > > On Mon, Jun 05, 2023 at 01:06:44PM +0200, Stefano Garzarella > > > > > >> > > > wrote: > > > > > >> > > > > vhost-vdpa IOCTLs (eg. VHOST_GET_VRING_BASE, > > > > > >> > > > > VHOST_SET_VRING_BASE) > > > > > >> > > > > don't support packed virtqueue well yet, so let's filter > > > > > >> > > > > the > > > > > >> > > > > VIRTIO_F_RING_PACKED feature for now in > > > > > >> > > > > vhost_vdpa_get_features(). > > > > > >> > > > > > > > > > >> > > > > This way, even if the device supports it, we don't risk it > > > > > >> > > > > being > > > > > >> > > > > negotiated, then the VMM is unable to set the vring state > > > > > >> > > > > properly. > > > > > >> > > > > > > > > > >> > > > > Fixes: 4c8cf31885f6 ("vhost: introduce vDPA-based backend") > > > > > >> > > > > Cc: sta...@vger.kernel.org > > > > > >> > > > > Signed-off-by: Stefano Garzarella > > > > > >> > > > > --- > > > > > >> > > > > > > > > > >> > > > > Notes: > > > > > >> > > > > This patch should be applied before the "[PATCH v2 > > > > > >> > > > > 0/3] vhost_vdpa: > > > > > >> > > > > better PACKED support" series [1] and backported in > > > > > >> > > > > stable branches. > > > > > >> > > > > > > > > > >> > > > > We can revert it when we are sure that everything is > > > > > >> > > > > working with > > > > > >> > > > > packed virtqueues. > > > > > >> > > > > > > > > > >> > > > > Thanks, > > > > > >> > > > > Stefano > > > > > >> > > > > > > > > > >> > > > > [1] > > > > > >> > > > > https://lore.kernel.org/virtualization/20230424225031.18947-1-shannon.nel...@amd.com/ > > > > > >> > > > > > > > > >> > > > I'm a bit lost here. So why am I merging "better PACKED > > > > > >> > > > support" then? > > > > > >> > > > > > > > >> > > To really support packed virtqueue with vhost-vdpa, at that > > > > > >> > > point we would > > > > > >> > > also have to revert this patch. > > > > > >> > > > > > > > >> > > I wasn't sure if you wanted to queue the series for this merge > > > > > >> > > window. > > > > > >> > > In that case do you think it is better to send this patch only > > > > > >> > > for stable > > > > > >> > > branches? > > > > > >> > > > Does this patch make them a NOP? > > > > > >> > > > > > > > >> > > Yep, after applying the "better PACKED support" series and > > > > &g
Re: [PATCH] vhost-vdpa: filter VIRTIO_F_RING_PACKED feature
On Thu, Jun 8, 2023 at 2:03 PM Michael S. Tsirkin wrote: > > On Thu, Jun 08, 2023 at 08:42:15AM +0800, Jason Wang wrote: > > On Wed, Jun 7, 2023 at 5:43 PM Michael S. Tsirkin wrote: > > > > > > On Wed, Jun 07, 2023 at 10:39:15AM +0200, Stefano Garzarella wrote: > > > > On Tue, Jun 6, 2023 at 2:58 PM Michael S. Tsirkin > > > > wrote: > > > > > > > > > > On Tue, Jun 06, 2023 at 09:29:22AM +0800, Jason Wang wrote: > > > > > > On Mon, Jun 5, 2023 at 10:58 PM Stefano Garzarella > > > > > > wrote: > > > > > > > > > > > > > > On Mon, Jun 05, 2023 at 09:54:57AM -0400, Michael S. Tsirkin > > > > > > > wrote: > > > > > > > >On Mon, Jun 05, 2023 at 03:30:35PM +0200, Stefano Garzarella > > > > > > > >wrote: > > > > > > > >> On Mon, Jun 05, 2023 at 09:00:25AM -0400, Michael S. Tsirkin > > > > > > > >> wrote: > > > > > > > >> > On Mon, Jun 05, 2023 at 02:54:20PM +0200, Stefano Garzarella > > > > > > > >> > wrote: > > > > > > > >> > > On Mon, Jun 05, 2023 at 08:41:54AM -0400, Michael S. > > > > > > > >> > > Tsirkin wrote: > > > > > > > >> > > > On Mon, Jun 05, 2023 at 01:06:44PM +0200, Stefano > > > > > > > >> > > > Garzarella wrote: > > > > > > > >> > > > > vhost-vdpa IOCTLs (eg. VHOST_GET_VRING_BASE, > > > > > > > >> > > > > VHOST_SET_VRING_BASE) > > > > > > > >> > > > > don't support packed virtqueue well yet, so let's > > > > > > > >> > > > > filter the > > > > > > > >> > > > > VIRTIO_F_RING_PACKED feature for now in > > > > > > > >> > > > > vhost_vdpa_get_features(). > > > > > > > >> > > > > > > > > > > > >> > > > > This way, even if the device supports it, we don't > > > > > > > >> > > > > risk it being > > > > > > > >> > > > > negotiated, then the VMM is unable to set the vring > > > > > > > >> > > > > state properly. > > > > > > > >> > > > > > > > > > > > >> > > > > Fixes: 4c8cf31885f6 ("vhost: introduce vDPA-based > > > > > > > >> > > > > backend") > > > > > > > >> > > > > Cc: sta...@vger.kernel.org > > > > > > > >> > > > > Signed-off-by: Stefano Garzarella > > > > > > > >> > > > > --- > > > > > > > >> > > > > > > > > > > > >> > > > > Notes: > > > > > > > >> > > > > This patch should be applied before the "[PATCH v2 > > > > > > > >> > > > > 0/3] vhost_vdpa: > > > > > > > >> > > > > better PACKED support" series [1] and backported > > > > > > > >> > > > > in stable branches. > > > > > > > >> > > > > > > > > > > > >> > > > > We can revert it when we are sure that everything > > > > > > > >> > > > > is working with > > > > > > > >> > > > > packed virtqueues. > > > > > > > >> > > > > > > > > > > > >> > > > > Thanks, > > > > > > > >> > > > > Stefano > > > > > > > >> > > > > > > > > > > > >> > > > > [1] > > > > > > > >> > > > > https://lore.kernel.org/virtualization/20230424225031.18947-1-shannon.nel...@amd.com/ > > > > > > > >> > > > > > > > > > > >> > > > I'm a bit lost here. So why am I merging "better PACKED > > > > > > > >> > > > support" then? > > > > > > > >> > > > > > > > > > >> > > To really support pa
Re: [PATCH] vhost-vdpa: filter VIRTIO_F_RING_PACKED feature
On Thu, Jun 8, 2023 at 4:00 PM Stefano Garzarella wrote: > > On Thu, Jun 08, 2023 at 03:46:00PM +0800, Jason Wang wrote: > > [...] > > >> > > > > I have a question though, what if down the road there > >> > > > > is a new feature that needs more changes? It will be > >> > > > > broken too just like PACKED no? > >> > > > > Shouldn't vdpa have an allowlist of features it knows how > >> > > > > to support? > >> > > > > >> > > > It looks like we had it, but we took it out (by the way, we were > >> > > > enabling packed even though we didn't support it): > >> > > > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6234f80574d7569444d8718355fa2838e92b158b > >> > > > > >> > > > The only problem I see is that for each new feature we have to modify > >> > > > the kernel. > >> > > > Could we have new features that don't require handling by vhost-vdpa? > >> > > > > >> > > > Thanks, > >> > > > Stefano > >> > > > >> > > Jason what do you say to reverting this? > >> > > >> > I may miss something but I don't see any problem with vDPA core. > >> > > >> > It's the duty of the parents to advertise the features it has. For > >> > example, > >> > > >> > 1) If some kernel version that is packed is not supported via > >> > set_vq_state, parents should not advertise PACKED features in this > >> > case. > >> > 2) If the kernel has support packed set_vq_state(), but it's emulated > >> > cvq doesn't support, parents should not advertise PACKED as well > >> > > >> > If a parent violates the above 2, it looks like a bug of the parents. > >> > > >> > Thanks > >> > >> Yes but what about vhost_vdpa? Talking about that not the core. > > > >Not sure it's a good idea to workaround parent bugs via vhost-vDPA. > > Sorry, I'm getting lost... > We were talking about the fact that vhost-vdpa doesn't handle > SET_VRING_BASE/GET_VRING_BASE ioctls well for packed virtqueue before > that series [1], no? > > The parents seem okay, but maybe I missed a few things. > > [1] > https://lore.kernel.org/virtualization/20230424225031.18947-1-shannon.nel...@amd.com/ Yes, more below. > > > > >> Should that not have a whitelist of features > >> since it interprets ioctls differently depending on this? > > > >If there's a bug, it might only matter the following setup: > > > >SET_VRING_BASE/GET_VRING_BASE + VDUSE. > > > >This seems to be broken since VDUSE was introduced. If we really want > >to backport something, it could be a fix to filter out PACKED in > >VDUSE? > > mmm it doesn't seem to be a problem in VDUSE, but in vhost-vdpa. > I think VDUSE works fine with packed virtqueue using virtio-vdpa > (I haven't tried), so why should we filter PACKED in VDUSE? I don't think we need any filtering since: PACKED features has been advertised to userspace via uAPI since 6234f80574d7569444d8718355fa2838e92b158b. Once we relax in uAPI, it would be very hard to restrict it again. For the userspace that tries to negotiate PACKED: 1) if it doesn't use SET_VRING_BASE/GET_VRING_BASE, everything works well 2) if it uses SET_VRING_BASE/GET_VRING_BASE. it might fail or break silently If we backport the fixes to -stable, we may break the application at least in the case 1). Thanks > > Thanks, > Stefano > ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH] vhost-vdpa: filter VIRTIO_F_RING_PACKED feature
On Thu, Jun 8, 2023 at 5:21 PM Stefano Garzarella wrote: > > On Thu, Jun 08, 2023 at 05:00:00PM +0800, Jason Wang wrote: > >On Thu, Jun 8, 2023 at 4:00 PM Stefano Garzarella > >wrote: > >> > >> On Thu, Jun 08, 2023 at 03:46:00PM +0800, Jason Wang wrote: > >> > >> [...] > >> > >> >> > > > > I have a question though, what if down the road there > >> >> > > > > is a new feature that needs more changes? It will be > >> >> > > > > broken too just like PACKED no? > >> >> > > > > Shouldn't vdpa have an allowlist of features it knows how > >> >> > > > > to support? > >> >> > > > > >> >> > > > It looks like we had it, but we took it out (by the way, we were > >> >> > > > enabling packed even though we didn't support it): > >> >> > > > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6234f80574d7569444d8718355fa2838e92b158b > >> >> > > > > >> >> > > > The only problem I see is that for each new feature we have to > >> >> > > > modify > >> >> > > > the kernel. > >> >> > > > Could we have new features that don't require handling by > >> >> > > > vhost-vdpa? > >> >> > > > > >> >> > > > Thanks, > >> >> > > > Stefano > >> >> > > > >> >> > > Jason what do you say to reverting this? > >> >> > > >> >> > I may miss something but I don't see any problem with vDPA core. > >> >> > > >> >> > It's the duty of the parents to advertise the features it has. For > >> >> > example, > >> >> > > >> >> > 1) If some kernel version that is packed is not supported via > >> >> > set_vq_state, parents should not advertise PACKED features in this > >> >> > case. > >> >> > 2) If the kernel has support packed set_vq_state(), but it's emulated > >> >> > cvq doesn't support, parents should not advertise PACKED as well > >> >> > > >> >> > If a parent violates the above 2, it looks like a bug of the parents. > >> >> > > >> >> > Thanks > >> >> > >> >> Yes but what about vhost_vdpa? Talking about that not the core. > >> > > >> >Not sure it's a good idea to workaround parent bugs via vhost-vDPA. > >> > >> Sorry, I'm getting lost... > >> We were talking about the fact that vhost-vdpa doesn't handle > >> SET_VRING_BASE/GET_VRING_BASE ioctls well for packed virtqueue before > >> that series [1], no? > >> > >> The parents seem okay, but maybe I missed a few things. > >> > >> [1] > >> https://lore.kernel.org/virtualization/20230424225031.18947-1-shannon.nel...@amd.com/ > > > >Yes, more below. > > > >> > >> > > >> >> Should that not have a whitelist of features > >> >> since it interprets ioctls differently depending on this? > >> > > >> >If there's a bug, it might only matter the following setup: > >> > > >> >SET_VRING_BASE/GET_VRING_BASE + VDUSE. > >> > > >> >This seems to be broken since VDUSE was introduced. If we really want > >> >to backport something, it could be a fix to filter out PACKED in > >> >VDUSE? > >> > >> mmm it doesn't seem to be a problem in VDUSE, but in vhost-vdpa. > >> I think VDUSE works fine with packed virtqueue using virtio-vdpa > >> (I haven't tried), so why should we filter PACKED in VDUSE? > > > >I don't think we need any filtering since: > > > >PACKED features has been advertised to userspace via uAPI since > >6234f80574d7569444d8718355fa2838e92b158b. Once we relax in uAPI, it > >would be very hard to restrict it again. For the userspace that tries > >to negotiate PACKED: > > > >1) if it doesn't use SET_VRING_BASE/GET_VRING_BASE, everything works well > >2) if it uses SET_VRING_BASE/GET_VRING_BASE. 
it might fail or break silently > > > >If we backport the fixes to -stable, we may break the application at > >least in the case 1). > > Okay, I see now, thanks for the details! > > Maybe instead of "break silently", we can return an explicit error for > SET_VRING_BASE/GET_VRING_BASE in stable branches. > But if there are not many cases, we can leave it like that. A second thought, if we need to do something for stable. is it better if we just backport Shannon's series to stable? > > I was just concerned about how does the user space understand that it > can use SET_VRING_BASE/GET_VRING_BASE for PACKED virtqueues in a given > kernel or not. My understanding is that if packed is advertised, the application should assume SET/GET_VRING_BASE work. Thanks > > Thanks, > Stefano > ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
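The reason GET/SET_VRING_BASE need the "better PACKED support" series at all is that, for packed rings, the single 32-bit vhost_vring_state.num must carry both ring indexes and both wrap counters; a sketch of the encoding, per that series' description (not verbatim kernel code):

	/* bits  0..14  last_avail_idx
	 * bit      15  last_avail_wrap_counter
	 * bits 16..30  last_used_idx
	 * bit      31  last_used_wrap_counter
	 */
	u32 num = (last_avail_idx & 0x7fff) |
		  ((u32)last_avail_wrap << 15) |
		  ((u32)(last_used_idx & 0x7fff) << 16) |
		  ((u32)last_used_wrap << 31);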
Re: [PATCH] vhost-vdpa: filter VIRTIO_F_RING_PACKED feature
On Thu, Jun 8, 2023 at 10:23 PM Michael S. Tsirkin wrote: > > On Thu, Jun 08, 2023 at 05:29:58PM +0800, Jason Wang wrote: > > On Thu, Jun 8, 2023 at 5:21 PM Stefano Garzarella > > wrote: > > > > > > On Thu, Jun 08, 2023 at 05:00:00PM +0800, Jason Wang wrote: > > > >On Thu, Jun 8, 2023 at 4:00 PM Stefano Garzarella > > > >wrote: > > > >> > > > >> On Thu, Jun 08, 2023 at 03:46:00PM +0800, Jason Wang wrote: > > > >> > > > >> [...] > > > >> > > > >> >> > > > > I have a question though, what if down the road there > > > >> >> > > > > is a new feature that needs more changes? It will be > > > >> >> > > > > broken too just like PACKED no? > > > >> >> > > > > Shouldn't vdpa have an allowlist of features it knows how > > > >> >> > > > > to support? > > > >> >> > > > > > > >> >> > > > It looks like we had it, but we took it out (by the way, we > > > >> >> > > > were > > > >> >> > > > enabling packed even though we didn't support it): > > > >> >> > > > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6234f80574d7569444d8718355fa2838e92b158b > > > >> >> > > > > > > >> >> > > > The only problem I see is that for each new feature we have > > > >> >> > > > to modify > > > >> >> > > > the kernel. > > > >> >> > > > Could we have new features that don't require handling by > > > >> >> > > > vhost-vdpa? > > > >> >> > > > > > > >> >> > > > Thanks, > > > >> >> > > > Stefano > > > >> >> > > > > > >> >> > > Jason what do you say to reverting this? > > > >> >> > > > > >> >> > I may miss something but I don't see any problem with vDPA core. > > > >> >> > > > > >> >> > It's the duty of the parents to advertise the features it has. > > > >> >> > For example, > > > >> >> > > > > >> >> > 1) If some kernel version that is packed is not supported via > > > >> >> > set_vq_state, parents should not advertise PACKED features in this > > > >> >> > case. > > > >> >> > 2) If the kernel has support packed set_vq_state(), but it's > > > >> >> > emulated > > > >> >> > cvq doesn't support, parents should not advertise PACKED as well > > > >> >> > > > > >> >> > If a parent violates the above 2, it looks like a bug of the > > > >> >> > parents. > > > >> >> > > > > >> >> > Thanks > > > >> >> > > > >> >> Yes but what about vhost_vdpa? Talking about that not the core. > > > >> > > > > >> >Not sure it's a good idea to workaround parent bugs via vhost-vDPA. > > > >> > > > >> Sorry, I'm getting lost... > > > >> We were talking about the fact that vhost-vdpa doesn't handle > > > >> SET_VRING_BASE/GET_VRING_BASE ioctls well for packed virtqueue before > > > >> that series [1], no? > > > >> > > > >> The parents seem okay, but maybe I missed a few things. > > > >> > > > >> [1] > > > >> https://lore.kernel.org/virtualization/20230424225031.18947-1-shannon.nel...@amd.com/ > > > > > > > >Yes, more below. > > > > > > > >> > > > >> > > > > >> >> Should that not have a whitelist of features > > > >> >> since it interprets ioctls differently depending on this? > > > >> > > > > >> >If there's a bug, it might only matter the following setup: > > > >> > > > > >> >SET_VRING_BASE/GET_VRING_BASE + VDUSE. > > > >> > > > > >> >This seems to be broken since VDUSE was introduced. If we really want > > > >> >to backport something, it could be a fix to filter out P
Re: [PATCH v2] vdpa/vp_vdpa: Check queue number of vdpa device from add_config
On Fri, Jun 9, 2023 at 3:45 AM Michael S. Tsirkin wrote: > > On Thu, Jun 08, 2023 at 05:01:24PM +0800, Angus Chen wrote: > > When add virtio_pci vdpa device,check the vqs number of device cap > > and max_vq_pairs from add_config. > > Simply starting from failing if the provisioned #qp is not > > equal to the one that hardware has. I think I kind of agree with Michael, I don't see any obvious advantages to allow usersapce to configure max_vqp if it can't be provisioned dynamically. What's wrong if we just stick the current approach that doesn't accept max_vqp? A better approach is to tweak the vdpa tool to display the legal attributes that can be provisioned. > > > > Signed-off-by: Angus Chen > > I am not sure about this one. How does userspace know > which values are legal? vdpa mgmtdev show can gives hints like: max_supported_vqs 3 > > If there's no way then maybe we should just cap the value > to what device can support but otherwise keep the device > working. This seems conflict to how other drivers (like mlx5) did: if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MAX_VQP)) { if (add_config->net.max_vq_pairs > max_vqs / 2) return -EINVAL; max_vqs = min_t(u32, max_vqs, 2 * add_config->net.max_vq_pairs); } else { max_vqs = 2; } Thanks > > > --- > > v1: Use max_vqs from add_config > > v2: Just return fail if max_vqs from add_config is not same as device > > cap. Suggested by jason. > > > > drivers/vdpa/virtio_pci/vp_vdpa.c | 35 ++- > > 1 file changed, 21 insertions(+), 14 deletions(-) > > > > diff --git a/drivers/vdpa/virtio_pci/vp_vdpa.c > > b/drivers/vdpa/virtio_pci/vp_vdpa.c > > index 281287fae89f..c1fb6963da12 100644 > > --- a/drivers/vdpa/virtio_pci/vp_vdpa.c > > +++ b/drivers/vdpa/virtio_pci/vp_vdpa.c > > @@ -480,32 +480,39 @@ static int vp_vdpa_dev_add(struct vdpa_mgmt_dev > > *v_mdev, const char *name, > > u64 device_features; > > int ret, i; > > > > - vp_vdpa = vdpa_alloc_device(struct vp_vdpa, vdpa, > > - dev, &vp_vdpa_ops, 1, 1, name, false); > > - > > - if (IS_ERR(vp_vdpa)) { > > - dev_err(dev, "vp_vdpa: Failed to allocate vDPA structure\n"); > > - return PTR_ERR(vp_vdpa); > > + if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MAX_VQP)) { > > + if (add_config->net.max_vq_pairs != > > (v_mdev->max_supported_vqs / 2)) { > > + dev_err(&pdev->dev, "max vqs 0x%x should be equal to > > 0x%x which device has\n", > > + add_config->net.max_vq_pairs*2, > > v_mdev->max_supported_vqs); > > + return -EINVAL; > > + } > > } > > > > - vp_vdpa_mgtdev->vp_vdpa = vp_vdpa; > > - > > - vp_vdpa->vdpa.dma_dev = &pdev->dev; > > - vp_vdpa->queues = vp_modern_get_num_queues(mdev); > > - vp_vdpa->mdev = mdev; > > - > > device_features = vp_modern_get_features(mdev); > > if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES)) { > > if (add_config->device_features & ~device_features) { > > - ret = -EINVAL; > > dev_err(&pdev->dev, "Try to provision features " > > "that are not supported by the device: " > > "device_features 0x%llx provisioned 0x%llx\n", > > device_features, add_config->device_features); > > - goto err; > > + return -EINVAL; > > } > > device_features = add_config->device_features; > > } > > + > > + vp_vdpa = vdpa_alloc_device(struct vp_vdpa, vdpa, > > + dev, &vp_vdpa_ops, 1, 1, name, false); > > + > > + if (IS_ERR(vp_vdpa)) { > > + dev_err(dev, "vp_vdpa: Failed to allocate vDPA structure\n"); > > + return PTR_ERR(vp_vdpa); > > + } > > + > > + vp_vdpa_mgtdev->vp_vdpa = vp_vdpa; > > + > > + vp_vdpa->vdpa.dma_dev = &pdev->dev; > > + vp_vdpa->queues = v_mdev->max_supported_vqs; > > + 
vp_vdpa->mdev = mdev; > > vp_vdpa->device_features = device_features; > > > > ret = devm_add_action_or_reset(dev, vp_vdpa_free_irq_vectors, pdev); > > -- > > 2.25.1 > ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH] vdpa/mlx5: Support interrupt bypassing
On Thu, Jun 8, 2023 at 3:01 AM Dragos Tatulea wrote: > > From: Eli Cohen > > Add support for generation of interrupts from the device directly to the > VM to the VCPU thus avoiding the overhead on the host CPU. > > When supported, the driver will attempt to allocate vectors for each > data virtqueue. If a vector for a virtqueue cannot be provided it will > use the QP mode where notifications go through the driver. > > In addition, we add a shutdown callback to make sure allocated > interrupts are released in case of shutdown to allow clean shutdown. > > Signed-off-by: Eli Cohen > Signed-off-by: Saeed Mahameed Acked-by: Jason Wang Thanks > --- > drivers/vdpa/mlx5/net/mlx5_vnet.c | 165 -- > drivers/vdpa/mlx5/net/mlx5_vnet.h | 15 +++ > 2 files changed, 171 insertions(+), 9 deletions(-) > > diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c > b/drivers/vdpa/mlx5/net/mlx5_vnet.c > index 279ac6a558d2..9138ef2fb2c8 100644 > --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c > +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c > @@ -83,6 +83,7 @@ struct mlx5_vq_restore_info { > u64 driver_addr; > u16 avail_index; > u16 used_index; > + struct msi_map map; > bool ready; > bool restore; > }; > @@ -118,6 +119,7 @@ struct mlx5_vdpa_virtqueue { > u16 avail_idx; > u16 used_idx; > int fw_state; > + struct msi_map map; > > /* keep last in the struct */ > struct mlx5_vq_restore_info ri; > @@ -808,6 +810,13 @@ static bool counters_supported(const struct > mlx5_vdpa_dev *mvdev) >BIT_ULL(MLX5_OBJ_TYPE_VIRTIO_Q_COUNTERS); > } > > +static bool msix_mode_supported(struct mlx5_vdpa_dev *mvdev) > +{ > + return MLX5_CAP_DEV_VDPA_EMULATION(mvdev->mdev, event_mode) & > + (1 << MLX5_VIRTIO_Q_EVENT_MODE_MSIX_MODE) && > + pci_msix_can_alloc_dyn(mvdev->mdev->pdev); > +} > + > static int create_virtqueue(struct mlx5_vdpa_net *ndev, struct > mlx5_vdpa_virtqueue *mvq) > { > int inlen = MLX5_ST_SZ_BYTES(create_virtio_net_q_in); > @@ -849,9 +858,15 @@ static int create_virtqueue(struct mlx5_vdpa_net *ndev, > struct mlx5_vdpa_virtque > if (vq_is_tx(mvq->index)) > MLX5_SET(virtio_net_q_object, obj_context, tisn_or_qpn, > ndev->res.tisn); > > - MLX5_SET(virtio_q, vq_ctx, event_mode, > MLX5_VIRTIO_Q_EVENT_MODE_QP_MODE); > + if (mvq->map.virq) { > + MLX5_SET(virtio_q, vq_ctx, event_mode, > MLX5_VIRTIO_Q_EVENT_MODE_MSIX_MODE); > + MLX5_SET(virtio_q, vq_ctx, event_qpn_or_msix, mvq->map.index); > + } else { > + MLX5_SET(virtio_q, vq_ctx, event_mode, > MLX5_VIRTIO_Q_EVENT_MODE_QP_MODE); > + MLX5_SET(virtio_q, vq_ctx, event_qpn_or_msix, > mvq->fwqp.mqp.qpn); > + } > + > MLX5_SET(virtio_q, vq_ctx, queue_index, mvq->index); > - MLX5_SET(virtio_q, vq_ctx, event_qpn_or_msix, mvq->fwqp.mqp.qpn); > MLX5_SET(virtio_q, vq_ctx, queue_size, mvq->num_ent); > MLX5_SET(virtio_q, vq_ctx, virtio_version_1_0, > !!(ndev->mvdev.actual_features & > BIT_ULL(VIRTIO_F_VERSION_1))); > @@ -1194,6 +1209,56 @@ static void counter_set_dealloc(struct mlx5_vdpa_net > *ndev, struct mlx5_vdpa_vir > mlx5_vdpa_warn(&ndev->mvdev, "dealloc counter set 0x%x\n", > mvq->counter_set_id); > } > > +static irqreturn_t mlx5_vdpa_int_handler(int irq, void *priv) > +{ > + struct vdpa_callback *cb = priv; > + > + if (cb->callback) > + return cb->callback(cb->private); > + > + return IRQ_HANDLED; > +} > + > +static void alloc_vector(struct mlx5_vdpa_net *ndev, > +struct mlx5_vdpa_virtqueue *mvq) > +{ > + struct mlx5_vdpa_irq_pool *irqp = &ndev->irqp; > + struct mlx5_vdpa_irq_pool_entry *ent; > + int err; > + int i; > + > + for (i = 0; i < irqp->num_ent; i++) { > + ent = &irqp->entries[i]; > + if (!ent->used) 
{ > + snprintf(ent->name, MLX5_VDPA_IRQ_NAME_LEN, > "%s-vq-%d", > +dev_name(&ndev->mvdev.vdev.dev), mvq->index); > + ent->dev_id = &ndev->event_cbs[mvq->index]; > + err = request_irq(ent->map.virq, > mlx5_vdpa_int_handler, 0, > + ent->name, ent->dev_id); > + if (err) > +
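Stripped to its skeleton, the per-VQ vector path in this patch follows the kernel's dynamic MSI-X pattern (illustrative; the allocation call is the dynamic-MSI-X primitive that the pci_msix_can_alloc_dyn() probe check implies, and error unwinding is omitted):

	if (!msix_mode_supported(&ndev->mvdev))   /* wraps pci_msix_can_alloc_dyn() */
		return;                           /* stay in QP (driver-relay) mode */

	ent->map = pci_msix_alloc_irq_at(pdev, MSI_ANY_INDEX, NULL);
	if (ent->map.index < 0)
		return;                           /* fall back to QP mode for this VQ */

	err = request_irq(ent->map.virq, mlx5_vdpa_int_handler, 0,
			  ent->name, ent->dev_id);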
Re: [PATCH v2] vduse: fix NULL pointer dereference
On Fri, Jun 23, 2023 at 4:49 AM Maxime Coquelin wrote: > > vduse_vdpa_set_vq_affinity callback can be called > with NULL value as cpu_mask when deleting the vduse > device. > > This patch resets virtqueue's IRQ affinity mask value > to set all CPUs instead of dereferencing NULL cpu_mask. > > [ 4760.952149] BUG: kernel NULL pointer dereference, address: > [ 4760.959110] #PF: supervisor read access in kernel mode > [ 4760.964247] #PF: error_code(0x) - not-present page > [ 4760.969385] PGD 0 P4D 0 > [ 4760.971927] Oops: [#1] PREEMPT SMP PTI > [ 4760.976112] CPU: 13 PID: 2346 Comm: vdpa Not tainted 6.4.0-rc6+ #4 > [ 4760.982291] Hardware name: Dell Inc. PowerEdge R640/0W23H8, BIOS 2.8.1 > 06/26/2020 > [ 4760.989769] RIP: 0010:memcpy_orig+0xc5/0x130 > [ 4760.994049] Code: 16 f8 4c 89 07 4c 89 4f 08 4c 89 54 17 f0 4c 89 5c 17 f8 > c3 cc cc cc cc 66 66 2e 0f 1f 84 00 00 00 00 00 66 90 83 fa 08 72 1b <4c> 8b > 06 4c 8b 4c 16 f8 4c 89 07 4c 89 4c 17 f8 c3 cc cc cc cc 66 > [ 4761.012793] RSP: 0018:b1d565abb830 EFLAGS: 00010246 > [ 4761.018020] RAX: 9f4bf6b27898 RBX: 9f4be23969c0 RCX: > 9f4bcadf6400 > [ 4761.025152] RDX: 0008 RSI: RDI: > 9f4bf6b27898 > [ 4761.032286] RBP: R08: 0008 R09: > > [ 4761.039416] R10: R11: 0600 R12: > > [ 4761.046549] R13: R14: 0080 R15: > b1d565abbb10 > [ 4761.053680] FS: 7f64c2ec2740() GS:9f635f98() > knlGS: > [ 4761.061765] CS: 0010 DS: ES: CR0: 80050033 > [ 4761.067513] CR2: CR3: 001875270006 CR4: > 007706e0 > [ 4761.074645] DR0: DR1: DR2: > > [ 4761.081775] DR3: DR6: fffe0ff0 DR7: > 0400 > [ 4761.088909] PKRU: 5554 > [ 4761.091620] Call Trace: > [ 4761.094074] > [ 4761.096180] ? __die+0x1f/0x70 > [ 4761.099238] ? page_fault_oops+0x171/0x4f0 > [ 4761.103340] ? exc_page_fault+0x7b/0x180 > [ 4761.107265] ? asm_exc_page_fault+0x22/0x30 > [ 4761.111460] ? memcpy_orig+0xc5/0x130 > [ 4761.115126] vduse_vdpa_set_vq_affinity+0x3e/0x50 [vduse] > [ 4761.120533] virtnet_clean_affinity.part.0+0x3d/0x90 [virtio_net] > [ 4761.126635] remove_vq_common+0x1a4/0x250 [virtio_net] > [ 4761.131781] virtnet_remove+0x5d/0x70 [virtio_net] > [ 4761.136580] virtio_dev_remove+0x3a/0x90 > [ 4761.140509] device_release_driver_internal+0x19b/0x200 > [ 4761.145742] bus_remove_device+0xc2/0x130 > [ 4761.149755] device_del+0x158/0x3e0 > [ 4761.153245] ? kernfs_find_ns+0x35/0xc0 > [ 4761.157086] device_unregister+0x13/0x60 > [ 4761.161010] unregister_virtio_device+0x11/0x20 > [ 4761.165543] device_release_driver_internal+0x19b/0x200 > [ 4761.170770] bus_remove_device+0xc2/0x130 > [ 4761.174782] device_del+0x158/0x3e0 > [ 4761.178276] ? 
__pfx_vdpa_name_match+0x10/0x10 [vdpa] > [ 4761.183336] device_unregister+0x13/0x60 > [ 4761.187260] vdpa_nl_cmd_dev_del_set_doit+0x63/0xe0 [vdpa] > > Fixes: 28f6288eb63d ("vduse: Support set_vq_affinity callback") > Cc: xieyon...@bytedance.com > > Signed-off-by: Maxime Coquelin Acked-by: Jason Wang Thanks > --- > drivers/vdpa/vdpa_user/vduse_dev.c | 6 +- > 1 file changed, 5 insertions(+), 1 deletion(-) > > diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c > b/drivers/vdpa/vdpa_user/vduse_dev.c > index 5f5c21674fdc..0d84e6a9c3cc 100644 > --- a/drivers/vdpa/vdpa_user/vduse_dev.c > +++ b/drivers/vdpa/vdpa_user/vduse_dev.c > @@ -726,7 +726,11 @@ static int vduse_vdpa_set_vq_affinity(struct vdpa_device > *vdpa, u16 idx, > { > struct vduse_dev *dev = vdpa_to_vduse(vdpa); > > - cpumask_copy(&dev->vqs[idx]->irq_affinity, cpu_mask); > + if (cpu_mask) > + cpumask_copy(&dev->vqs[idx]->irq_affinity, cpu_mask); > + else > + cpumask_setall(&dev->vqs[idx]->irq_affinity); > + > return 0; > } > > -- > 2.41.0 > ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
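The NULL mask comes from the virtio core teardown path visible in the trace above; abbreviated from virtio_net.c, the caller looks like this (a condensed sketch, not the full function):

	static void virtnet_clean_affinity(struct virtnet_info *vi)
	{
		int i;

		if (vi->affinity_hint_set) {
			for (i = 0; i < vi->max_queue_pairs; i++) {
				/* NULL mask == "no affinity hint"; vdpa forwards
				 * it to the parent's set_vq_affinity callback */
				virtqueue_set_affinity(vi->rq[i].vq, NULL);
				virtqueue_set_affinity(vi->sq[i].vq, NULL);
			}
			vi->affinity_hint_set = false;
		}
	}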
Re: [PATCH vhost v10 00/10] virtio core prepares for AF_XDP
On Wed, Jun 21, 2023 at 2:43 PM Xuan Zhuo wrote: > > Hi Jason, > > Do you have a plan to review this? Just came back from vacation, will do this next week. Thanks > > Thanks.
Re: [PATCH v2] vdpa/vp_vdpa: Check queue number of vdpa device from add_config
On Thu, Jun 8, 2023 at 5:02 PM Angus Chen wrote: > > When add virtio_pci vdpa device,check the vqs number of device cap > and max_vq_pairs from add_config. > Simply starting from failing if the provisioned #qp is not > equal to the one that hardware has. > > Signed-off-by: Angus Chen > --- > v1: Use max_vqs from add_config > v2: Just return fail if max_vqs from add_config is not same as device > cap. Suggested by jason. > > drivers/vdpa/virtio_pci/vp_vdpa.c | 35 ++- > 1 file changed, 21 insertions(+), 14 deletions(-) > > diff --git a/drivers/vdpa/virtio_pci/vp_vdpa.c > b/drivers/vdpa/virtio_pci/vp_vdpa.c > index 281287fae89f..c1fb6963da12 100644 > --- a/drivers/vdpa/virtio_pci/vp_vdpa.c > +++ b/drivers/vdpa/virtio_pci/vp_vdpa.c > @@ -480,32 +480,39 @@ static int vp_vdpa_dev_add(struct vdpa_mgmt_dev > *v_mdev, const char *name, > u64 device_features; > int ret, i; > > - vp_vdpa = vdpa_alloc_device(struct vp_vdpa, vdpa, > - dev, &vp_vdpa_ops, 1, 1, name, false); > - > - if (IS_ERR(vp_vdpa)) { > - dev_err(dev, "vp_vdpa: Failed to allocate vDPA structure\n"); > - return PTR_ERR(vp_vdpa); > + if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MAX_VQP)) { > + if (add_config->net.max_vq_pairs != > (v_mdev->max_supported_vqs / 2)) { > + dev_err(&pdev->dev, "max vqs 0x%x should be equal to > 0x%x which device has\n", > + add_config->net.max_vq_pairs*2, > v_mdev->max_supported_vqs); > + return -EINVAL; > + } > } > > - vp_vdpa_mgtdev->vp_vdpa = vp_vdpa; > - > - vp_vdpa->vdpa.dma_dev = &pdev->dev; > - vp_vdpa->queues = vp_modern_get_num_queues(mdev); > - vp_vdpa->mdev = mdev; > - > device_features = vp_modern_get_features(mdev); > if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES)) { > if (add_config->device_features & ~device_features) { > - ret = -EINVAL; > dev_err(&pdev->dev, "Try to provision features " > "that are not supported by the device: " > "device_features 0x%llx provisioned 0x%llx\n", > device_features, add_config->device_features); > - goto err; > + return -EINVAL; > } > device_features = add_config->device_features; > } > + > + vp_vdpa = vdpa_alloc_device(struct vp_vdpa, vdpa, > + dev, &vp_vdpa_ops, 1, 1, name, false); > + > + if (IS_ERR(vp_vdpa)) { > + dev_err(dev, "vp_vdpa: Failed to allocate vDPA structure\n"); > + return PTR_ERR(vp_vdpa); > + } > + > + vp_vdpa_mgtdev->vp_vdpa = vp_vdpa; > + > + vp_vdpa->vdpa.dma_dev = &pdev->dev; > + vp_vdpa->queues = v_mdev->max_supported_vqs; Why bother with those changes? mgtdev->max_supported_vqs = vp_modern_get_num_queues(mdev); Thanks > + vp_vdpa->mdev = mdev; > vp_vdpa->device_features = device_features; > > ret = devm_add_action_or_reset(dev, vp_vdpa_free_irq_vectors, pdev); > -- > 2.25.1 > ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH V2 1/3] vDPA/ifcvf: dynamic allocate vq data stores
On Mon, Jun 12, 2023 at 3:14 PM Zhu Lingshan wrote: > > This commit dynamically allocates the data > stores for the virtqueues based on > virtio_pci_common_cfg.num_queues. While at it, it's better to allocate vring_lm_cfg as well and drop IFCVF_MAX_QUEUES. Thanks > > Signed-off-by: Zhu Lingshan > --- > drivers/vdpa/ifcvf/ifcvf_base.c | 3 +++ > drivers/vdpa/ifcvf/ifcvf_base.h | 2 +- > drivers/vdpa/ifcvf/ifcvf_main.c | 2 ++ > 3 files changed, 6 insertions(+), 1 deletion(-) > > diff --git a/drivers/vdpa/ifcvf/ifcvf_base.c b/drivers/vdpa/ifcvf/ifcvf_base.c > index 1b5da11f5403..f86495ace825 100644 > --- a/drivers/vdpa/ifcvf/ifcvf_base.c > +++ b/drivers/vdpa/ifcvf/ifcvf_base.c > @@ -134,6 +134,9 @@ int ifcvf_init_hw(struct ifcvf_hw *hw, struct pci_dev > *pdev) > } > > hw->nr_vring = vp_ioread16(&hw->common_cfg->num_queues); > + hw->vring = kzalloc(sizeof(struct vring_info) * hw->nr_vring, > GFP_KERNEL); > + if (!hw->vring) > + return -ENOMEM; > > for (i = 0; i < hw->nr_vring; i++) { > vp_iowrite16(i, &hw->common_cfg->queue_select); > diff --git a/drivers/vdpa/ifcvf/ifcvf_base.h b/drivers/vdpa/ifcvf/ifcvf_base.h > index 3110ffc50caf..fa797184056b 100644 > --- a/drivers/vdpa/ifcvf/ifcvf_base.h > +++ b/drivers/vdpa/ifcvf/ifcvf_base.h > @@ -74,7 +74,7 @@ struct ifcvf_hw { > u64 dev_features; > struct virtio_pci_common_cfg __iomem *common_cfg; > void __iomem *dev_cfg; > - struct vring_info vring[IFCVF_MAX_QUEUES]; > + struct vring_info *vring; > void __iomem * const *base; > char config_msix_name[256]; > struct vdpa_callback config_cb; > diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c > index 6e47ac2c669a..2af0de771b49 100644 > --- a/drivers/vdpa/ifcvf/ifcvf_main.c > +++ b/drivers/vdpa/ifcvf/ifcvf_main.c > @@ -830,6 +830,7 @@ static int ifcvf_probe(struct pci_dev *pdev, const struct > pci_device_id *id) > return 0; > > err: > + kfree(ifcvf_mgmt_dev->vf.vring); > kfree(ifcvf_mgmt_dev); > return ret; > } > @@ -840,6 +841,7 @@ static void ifcvf_remove(struct pci_dev *pdev) > > ifcvf_mgmt_dev = pci_get_drvdata(pdev); > vdpa_mgmtdev_unregister(&ifcvf_mgmt_dev->mdev); > + kfree(ifcvf_mgmt_dev->vf.vring); > kfree(ifcvf_mgmt_dev); > } > > -- > 2.39.1 > ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
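As an aside on the allocation style in this hunk, the idiomatic form for a zeroed array is kcalloc(), which also guards the size multiplication against overflow; an equivalent sketch of the same allocation:

	hw->vring = kcalloc(hw->nr_vring, sizeof(struct vring_info), GFP_KERNEL);
	if (!hw->vring)
		return -ENOMEM;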
Re: [PATCH V2 2/3] vDPA/ifcvf: detect and report max allowed vq size
On Mon, Jun 12, 2023 at 3:14 PM Zhu Lingshan wrote: > > Rather than a hardcode, this commit detects > and reports the max value of allowed size > of the virtqueues > > Signed-off-by: Zhu Lingshan Acked-by: Jason Wang Thanks > --- > drivers/vdpa/ifcvf/ifcvf_base.c | 31 +++ > drivers/vdpa/ifcvf/ifcvf_base.h | 2 +- > drivers/vdpa/ifcvf/ifcvf_main.c | 4 +++- > 3 files changed, 35 insertions(+), 2 deletions(-) > > diff --git a/drivers/vdpa/ifcvf/ifcvf_base.c b/drivers/vdpa/ifcvf/ifcvf_base.c > index f86495ace825..f4d7d96c4c86 100644 > --- a/drivers/vdpa/ifcvf/ifcvf_base.c > +++ b/drivers/vdpa/ifcvf/ifcvf_base.c > @@ -69,6 +69,37 @@ static int ifcvf_read_config_range(struct pci_dev *dev, > return 0; > } > > +static u16 ifcvf_get_vq_size(struct ifcvf_hw *hw, u16 qid) > +{ > + u16 queue_size; > + > + vp_iowrite16(qid, &hw->common_cfg->queue_select); > + queue_size = vp_ioread16(&hw->common_cfg->queue_size); > + > + return queue_size; > +} > + > +/* This function returns the max allowed safe size for > + * all virtqueues. It is the minimal size that can be > + * suppprted by all virtqueues. > + */ > +u16 ifcvf_get_max_vq_size(struct ifcvf_hw *hw) > +{ > + u16 queue_size, max_size, qid; > + > + max_size = ifcvf_get_vq_size(hw, 0); > + for (qid = 1; qid < hw->nr_vring; qid++) { > + queue_size = ifcvf_get_vq_size(hw, qid); > + /* 0 means the queue is unavailable */ > + if (!queue_size) > + continue; > + > + max_size = min(queue_size, max_size); > + } > + > + return max_size; > +} > + > int ifcvf_init_hw(struct ifcvf_hw *hw, struct pci_dev *pdev) > { > struct virtio_pci_cap cap; > diff --git a/drivers/vdpa/ifcvf/ifcvf_base.h b/drivers/vdpa/ifcvf/ifcvf_base.h > index fa797184056b..30935a95b672 100644 > --- a/drivers/vdpa/ifcvf/ifcvf_base.h > +++ b/drivers/vdpa/ifcvf/ifcvf_base.h > @@ -28,7 +28,6 @@ > #define IFCVF_MAX_QUEUES 17 > > #define IFCVF_QUEUE_ALIGNMENT PAGE_SIZE > -#define IFCVF_QUEUE_MAX32768 > #define IFCVF_PCI_MAX_RESOURCE 6 > > #define IFCVF_LM_CFG_SIZE 0x40 > @@ -138,4 +137,5 @@ bool ifcvf_get_vq_ready(struct ifcvf_hw *hw, u16 qid); > void ifcvf_set_vq_ready(struct ifcvf_hw *hw, u16 qid, bool ready); > void ifcvf_set_driver_features(struct ifcvf_hw *hw, u64 features); > u64 ifcvf_get_driver_features(struct ifcvf_hw *hw); > +u16 ifcvf_get_max_vq_size(struct ifcvf_hw *hw); > #endif /* _IFCVF_H_ */ > diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c > index 2af0de771b49..c3ece395caf7 100644 > --- a/drivers/vdpa/ifcvf/ifcvf_main.c > +++ b/drivers/vdpa/ifcvf/ifcvf_main.c > @@ -451,7 +451,9 @@ static int ifcvf_vdpa_reset(struct vdpa_device *vdpa_dev) > > static u16 ifcvf_vdpa_get_vq_num_max(struct vdpa_device *vdpa_dev) > { > - return IFCVF_QUEUE_MAX; > + struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev); > + > + return ifcvf_get_max_vq_size(vf); > } > > static int ifcvf_vdpa_get_vq_state(struct vdpa_device *vdpa_dev, u16 qid, > -- > 2.39.1 > ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
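A worked illustration of the scan above, with hypothetical per-queue sizes:

/* Hypothetical device: three queues reporting sizes {256, 0, 512}.
 * The zero-size queue is skipped as unavailable, so the result is
 * min(256, 512) == 256 -- the value ifcvf_vdpa_get_vq_num_max()
 * would then report to the vDPA core.
 */
u16 size[] = { 256, 0, 512 };
u16 qid, max = size[0];

for (qid = 1; qid < ARRAY_SIZE(size); qid++)
	if (size[qid])			/* 0 means unavailable */
		max = min(size[qid], max);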
Re: [PATCH V2 3/3] vDPA/ifcvf: implement new accessors for vq_state
On Mon, Jun 12, 2023 at 3:14 PM Zhu Lingshan wrote: > > This commit implements a better layout of the > live migration bar, therefore the accessors for virtqueue > state have been refactored. I guess the reason for F2000X is that it can report a #vq which is greater than IFCVF_MAX_QUEUES. If yes, let's explain it in the changelog. Thanks > > This commit also add a comment to the probing-ids list, > indicating this driver drives F2000X-PL virtio-net > > Signed-off-by: Zhu Lingshan > --- > drivers/vdpa/ifcvf/ifcvf_base.c | 21 + > drivers/vdpa/ifcvf/ifcvf_base.h | 25 + > drivers/vdpa/ifcvf/ifcvf_main.c | 4 +++- > 3 files changed, 17 insertions(+), 33 deletions(-) > > diff --git a/drivers/vdpa/ifcvf/ifcvf_base.c b/drivers/vdpa/ifcvf/ifcvf_base.c > index f4d7d96c4c86..060f837a4f9f 100644 > --- a/drivers/vdpa/ifcvf/ifcvf_base.c > +++ b/drivers/vdpa/ifcvf/ifcvf_base.c > @@ -328,30 +328,19 @@ void ifcvf_set_driver_features(struct ifcvf_hw *hw, u64 > features) > > u16 ifcvf_get_vq_state(struct ifcvf_hw *hw, u16 qid) > { > - struct ifcvf_lm_cfg __iomem *ifcvf_lm; > - void __iomem *avail_idx_addr; > + struct ifcvf_lm_cfg __iomem *lm_cfg = hw->lm_cfg; > u16 last_avail_idx; > - u32 q_pair_id; > > - ifcvf_lm = (struct ifcvf_lm_cfg __iomem *)hw->lm_cfg; > - q_pair_id = qid / 2; > - avail_idx_addr = &ifcvf_lm->vring_lm_cfg[q_pair_id].idx_addr[qid % 2]; > - last_avail_idx = vp_ioread16(avail_idx_addr); > + last_avail_idx = vp_ioread16(&lm_cfg->vq_state_region + qid * 2); > > return last_avail_idx; > } > > int ifcvf_set_vq_state(struct ifcvf_hw *hw, u16 qid, u16 num) > { > - struct ifcvf_lm_cfg __iomem *ifcvf_lm; > - void __iomem *avail_idx_addr; > - u32 q_pair_id; > - > - ifcvf_lm = (struct ifcvf_lm_cfg __iomem *)hw->lm_cfg; > - q_pair_id = qid / 2; > - avail_idx_addr = &ifcvf_lm->vring_lm_cfg[q_pair_id].idx_addr[qid % 2]; > - hw->vring[qid].last_avail_idx = num; > - vp_iowrite16(num, avail_idx_addr); > + struct ifcvf_lm_cfg __iomem *lm_cfg = hw->lm_cfg; > + > + vp_iowrite16(num, &lm_cfg->vq_state_region + qid * 2); > > return 0; > } > diff --git a/drivers/vdpa/ifcvf/ifcvf_base.h b/drivers/vdpa/ifcvf/ifcvf_base.h > index 30935a95b672..b57849c643f6 100644 > --- a/drivers/vdpa/ifcvf/ifcvf_base.h > +++ b/drivers/vdpa/ifcvf/ifcvf_base.h > @@ -24,14 +24,9 @@ > #define N3000_DEVICE_ID0x1041 > #define N3000_SUBSYS_DEVICE_ID 0x001A > > -/* Max 8 data queue pairs(16 queues) and one control vq for now. */ > -#define IFCVF_MAX_QUEUES 17 > - > #define IFCVF_QUEUE_ALIGNMENT PAGE_SIZE > #define IFCVF_PCI_MAX_RESOURCE 6 > > -#define IFCVF_LM_CFG_SIZE 0x40 > -#define IFCVF_LM_RING_STATE_OFFSET 0x20 > #define IFCVF_LM_BAR 4 > > #define IFCVF_ERR(pdev, fmt, ...) 
dev_err(&pdev->dev, fmt, > ##__VA_ARGS__) > @@ -54,10 +49,18 @@ struct vring_info { > char msix_name[256]; > }; > > +struct ifcvf_lm_cfg { > + __le64 control; > + __le64 status; > + __le64 lm_mem_log_start_addr; > + __le64 lm_mem_log_end_addr; > + __le16 vq_state_region; > +}; > + > struct ifcvf_hw { > u8 __iomem *isr; > /* Live migration */ > - u8 __iomem *lm_cfg; > + struct ifcvf_lm_cfg __iomem *lm_cfg; > /* Notification bar number */ > u8 notify_bar; > u8 msix_vector_status; > @@ -92,16 +95,6 @@ struct ifcvf_adapter { > struct ifcvf_hw *vf; > }; > > -struct ifcvf_vring_lm_cfg { > - u32 idx_addr[2]; > - u8 reserved[IFCVF_LM_CFG_SIZE - 8]; > -}; > - > -struct ifcvf_lm_cfg { > - u8 reserved[IFCVF_LM_RING_STATE_OFFSET]; > - struct ifcvf_vring_lm_cfg vring_lm_cfg[IFCVF_MAX_QUEUES]; > -}; > - > struct ifcvf_vdpa_mgmt_dev { > struct vdpa_mgmt_dev mdev; > struct ifcvf_hw vf; > diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c > index c3ece395caf7..e98fa8100f3c 100644 > --- a/drivers/vdpa/ifcvf/ifcvf_main.c > +++ b/drivers/vdpa/ifcvf/ifcvf_main.c > @@ -853,7 +853,9 @@ static struct pci_device_id ifcvf_pci_ids[] = { > N3000_DEVICE_ID, > PCI_VENDOR_ID_INTEL, > N3000_SUBSYS_DEVICE_ID) }, > - /* C5000X-PL network device */ > + /* C5000X-PL network device > +* F2000X-PL network device > +*/ > { PCI_DEVICE_SUB(PCI_VENDOR_ID_REDHAT_QUMRANET, > VIRTIO_TRANS_ID_NET, > PCI_VENDOR_ID_INTEL, > -- > 2.39.1 > ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
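For readers tracing the new live-migration layout, a sketch of the accessor it implies; the helper names are made up, but the arithmetic is copied from the quoted hunks. Note that vq_state_region is a __le16 member, so pointer arithmetic advances in 2-byte units:

/* "&lm_cfg->vq_state_region + qid * 2" advances qid * 2 __le16
 * elements, i.e. qid * 4 bytes, into the state region of the
 * live-migration BAR.
 */
static __le16 __iomem *vq_state_addr(struct ifcvf_lm_cfg __iomem *lm_cfg,
				     u16 qid)
{
	return &lm_cfg->vq_state_region + qid * 2;
}

/* get/set vq state then reduce to a single MMIO access each, e.g.: */
static u16 get_vq_state_sketch(struct ifcvf_lm_cfg __iomem *lm_cfg, u16 qid)
{
	return vp_ioread16(vq_state_addr(lm_cfg, qid));
}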
Re: [PATCH V2 1/3] vDPA/ifcvf: dynamic allocate vq data stores
On Mon, Jun 26, 2023 at 10:38 AM Zhu, Lingshan wrote: > > > > On 6/26/2023 10:32 AM, Jason Wang wrote: > > On Mon, Jun 12, 2023 at 3:14 PM Zhu Lingshan wrote: > >> This commit dynamically allocates the data > >> stores for the virtqueues based on > >> virtio_pci_common_cfg.num_queues. > > While at it, it's better to allocate vring_lm_cfg as well and drop > > IFCVF_MAX_QUEUES. > Yes, this has been done in 3/3 patch in this series. Ok, yes, but it seems patch 3 implements a lot of logic so I suggest moving it to patch 1. Not sure it's too late since I see the patch has been merged by Michael. Thanks > > Thanks > Zhu Lingshan > > > > Thanks > > > >> Signed-off-by: Zhu Lingshan > >> --- > >> drivers/vdpa/ifcvf/ifcvf_base.c | 3 +++ > >> drivers/vdpa/ifcvf/ifcvf_base.h | 2 +- > >> drivers/vdpa/ifcvf/ifcvf_main.c | 2 ++ > >> 3 files changed, 6 insertions(+), 1 deletion(-) > >> > >> diff --git a/drivers/vdpa/ifcvf/ifcvf_base.c > >> b/drivers/vdpa/ifcvf/ifcvf_base.c > >> index 1b5da11f5403..f86495ace825 100644 > >> --- a/drivers/vdpa/ifcvf/ifcvf_base.c > >> +++ b/drivers/vdpa/ifcvf/ifcvf_base.c > >> @@ -134,6 +134,9 @@ int ifcvf_init_hw(struct ifcvf_hw *hw, struct pci_dev > >> *pdev) > >> } > >> > >> hw->nr_vring = vp_ioread16(&hw->common_cfg->num_queues); > >> + hw->vring = kzalloc(sizeof(struct vring_info) * hw->nr_vring, > >> GFP_KERNEL); > >> + if (!hw->vring) > >> + return -ENOMEM; > >> > >> for (i = 0; i < hw->nr_vring; i++) { > >> vp_iowrite16(i, &hw->common_cfg->queue_select); > >> diff --git a/drivers/vdpa/ifcvf/ifcvf_base.h > >> b/drivers/vdpa/ifcvf/ifcvf_base.h > >> index 3110ffc50caf..fa797184056b 100644 > >> --- a/drivers/vdpa/ifcvf/ifcvf_base.h > >> +++ b/drivers/vdpa/ifcvf/ifcvf_base.h > >> @@ -74,7 +74,7 @@ struct ifcvf_hw { > >> u64 dev_features; > >> struct virtio_pci_common_cfg __iomem *common_cfg; > >> void __iomem *dev_cfg; > >> - struct vring_info vring[IFCVF_MAX_QUEUES]; > >> + struct vring_info *vring; > >> void __iomem * const *base; > >> char config_msix_name[256]; > >> struct vdpa_callback config_cb; > >> diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c > >> b/drivers/vdpa/ifcvf/ifcvf_main.c > >> index 6e47ac2c669a..2af0de771b49 100644 > >> --- a/drivers/vdpa/ifcvf/ifcvf_main.c > >> +++ b/drivers/vdpa/ifcvf/ifcvf_main.c > >> @@ -830,6 +830,7 @@ static int ifcvf_probe(struct pci_dev *pdev, const > >> struct pci_device_id *id) > >> return 0; > >> > >> err: > >> + kfree(ifcvf_mgmt_dev->vf.vring); > >> kfree(ifcvf_mgmt_dev); > >> return ret; > >> } > >> @@ -840,6 +841,7 @@ static void ifcvf_remove(struct pci_dev *pdev) > >> > >> ifcvf_mgmt_dev = pci_get_drvdata(pdev); > >> vdpa_mgmtdev_unregister(&ifcvf_mgmt_dev->mdev); > >> + kfree(ifcvf_mgmt_dev->vf.vring); > >> kfree(ifcvf_mgmt_dev); > >> } > >> > >> -- > >> 2.39.1 > >> > ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH v2] vdpa/vp_vdpa: Check queue number of vdpa device from add_config
On Mon, Jun 26, 2023 at 10:42 AM Angus Chen wrote: > > > Hi,jason. > > -Original Message- > > From: Jason Wang > > Sent: Monday, June 26, 2023 10:30 AM > > To: Angus Chen > > Cc: m...@redhat.com; virtualization@lists.linux-foundation.org; > > linux-ker...@vger.kernel.org > > Subject: Re: [PATCH v2] vdpa/vp_vdpa: Check queue number of vdpa device from > > add_config > > > > On Thu, Jun 8, 2023 at 5:02 PM Angus Chen > > wrote: > > > > > > When add virtio_pci vdpa device,check the vqs number of device cap > > > and max_vq_pairs from add_config. > > > Simply starting from failing if the provisioned #qp is not > > > equal to the one that hardware has. > > > > > > Signed-off-by: Angus Chen > > > --- > > > v1: Use max_vqs from add_config > > > v2: Just return fail if max_vqs from add_config is not same as device > > > cap. Suggested by jason. > > > > > > drivers/vdpa/virtio_pci/vp_vdpa.c | 35 ++- > > > 1 file changed, 21 insertions(+), 14 deletions(-) > > > > > > diff --git a/drivers/vdpa/virtio_pci/vp_vdpa.c > > b/drivers/vdpa/virtio_pci/vp_vdpa.c > > > index 281287fae89f..c1fb6963da12 100644 > > > --- a/drivers/vdpa/virtio_pci/vp_vdpa.c > > > +++ b/drivers/vdpa/virtio_pci/vp_vdpa.c > > > @@ -480,32 +480,39 @@ static int vp_vdpa_dev_add(struct > > vdpa_mgmt_dev *v_mdev, const char *name, > > > u64 device_features; > > > int ret, i; > > > > > > - vp_vdpa = vdpa_alloc_device(struct vp_vdpa, vdpa, > > > - dev, &vp_vdpa_ops, 1, 1, name, > > false); > > > - > > > - if (IS_ERR(vp_vdpa)) { > > > - dev_err(dev, "vp_vdpa: Failed to allocate vDPA > > structure\n"); > > > - return PTR_ERR(vp_vdpa); > > > + if (add_config->mask & > > BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MAX_VQP)) { > > > + if (add_config->net.max_vq_pairs != > > (v_mdev->max_supported_vqs / 2)) { > > > + dev_err(&pdev->dev, "max vqs 0x%x should be > > equal to 0x%x which device has\n", > > > + add_config->net.max_vq_pairs*2, > > v_mdev->max_supported_vqs); > > > + return -EINVAL; > > > + } > > > } > > > > > > - vp_vdpa_mgtdev->vp_vdpa = vp_vdpa; > > > - > > > - vp_vdpa->vdpa.dma_dev = &pdev->dev; > > > - vp_vdpa->queues = vp_modern_get_num_queues(mdev); > > > - vp_vdpa->mdev = mdev; > > > - > > > device_features = vp_modern_get_features(mdev); > > > if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES)) { > > > if (add_config->device_features & ~device_features) { > > > - ret = -EINVAL; > > > dev_err(&pdev->dev, "Try to provision features > > " > > > "that are not supported by the device: > > " > > > "device_features 0x%llx provisioned > > 0x%llx\n", > > > device_features, > > add_config->device_features); > > > - goto err; > > > + return -EINVAL; > > > } > > > device_features = add_config->device_features; > > > } > > > + > > > + vp_vdpa = vdpa_alloc_device(struct vp_vdpa, vdpa, > > > + dev, &vp_vdpa_ops, 1, 1, name, > > false); > > > + > > > + if (IS_ERR(vp_vdpa)) { > > > + dev_err(dev, "vp_vdpa: Failed to allocate vDPA > > structure\n"); > > > + return PTR_ERR(vp_vdpa); > > > + } > > > + > > > + vp_vdpa_mgtdev->vp_vdpa = vp_vdpa; > > > + > > > + vp_vdpa->vdpa.dma_dev = &pdev->dev; > > > + vp_vdpa->queues = v_mdev->max_supported_vqs; > > > > Why bother with those changes? > > > > mgtdev->max_supported_vqs = vp_modern_get_num_queues(mdev); > max_supported_vqs will not be changed, so we can get max_supported_vqs from > mgtdev->max_supported_vqs. > If we use vp_modern_get_num_queues(mdev),it will use tlp to communicate with > device. > It just reduce some tlp . 
Ok, but 1) I don't think we care about the performance here, and 2) if we did, let's use a separate patch to do that as an optimization. Thanks > > > > Thanks > > > > > > > + vp_vdpa->mdev = mdev; > > > vp_vdpa->device_features = device_features; > > > > > > ret = devm_add_action_or_reset(dev, vp_vdpa_free_irq_vectors, > > pdev); > > > -- > > > 2.25.1
Re: [PATCH v2] vdpa/vp_vdpa: Check queue number of vdpa device from add_config
On Mon, Jun 26, 2023 at 11:02 AM Angus Chen wrote: > > > > > -Original Message- > > From: Jason Wang > > Sent: Monday, June 26, 2023 10:51 AM > > To: Angus Chen > > Cc: m...@redhat.com; virtualization@lists.linux-foundation.org; > > linux-ker...@vger.kernel.org > > Subject: Re: [PATCH v2] vdpa/vp_vdpa: Check queue number of vdpa device from > > add_config > > > > On Mon, Jun 26, 2023 at 10:42 AM Angus Chen > > wrote: > > > > > > > > > Hi,jason. > > > > -Original Message- > > > > From: Jason Wang > > > > Sent: Monday, June 26, 2023 10:30 AM > > > > To: Angus Chen > > > > Cc: m...@redhat.com; virtualization@lists.linux-foundation.org; > > > > linux-ker...@vger.kernel.org > > > > Subject: Re: [PATCH v2] vdpa/vp_vdpa: Check queue number of vdpa device > > from > > > > add_config > > > > > > > > On Thu, Jun 8, 2023 at 5:02 PM Angus Chen > > > > > > wrote: > > > > > > > > > > When add virtio_pci vdpa device,check the vqs number of device cap > > > > > and max_vq_pairs from add_config. > > > > > Simply starting from failing if the provisioned #qp is not > > > > > equal to the one that hardware has. > > > > > > > > > > Signed-off-by: Angus Chen > > > > > --- > > > > > v1: Use max_vqs from add_config > > > > > v2: Just return fail if max_vqs from add_config is not same as device > > > > > cap. Suggested by jason. > > > > > > > > > > drivers/vdpa/virtio_pci/vp_vdpa.c | 35 > > > > > ++- > > > > > 1 file changed, 21 insertions(+), 14 deletions(-) > > > > > > > > > > diff --git a/drivers/vdpa/virtio_pci/vp_vdpa.c > > > > b/drivers/vdpa/virtio_pci/vp_vdpa.c > > > > > index 281287fae89f..c1fb6963da12 100644 > > > > > --- a/drivers/vdpa/virtio_pci/vp_vdpa.c > > > > > +++ b/drivers/vdpa/virtio_pci/vp_vdpa.c > > > > > @@ -480,32 +480,39 @@ static int vp_vdpa_dev_add(struct > > > > vdpa_mgmt_dev *v_mdev, const char *name, > > > > > u64 device_features; > > > > > int ret, i; > > > > > > > > > > - vp_vdpa = vdpa_alloc_device(struct vp_vdpa, vdpa, > > > > > - dev, &vp_vdpa_ops, 1, 1, > > name, > > > > false); > > > > > - > > > > > - if (IS_ERR(vp_vdpa)) { > > > > > - dev_err(dev, "vp_vdpa: Failed to allocate vDPA > > > > structure\n"); > > > > > - return PTR_ERR(vp_vdpa); > > > > > + if (add_config->mask & > > > > BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MAX_VQP)) { > > > > > + if (add_config->net.max_vq_pairs != > > > > (v_mdev->max_supported_vqs / 2)) { > > > > > + dev_err(&pdev->dev, "max vqs 0x%x should > > be > > > > equal to 0x%x which device has\n", > > > > > + add_config->net.max_vq_pairs*2, > > > > v_mdev->max_supported_vqs); > > > > > + return -EINVAL; > > > > > + } > > > > > } > > > > > > > > > > - vp_vdpa_mgtdev->vp_vdpa = vp_vdpa; > > > > > - > > > > > - vp_vdpa->vdpa.dma_dev = &pdev->dev; > > > > > - vp_vdpa->queues = vp_modern_get_num_queues(mdev); > > > > > - vp_vdpa->mdev = mdev; > > > > > - > > > > > device_features = vp_modern_get_features(mdev); > > > > > if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES)) > > { > > > > > if (add_config->device_features & ~device_features) { > > > > > - ret = -EINVAL; > > > > > dev_err(&pdev->dev, "Try to provision > > features > > > > " > > > > > "that are not supported by the > > device: > > > > " > > > > > "device_features 0x%llx > > provisioned > > > > 0x%llx\n", > > >
Re: [PATCH vhost v10 01/10] virtio_ring: put mapping error check in vring_map_one_sg
On Fri, Jun 2, 2023 at 5:22 PM Xuan Zhuo wrote: > > This patch put the dma addr error check in vring_map_one_sg(). > > The benefits of doing this: > > 1. reduce one judgment of vq->use_dma_api. > 2. make vring_map_one_sg more simple, without calling >vring_mapping_error to check the return value. simplifies subsequent >code > > Signed-off-by: Xuan Zhuo Acked-by: Jason Wang Thanks > --- > drivers/virtio/virtio_ring.c | 37 +--- > 1 file changed, 22 insertions(+), 15 deletions(-) > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c > index c5310eaf8b46..72ed07a604d4 100644 > --- a/drivers/virtio/virtio_ring.c > +++ b/drivers/virtio/virtio_ring.c > @@ -355,9 +355,8 @@ static struct device *vring_dma_dev(const struct > vring_virtqueue *vq) > } > > /* Map one sg entry. */ > -static dma_addr_t vring_map_one_sg(const struct vring_virtqueue *vq, > - struct scatterlist *sg, > - enum dma_data_direction direction) > +static int vring_map_one_sg(const struct vring_virtqueue *vq, struct > scatterlist *sg, > + enum dma_data_direction direction, dma_addr_t > *addr) > { > if (!vq->use_dma_api) { > /* > @@ -366,7 +365,8 @@ static dma_addr_t vring_map_one_sg(const struct > vring_virtqueue *vq, > * depending on the direction. > */ > kmsan_handle_dma(sg_page(sg), sg->offset, sg->length, > direction); > - return (dma_addr_t)sg_phys(sg); > + *addr = (dma_addr_t)sg_phys(sg); > + return 0; > } > > /* > @@ -374,9 +374,14 @@ static dma_addr_t vring_map_one_sg(const struct > vring_virtqueue *vq, > * the way it expects (we don't guarantee that the scatterlist > * will exist for the lifetime of the mapping). > */ > - return dma_map_page(vring_dma_dev(vq), > + *addr = dma_map_page(vring_dma_dev(vq), > sg_page(sg), sg->offset, sg->length, > direction); > + > + if (dma_mapping_error(vring_dma_dev(vq), *addr)) > + return -ENOMEM; > + > + return 0; > } > > static dma_addr_t vring_map_single(const struct vring_virtqueue *vq, > @@ -588,8 +593,9 @@ static inline int virtqueue_add_split(struct virtqueue > *_vq, > > for (n = 0; n < out_sgs; n++) { > for (sg = sgs[n]; sg; sg = sg_next(sg)) { > - dma_addr_t addr = vring_map_one_sg(vq, sg, > DMA_TO_DEVICE); > - if (vring_mapping_error(vq, addr)) > + dma_addr_t addr; > + > + if (vring_map_one_sg(vq, sg, DMA_TO_DEVICE, &addr)) > goto unmap_release; > > prev = i; > @@ -603,8 +609,9 @@ static inline int virtqueue_add_split(struct virtqueue > *_vq, > } > for (; n < (out_sgs + in_sgs); n++) { > for (sg = sgs[n]; sg; sg = sg_next(sg)) { > - dma_addr_t addr = vring_map_one_sg(vq, sg, > DMA_FROM_DEVICE); > - if (vring_mapping_error(vq, addr)) > + dma_addr_t addr; > + > + if (vring_map_one_sg(vq, sg, DMA_FROM_DEVICE, &addr)) > goto unmap_release; > > prev = i; > @@ -1279,9 +1286,8 @@ static int virtqueue_add_indirect_packed(struct > vring_virtqueue *vq, > > for (n = 0; n < out_sgs + in_sgs; n++) { > for (sg = sgs[n]; sg; sg = sg_next(sg)) { > - addr = vring_map_one_sg(vq, sg, n < out_sgs ? > - DMA_TO_DEVICE : DMA_FROM_DEVICE); > - if (vring_mapping_error(vq, addr)) > + if (vring_map_one_sg(vq, sg, n < out_sgs ? > +DMA_TO_DEVICE : DMA_FROM_DEVICE, > &addr)) > goto unmap_release; > > desc[i].flags = cpu_to_le16(n < out_sgs ? > @@ -1426,9 +1432,10 @@ static inline int virtqueue_add_packed(struct > virtqueue *_vq, > c = 0; > for (n = 0; n < out_sgs + in_sgs; n++) { > for (sg = sgs[n]; sg; sg = sg_next(sg)) { > - dma_addr_t addr = vring_map_one_sg(vq, sg, n < > out_sgs ? > - DMA_TO_DEVICE : DMA_FROM_DEVICE); > -
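The resulting call-site shape, condensed from the hunks above:

/* With the mapping error folded into the return value, every caller
 * collapses to the same pattern and vring_mapping_error() drops out:
 */
dma_addr_t addr;

if (vring_map_one_sg(vq, sg, DMA_TO_DEVICE, &addr))
	goto unmap_release;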
Re: [PATCH vhost v10 02/10] virtio_ring: introduce virtqueue_set_premapped()
On Fri, Jun 2, 2023 at 5:22 PM Xuan Zhuo wrote: > > This helper allows the driver change the dma mode to premapped mode. > Under the premapped mode, the virtio core do not do dma mapping > internally. > > This just work when the use_dma_api is true. If the use_dma_api is false, > the dma options is not through the DMA APIs, that is not the standard > way of the linux kernel. > > Signed-off-by: Xuan Zhuo > --- > drivers/virtio/virtio_ring.c | 40 > include/linux/virtio.h | 2 ++ > 2 files changed, 42 insertions(+) > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c > index 72ed07a604d4..2afdfb9e3e30 100644 > --- a/drivers/virtio/virtio_ring.c > +++ b/drivers/virtio/virtio_ring.c > @@ -172,6 +172,9 @@ struct vring_virtqueue { > /* Host publishes avail event idx */ > bool event; > > + /* Do DMA mapping by driver */ > + bool premapped; > + > /* Head of free buffer list. */ > unsigned int free_head; > /* Number we've added since last sync. */ > @@ -2059,6 +2062,7 @@ static struct virtqueue *vring_create_virtqueue_packed( > vq->packed_ring = true; > vq->dma_dev = dma_dev; > vq->use_dma_api = vring_use_dma_api(vdev); > + vq->premapped = false; > > vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) > && > !context; > @@ -2548,6 +2552,7 @@ static struct virtqueue *__vring_new_virtqueue(unsigned > int index, > #endif > vq->dma_dev = dma_dev; > vq->use_dma_api = vring_use_dma_api(vdev); > + vq->premapped = false; > > vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) > && > !context; > @@ -2691,6 +2696,41 @@ int virtqueue_resize(struct virtqueue *_vq, u32 num, > } > EXPORT_SYMBOL_GPL(virtqueue_resize); > > +/** > + * virtqueue_set_premapped - set the vring premapped mode > + * @_vq: the struct virtqueue we're talking about. > + * > + * Enable the premapped mode of the vq. > + * > + * The vring in premapped mode does not do dma internally, so the driver must > + * do dma mapping in advance. The driver must pass the dma_address through > + * dma_address of scatterlist. When the driver got a used buffer from > + * the vring, it has to unmap the dma address. So the driver must call > + * virtqueue_get_buf_premapped()/virtqueue_detach_unused_buf_premapped(). > + * > + * This must be called before adding any buf to vring. And any old buffer should be detached? > + * So this should be called immediately after init vq or vq reset. Any way to detect and warn in this case? (not a must if it's too expensive to do the check) > + * > + * Caller must ensure we don't call this with other virtqueue operations > + * at the same time (except where noted). > + * > + * Returns zero or a negative error. > + * 0: success. > + * -EINVAL: vring does not use the dma api, so we can not enable premapped > mode. > + */ > +int virtqueue_set_premapped(struct virtqueue *_vq) > +{ > + struct vring_virtqueue *vq = to_vvq(_vq); > + > + if (!vq->use_dma_api) > + return -EINVAL; > + > + vq->premapped = true; I guess there should be a way to disable it. Would it be useful for the case when AF_XDP sockets were destroyed? 
Thanks > + > + return 0; > +} > +EXPORT_SYMBOL_GPL(virtqueue_set_premapped); > + > /* Only available for split ring */ > struct virtqueue *vring_new_virtqueue(unsigned int index, > unsigned int num, > diff --git a/include/linux/virtio.h b/include/linux/virtio.h > index b93238db94e3..1fc0e1023bd4 100644 > --- a/include/linux/virtio.h > +++ b/include/linux/virtio.h > @@ -78,6 +78,8 @@ bool virtqueue_enable_cb(struct virtqueue *vq); > > unsigned virtqueue_enable_cb_prepare(struct virtqueue *vq); > > +int virtqueue_set_premapped(struct virtqueue *_vq); > + > bool virtqueue_poll(struct virtqueue *vq, unsigned); > > bool virtqueue_enable_cb_delayed(struct virtqueue *vq); > -- > 2.32.0.3.g01195cf9f > ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
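For illustration, a hypothetical driver-side use of the proposed API: enable premapped mode right after the vq is created (per the comment above, before any buffer is added), then pass pre-mapped addresses through sg->dma_address. Everything outside the virtqueue_* and DMA-API calls is invented for the example.

/* Minimal sketch, assuming "vq" was just created and nothing has been
 * queued yet; rq_init_premapped() is a made-up name and error
 * unwinding is condensed.
 */
static int rq_init_premapped(struct device *dev, struct virtqueue *vq,
			     void *buf, size_t len)
{
	struct scatterlist sg;
	int err;

	err = virtqueue_set_premapped(vq);
	if (err)
		return err;	/* vq does not use the DMA API */

	/* The driver does the mapping itself and hands over the address. */
	sg_init_one(&sg, buf, len);
	sg.dma_address = dma_map_single(dev, buf, len, DMA_FROM_DEVICE);
	if (dma_mapping_error(dev, sg.dma_address))
		return -ENOMEM;

	return virtqueue_add_inbuf(vq, &sg, 1, buf, GFP_ATOMIC);
}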
Re: [PATCH vhost v10 03/10] virtio_ring: split: support add premapped buf
On Fri, Jun 2, 2023 at 5:22 PM Xuan Zhuo wrote: > > If the vq is the premapped mode, use the sg_dma_address() directly. > > Signed-off-by: Xuan Zhuo > --- > drivers/virtio/virtio_ring.c | 46 ++-- > 1 file changed, 28 insertions(+), 18 deletions(-) > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c > index 2afdfb9e3e30..18212c3e056b 100644 > --- a/drivers/virtio/virtio_ring.c > +++ b/drivers/virtio/virtio_ring.c > @@ -598,8 +598,12 @@ static inline int virtqueue_add_split(struct virtqueue > *_vq, > for (sg = sgs[n]; sg; sg = sg_next(sg)) { > dma_addr_t addr; > > - if (vring_map_one_sg(vq, sg, DMA_TO_DEVICE, &addr)) > - goto unmap_release; > + if (vq->premapped) { > + addr = sg_dma_address(sg); > + } else { > + if (vring_map_one_sg(vq, sg, DMA_TO_DEVICE, > &addr)) > + goto unmap_release; > + } Btw, I wonder whether or not it would be simple to implement the vq->premapped check inside vring_map_one_sg() assuming the !use_dma_api is done there as well. > > prev = i; > /* Note that we trust indirect descriptor > @@ -614,8 +618,12 @@ static inline int virtqueue_add_split(struct virtqueue > *_vq, > for (sg = sgs[n]; sg; sg = sg_next(sg)) { > dma_addr_t addr; > > - if (vring_map_one_sg(vq, sg, DMA_FROM_DEVICE, &addr)) > - goto unmap_release; > + if (vq->premapped) { > + addr = sg_dma_address(sg); > + } else { > + if (vring_map_one_sg(vq, sg, DMA_FROM_DEVICE, > &addr)) > + goto unmap_release; > + } > > prev = i; > /* Note that we trust indirect descriptor > @@ -689,21 +697,23 @@ static inline int virtqueue_add_split(struct virtqueue > *_vq, > return 0; > > unmap_release: > - err_idx = i; > + if (!vq->premapped) { Can vq->premapped be true here? The label is named as "unmap_relase" which implies "map" beforehand which seems not the case for premapping. Thanks > + err_idx = i; > > - if (indirect) > - i = 0; > - else > - i = head; > - > - for (n = 0; n < total_sg; n++) { > - if (i == err_idx) > - break; > - if (indirect) { > - vring_unmap_one_split_indirect(vq, &desc[i]); > - i = virtio16_to_cpu(_vq->vdev, desc[i].next); > - } else > - i = vring_unmap_one_split(vq, i); > + if (indirect) > + i = 0; > + else > + i = head; > + > + for (n = 0; n < total_sg; n++) { > + if (i == err_idx) > + break; > + if (indirect) { > + vring_unmap_one_split_indirect(vq, &desc[i]); > + i = virtio16_to_cpu(_vq->vdev, desc[i].next); > + } else > + i = vring_unmap_one_split(vq, i); > + } > } > > if (indirect) > -- > 2.32.0.3.g01195cf9f > ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
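A sketch of the fold-in being asked about: vring_map_one_sg() itself handles the premapped case, next to the existing !use_dma_api branch, so call sites need no premapped check at all. It builds on the function as refactored in patch 01; illustrative, not part of the series.

static int vring_map_one_sg(const struct vring_virtqueue *vq,
			    struct scatterlist *sg,
			    enum dma_data_direction direction,
			    dma_addr_t *addr)
{
	if (vq->premapped) {
		/* The driver already mapped this buffer; trust its address. */
		*addr = sg_dma_address(sg);
		return 0;
	}

	if (!vq->use_dma_api) {
		kmsan_handle_dma(sg_page(sg), sg->offset, sg->length, direction);
		*addr = (dma_addr_t)sg_phys(sg);
		return 0;
	}

	*addr = dma_map_page(vring_dma_dev(vq), sg_page(sg),
			     sg->offset, sg->length, direction);

	return dma_mapping_error(vring_dma_dev(vq), *addr) ? -ENOMEM : 0;
}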
Re: [PATCH vhost v10 04/10] virtio_ring: packed: support add premapped buf
On Fri, Jun 2, 2023 at 5:22 PM Xuan Zhuo wrote: > > If the vq is the premapped mode, use the sg_dma_address() directly. > > Signed-off-by: Xuan Zhuo > --- > drivers/virtio/virtio_ring.c | 36 ++-- > 1 file changed, 26 insertions(+), 10 deletions(-) > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c > index 18212c3e056b..dc109fbc05a5 100644 > --- a/drivers/virtio/virtio_ring.c > +++ b/drivers/virtio/virtio_ring.c > @@ -1299,9 +1299,13 @@ static int virtqueue_add_indirect_packed(struct > vring_virtqueue *vq, > > for (n = 0; n < out_sgs + in_sgs; n++) { > for (sg = sgs[n]; sg; sg = sg_next(sg)) { > - if (vring_map_one_sg(vq, sg, n < out_sgs ? > -DMA_TO_DEVICE : DMA_FROM_DEVICE, > &addr)) > - goto unmap_release; > + if (vq->premapped) { > + addr = sg_dma_address(sg); > + } else { > + if (vring_map_one_sg(vq, sg, n < out_sgs ? > +DMA_TO_DEVICE : > DMA_FROM_DEVICE, &addr)) > + goto unmap_release; > + } > > desc[i].flags = cpu_to_le16(n < out_sgs ? > 0 : VRING_DESC_F_WRITE); > @@ -1369,10 +1373,12 @@ static int virtqueue_add_indirect_packed(struct > vring_virtqueue *vq, > return 0; > > unmap_release: > - err_idx = i; > + if (!vq->premapped) { > + err_idx = i; > > - for (i = 0; i < err_idx; i++) > - vring_unmap_desc_packed(vq, &desc[i]); > + for (i = 0; i < err_idx; i++) > + vring_unmap_desc_packed(vq, &desc[i]); > + } > > kfree(desc); > > @@ -1447,9 +1453,13 @@ static inline int virtqueue_add_packed(struct > virtqueue *_vq, > for (sg = sgs[n]; sg; sg = sg_next(sg)) { > dma_addr_t addr; > > - if (vring_map_one_sg(vq, sg, n < out_sgs ? > -DMA_TO_DEVICE : DMA_FROM_DEVICE, > &addr)) > - goto unmap_release; > + if (vq->premapped) { > + addr = sg_dma_address(sg); > + } else { > + if (vring_map_one_sg(vq, sg, n < out_sgs ? > +DMA_TO_DEVICE : > DMA_FROM_DEVICE, &addr)) > + goto unmap_release; > + } > > flags = cpu_to_le16(vq->packed.avail_used_flags | > (++c == total_sg ? 0 : VRING_DESC_F_NEXT) > | > @@ -1512,11 +1522,17 @@ static inline int virtqueue_add_packed(struct > virtqueue *_vq, > return 0; > > unmap_release: > + vq->packed.avail_used_flags = avail_used_flags; > + > + if (vq->premapped) { Similar to the split path, I think we can't hit vq->premapped here. Thanks > + END_USE(vq); > + return -EIO; > + } > + > err_idx = i; > i = head; > curr = vq->free_head; > > - vq->packed.avail_used_flags = avail_used_flags; > > for (n = 0; n < total_sg; n++) { > if (i == err_idx) > -- > 2.32.0.3.g01195cf9f > ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
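In miniature, why the premapped branch under unmap_release is effectively dead code, matching the review comment above:

/* Premapped mode assigns the address directly and cannot fail, so
 * unmap_release is only reachable through a failed mapping -- which
 * premapped rings never perform. (Assumes the add path shown above.)
 */
if (vq->premapped)
	addr = sg_dma_address(sg);	/* no failure path */
else if (vring_map_one_sg(vq, sg, n < out_sgs ?
			  DMA_TO_DEVICE : DMA_FROM_DEVICE, &addr))
	goto unmap_release;		/* mapping failures only */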
Re: [PATCH vhost v10 05/10] virtio_ring: split-detach: support return dma info to driver
On Fri, Jun 2, 2023 at 5:22 PM Xuan Zhuo wrote: > > Under the premapped mode, the driver needs to unmap the DMA address > after receiving the buffer. The virtio core records the DMA address, > so the driver needs a way to get the dma info from the virtio core. A second thought, can we simply offload the tracking to the driver itself? This looks the way many other modern NIC drivers did. In pre mapped mode, the DMA address is in fact told by the driver itself so it should have sufficient knowledge. And in some cases, the driver wants to optimize/merge/delay the unampping so the DMA addresses returned by the virtio core are not even interested in those cases. Thanks > > A straightforward approach is to pass an array to the virtio core when > calling virtqueue_get_buf(). However, it is not feasible when there are > multiple DMA addresses in the descriptor chain, and the array size is > unknown. > > To solve this problem, a helper be introduced. After calling > virtqueue_get_buf(), the driver can call the helper to > retrieve a dma info. If the helper function returns -EAGAIN, it means > that there are more DMA addresses to be processed, and the driver should > call the helper function again. To keep track of the current position in > the chain, a cursor must be passed to the helper function, which is > initialized by virtqueue_get_buf(). > > Some processes are done inside this helper, so this helper MUST be > called under the premapped mode. > > Signed-off-by: Xuan Zhuo > --- > drivers/virtio/virtio_ring.c | 118 --- > include/linux/virtio.h | 11 > 2 files changed, 119 insertions(+), 10 deletions(-) > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c > index dc109fbc05a5..cdc4349f6066 100644 > --- a/drivers/virtio/virtio_ring.c > +++ b/drivers/virtio/virtio_ring.c > @@ -754,8 +754,95 @@ static bool virtqueue_kick_prepare_split(struct > virtqueue *_vq) > return needs_kick; > } > > -static void detach_buf_split(struct vring_virtqueue *vq, unsigned int head, > -void **ctx) > +static void detach_cursor_init_split(struct vring_virtqueue *vq, > +struct virtqueue_detach_cursor *cursor, > u16 head) > +{ > + struct vring_desc_extra *extra; > + > + extra = &vq->split.desc_extra[head]; > + > + /* Clear data ptr. 
*/ > + vq->split.desc_state[head].data = NULL; > + > + cursor->head = head; > + cursor->done = 0; > + > + if (extra->flags & VRING_DESC_F_INDIRECT) { > + cursor->num = extra->len / sizeof(struct vring_desc); > + cursor->indirect = true; > + cursor->pos = 0; > + > + vring_unmap_one_split(vq, head); > + > + extra->next = vq->free_head; > + > + vq->free_head = head; > + > + /* Plus final descriptor */ > + vq->vq.num_free++; > + > + } else { > + cursor->indirect = false; > + cursor->pos = head; > + } > +} > + > +static int virtqueue_detach_split(struct virtqueue *_vq, struct > virtqueue_detach_cursor *cursor, > + dma_addr_t *addr, u32 *len, enum > dma_data_direction *dir) > +{ > + struct vring_virtqueue *vq = to_vvq(_vq); > + __virtio16 nextflag = cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_NEXT); > + int rc = -EAGAIN; > + > + if (unlikely(cursor->done)) > + return -EINVAL; > + > + if (!cursor->indirect) { > + struct vring_desc_extra *extra; > + unsigned int i; > + > + i = cursor->pos; > + > + extra = &vq->split.desc_extra[i]; > + > + if (vq->split.vring.desc[i].flags & nextflag) { > + cursor->pos = extra->next; > + } else { > + extra->next = vq->free_head; > + vq->free_head = cursor->head; > + cursor->done = true; > + rc = 0; > + } > + > + *addr = extra->addr; > + *len = extra->len; > + *dir = (extra->flags & VRING_DESC_F_WRITE) ? DMA_FROM_DEVICE > : DMA_TO_DEVICE; > + > + vq->vq.num_free++; > + > + } else { > + struct vring_desc *indir_desc, *desc; > + u16 flags; > + > + indir_desc = vq->split.desc_state[cursor->head].indir_desc; > + desc = &indir_desc[cursor->pos]; > + > + flags = virtio16_to_cpu(vq->vq.vdev, desc->flags); > + *addr = virtio64_to_cpu(vq->vq.vdev, desc->addr); > + *len = virtio32_to_cpu(vq->vq.vdev, desc->len); > + *dir = (flags & VRING_DESC_F_WRITE) ? DMA_FROM_DEVICE : > DMA_TO_DEVICE; > + > + if (++cursor->pos == cursor->num) { > + kfree(indir_desc); > +
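To make the alternative raised at the top of this reply concrete -- the driver keeping its own DMA records instead of retrieving them from the core -- a hypothetical sketch; the my_* structure and helpers are invented for illustration:

/* Driver-side tracking: the buffer metadata itself is the token given
 * to the vring, so on completion the driver already knows the DMA
 * address and can unmap (or defer/merge unmaps) on its own.
 */
struct my_rx_buf {
	void		*base;
	dma_addr_t	dma;
	u32		len;
};

static int my_refill(struct device *dev, struct virtqueue *vq,
		     struct my_rx_buf *b)
{
	struct scatterlist sg;

	b->dma = dma_map_single(dev, b->base, b->len, DMA_FROM_DEVICE);
	if (dma_mapping_error(dev, b->dma))
		return -ENOMEM;

	sg_init_one(&sg, b->base, b->len);
	sg.dma_address = b->dma;	/* premapped: the core copies it */

	return virtqueue_add_inbuf(vq, &sg, 1, b, GFP_ATOMIC);
}

static void my_complete(struct device *dev, struct virtqueue *vq)
{
	unsigned int len;
	struct my_rx_buf *b = virtqueue_get_buf(vq, &len);

	if (b)
		dma_unmap_single(dev, b->dma, b->len, DMA_FROM_DEVICE);
}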