Re: [PATCH net v3] virtio_net: Fix error unwinding of XDP initialization

2023-05-09 Thread Jason Wang


On 2023/5/9 09:43, Xuan Zhuo wrote:

On Mon, 8 May 2023 11:00:10 -0400, Feng Liu  wrote:


On 2023-05-07 9:45 p.m., Xuan Zhuo wrote:


On Sat, 6 May 2023 08:08:02 -0400, Feng Liu  wrote:


On 2023-05-05 10:33 p.m., Xuan Zhuo wrote:


On Tue, 2 May 2023 20:35:25 -0400, Feng Liu  wrote:

When initializing XDP in virtnet_open(), some rq xdp initialization
may hit an error causing net device open failed. However, previous
rqs have already initialized XDP and enabled NAPI, which is not the
expected behavior. Need to roll back the previous rq initialization
to avoid leaks in error unwinding of init code.

Also extract a helper function to disable a queue pair, and use the
newly introduced helper in error unwinding and in virtnet_close.

Issue: 3383038
Fixes: 754b8a21a96d ("virtio_net: setup xdp_rxq_info")
Signed-off-by: Feng Liu 
Reviewed-by: William Tu 
Reviewed-by: Parav Pandit 
Reviewed-by: Simon Horman 
Acked-by: Michael S. Tsirkin 
Change-Id: Ib4c6a97cb7b837cfa484c593dd43a435c47ea68f
---
drivers/net/virtio_net.c | 30 ++++++++++++++++++++----------
1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 8d8038538fc4..3737cf120cb7 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -1868,6 +1868,13 @@ static int virtnet_poll(struct napi_struct *napi, int 
budget)
 return received;
}

+static void virtnet_disable_qp(struct virtnet_info *vi, int qp_index)
+{
+ virtnet_napi_tx_disable(&vi->sq[qp_index].napi);
+ napi_disable(&vi->rq[qp_index].napi);
+ xdp_rxq_info_unreg(&vi->rq[qp_index].xdp_rxq);
+}
+
static int virtnet_open(struct net_device *dev)
{
 struct virtnet_info *vi = netdev_priv(dev);
@@ -1883,20 +1890,26 @@ static int virtnet_open(struct net_device *dev)

 err = xdp_rxq_info_reg(&vi->rq[i].xdp_rxq, dev, i, 
vi->rq[i].napi.napi_id);
 if (err < 0)
- return err;
+ goto err_xdp_info_reg;

 err = xdp_rxq_info_reg_mem_model(&vi->rq[i].xdp_rxq,
  MEM_TYPE_PAGE_SHARED, NULL);
- if (err < 0) {
- xdp_rxq_info_unreg(&vi->rq[i].xdp_rxq);
- return err;
- }
+ if (err < 0)
+ goto err_xdp_reg_mem_model;

 virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
 virtnet_napi_tx_enable(vi, vi->sq[i].vq, &vi->sq[i].napi);
 }

 return 0;
+
+err_xdp_reg_mem_model:
+ xdp_rxq_info_unreg(&vi->rq[i].xdp_rxq);
+err_xdp_info_reg:
+ for (i = i - 1; i >= 0; i--)
+ virtnet_disable_qp(vi, i);


I would like to know whether we should handle these:

   disable_delayed_refill(vi);
   cancel_delayed_work_sync(&vi->refill);


Maybe we should call virtnet_close() with "i" directly.

Thanks.



We can't use i directly here, because if xdp_rxq_info_reg fails, NAPI
has not yet been enabled for the current queue pair. The rollback has
to start from the queue pairs where NAPI was enabled before (i - 1);
otherwise it will hang in the NAPI disable API.

That is not the point; the key is whether we should handle:

disable_delayed_refill(vi);
cancel_delayed_work_sync(&vi->refill);

Thanks.



OK, I get the point. Thanks for your careful review; I have checked the
code again.

There are two points that I need to explain:

1. All refill delayed-work calls (vi->refill, vi->refill_enabled) assume
that the virtio interface has been opened successfully, e.g. in
virtnet_receive, virtnet_rx_resize, _virtnet_set_queues, etc. If the xdp
registration fails here, these subsequent functions are never triggered,
so there is no need to call disable_delayed_refill() and
cancel_delayed_work_sync().

Maybe something is wrong in that reasoning. I think these lines may
already have scheduled the delayed work:

static int virtnet_open(struct net_device *dev)
{
struct virtnet_info *vi = netdev_priv(dev);
int i, err;

enable_delayed_refill(vi);

for (i = 0; i < vi->max_queue_pairs; i++) {
if (i < vi->curr_queue_pairs)
/* Make sure we have some buffers: if oom use wq. */
-->  if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
-->  schedule_delayed_work(&vi->refill, 0);

err = xdp_rxq_info_reg(&vi->rq[i].xdp_rxq, dev, i, 
vi->rq[i].napi.napi_id);
if (err < 0)
return err;

err = xdp_rxq_info_reg_mem_model(&vi->rq[i].xdp_rxq,
 MEM_TYPE_PAGE_SHARED, NULL);
if (err < 0) {
xdp_rxq_info_unreg(&vi->rq[i].xdp_rxq);
return err;
}

virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
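
This concern is what the later v6 of the patch (quoted further below in
this digest) ends up addressing. As a reference, here is a minimal
sketch of the resulting error path, following the shape of that v6
hunk: stop further refills and flush the delayed work before rolling
back the queue pairs that were already enabled.

err_enable_qp:
	disable_delayed_refill(vi);
	cancel_delayed_work_sync(&vi->refill);

	/* Roll back only the queue pairs that were fully enabled. */
	for (i--; i >= 0; i--)
		virtnet_disable_queue_pair(vi, i);
	return err;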

Re: [PATCH] vdpa/snet: implement the resume vDPA callback

2023-05-11 Thread Jason Wang
On Tue, May 2, 2023 at 9:11 PM Alvaro Karsz  wrote:
>
> The callback sends a resume command to the DPU through
> the control mechanism.
>
> Signed-off-by: Alvaro Karsz 

Acked-by: Jason Wang 

Thanks

> ---
>  drivers/vdpa/solidrun/snet_ctrl.c |  6 ++
>  drivers/vdpa/solidrun/snet_main.c | 15 +++
>  drivers/vdpa/solidrun/snet_vdpa.h |  1 +
>  3 files changed, 22 insertions(+)
>
> diff --git a/drivers/vdpa/solidrun/snet_ctrl.c 
> b/drivers/vdpa/solidrun/snet_ctrl.c
> index 3858738643b..3cef2571d15 100644
> --- a/drivers/vdpa/solidrun/snet_ctrl.c
> +++ b/drivers/vdpa/solidrun/snet_ctrl.c
> @@ -16,6 +16,7 @@ enum snet_ctrl_opcodes {
> SNET_CTRL_OP_DESTROY = 1,
> SNET_CTRL_OP_READ_VQ_STATE,
> SNET_CTRL_OP_SUSPEND,
> +   SNET_CTRL_OP_RESUME,
>  };
>
>  #define SNET_CTRL_TIMEOUT  200
> @@ -328,3 +329,8 @@ int snet_suspend_dev(struct snet *snet)
>  {
> return snet_send_ctrl_msg(snet, SNET_CTRL_OP_SUSPEND, 0);
>  }
> +
> +int snet_resume_dev(struct snet *snet)
> +{
> +   return snet_send_ctrl_msg(snet, SNET_CTRL_OP_RESUME, 0);
> +}
> diff --git a/drivers/vdpa/solidrun/snet_main.c 
> b/drivers/vdpa/solidrun/snet_main.c
> index cdcd84ce4f5..99428a04068 100644
> --- a/drivers/vdpa/solidrun/snet_main.c
> +++ b/drivers/vdpa/solidrun/snet_main.c
> @@ -509,6 +509,20 @@ static int snet_suspend(struct vdpa_device *vdev)
> return ret;
>  }
>
> +static int snet_resume(struct vdpa_device *vdev)
> +{
> +   struct snet *snet = vdpa_to_snet(vdev);
> +   int ret;
> +
> +   ret = snet_resume_dev(snet);
> +   if (ret)
> +   SNET_ERR(snet->pdev, "SNET[%u] resume failed, err: %d\n", 
> snet->sid, ret);
> +   else
> +   SNET_DBG(snet->pdev, "Resume SNET[%u] device\n", snet->sid);
> +
> +   return ret;
> +}
> +
>  static const struct vdpa_config_ops snet_config_ops = {
> .set_vq_address = snet_set_vq_address,
> .set_vq_num = snet_set_vq_num,
> @@ -536,6 +550,7 @@ static const struct vdpa_config_ops snet_config_ops = {
> .get_config = snet_get_config,
> .set_config = snet_set_config,
> .suspend= snet_suspend,
> +   .resume = snet_resume,
>  };
>
>  static int psnet_open_pf_bar(struct pci_dev *pdev, struct psnet *psnet)
> diff --git a/drivers/vdpa/solidrun/snet_vdpa.h 
> b/drivers/vdpa/solidrun/snet_vdpa.h
> index 3c78d4e7d48..36ac285835e 100644
> --- a/drivers/vdpa/solidrun/snet_vdpa.h
> +++ b/drivers/vdpa/solidrun/snet_vdpa.h
> @@ -204,5 +204,6 @@ void snet_ctrl_clear(struct snet *snet);
>  int snet_destroy_dev(struct snet *snet);
>  int snet_read_vq_state(struct snet *snet, u16 idx, struct vdpa_vq_state 
> *state);
>  int snet_suspend_dev(struct snet *snet);
> +int snet_resume_dev(struct snet *snet);
>
>  #endif //_SNET_VDPA_H_
> --
> 2.34.1
>
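
As context for how this callback gets exercised, here is a hedged
sketch of the dispatch a vDPA bus core might perform; the helper name
is hypothetical, and only the .resume member of struct vdpa_config_ops
comes from this patch:

/* Hypothetical dispatch sketch, not part of this patch: check that the
 * parent driver implements .resume before calling into it. */
static int vdpa_resume_sketch(struct vdpa_device *vdev)
{
	const struct vdpa_config_ops *ops = vdev->config;

	if (!ops->resume)
		return -EOPNOTSUPP;

	return ops->resume(vdev);
}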


Re: [PATCH v2] vhost_net: revert upend_idx only on retriable error

2023-05-11 Thread Jason Wang
On Tue, Apr 25, 2023 at 4:44 AM Andrey Smetanin
 wrote:
>
> Fix possible virtqueue used buffers leak and corresponding stuck
> in case of temporary -EIO from sendmsg() which is produced by
> tun driver while backend device is not up.
>
> In case of no-retriable error and zcopy do not revert upend_idx
> to pass packet data (that is update used_idx in corresponding
> vhost_zerocopy_signal_used()) as if packet data has been
> transferred successfully.
>
> v2: set vq->heads[ubuf->desc].len equal to VHOST_DMA_DONE_LEN
> in case of fake successful transmit.
>
> Signed-off-by: Andrey Smetanin 

Acked-by: Jason Wang 

Thanks

> ---
>  drivers/vhost/net.c | 11 ---
>  1 file changed, 8 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index 20265393aee7..0791fbdb3975 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -934,13 +934,18 @@ static void handle_tx_zerocopy(struct vhost_net *net, 
> struct socket *sock)
>
> err = sock->ops->sendmsg(sock, &msg, len);
> if (unlikely(err < 0)) {
> +   bool retry = err == -EAGAIN || err == -ENOMEM || err 
> == -ENOBUFS;
> +
> if (zcopy_used) {
> if (vq->heads[ubuf->desc].len == 
> VHOST_DMA_IN_PROGRESS)
> vhost_net_ubuf_put(ubufs);
> -   nvq->upend_idx = ((unsigned)nvq->upend_idx - 
> 1)
> -   % UIO_MAXIOV;
> +   if (retry)
> +   nvq->upend_idx = 
> ((unsigned)nvq->upend_idx - 1)
> +   % UIO_MAXIOV;
> +   else
> +   vq->heads[ubuf->desc].len = 
> VHOST_DMA_DONE_LEN;
> }
> -   if (err == -EAGAIN || err == -ENOMEM || err == 
> -ENOBUFS) {
> +   if (retry) {
> vhost_discard_vq_desc(vq, 1);
> vhost_net_enable_vq(net, vq);
> break;
> --
> 2.25.1
>
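
The heart of this fix is how sendmsg() errors are classified; a minimal
sketch of that predicate follows (the helper name is hypothetical, the
error list is taken from the patch):

/* Sketch: only transient resource errors are worth retrying. A
 * non-retriable error such as -EIO from a down tun backend is treated
 * as a completed (dropped) transmit, so used_idx can advance and the
 * used buffers are not leaked. */
static bool vhost_tx_retriable(int err)
{
	return err == -EAGAIN || err == -ENOMEM || err == -ENOBUFS;
}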


Re: [PATCH net-next V2 1/2] virtio-net: convert rx mode setting to use workqueue

2023-05-14 Thread Jason Wang
On Wed, May 10, 2023 at 1:33 PM Michael S. Tsirkin  wrote:
>
> On Mon, Apr 17, 2023 at 11:40:58AM +0800, Jason Wang wrote:
> > On Fri, Apr 14, 2023 at 3:21 PM Michael S. Tsirkin  wrote:
> > >
> > > On Fri, Apr 14, 2023 at 01:04:15PM +0800, Jason Wang wrote:
> > > > Forget to cc netdev, adding.
> > > >
> > > > On Fri, Apr 14, 2023 at 12:25 AM Michael S. Tsirkin  
> > > > wrote:
> > > > >
> > > > > On Thu, Apr 13, 2023 at 02:40:26PM +0800, Jason Wang wrote:
> > > > > > This patch converts rx mode setting to be done in a workqueue; this
> > > > > > is a must to allow sleeping when waiting for the cvq command to
> > > > > > respond, since the current code is executed under the addr spin lock.
> > > > > >
> > > > > > Signed-off-by: Jason Wang 
> > > > >
> > > > > I don't like this frankly. This means that setting RX mode which would
> > > > > previously be reliable, now becomes unreliable.
> > > >
> > > > It is "unreliable" by design:
> > > >
> > > >   void(*ndo_set_rx_mode)(struct net_device 
> > > > *dev);
> > > >
> > > > > - first of all configuration is no longer immediate
> > > >
> > > > Is immediate a hard requirement? I can see a workqueue is used at least:
> > > >
> > > > mlx5e, ipoib, efx, ...
> > > >
> > > > >   and there is no way for driver to find out when
> > > > >   it actually took effect
> > > >
> > > > But we know rx mode is best effort, e.g. it doesn't support vhost, and we
> > > > have survived this for years.
> > > >
> > > > > - second, if device fails command, this is also not
> > > > >   propagated to driver, again no way for driver to find out
> > > > >
> > > > > VDUSE needs to be fixed to do tricks to fix this
> > > > > without breaking normal drivers.
> > > >
> > > > It's not specific to VDUSE. For example, when using virtio-net in the
> > > > UP environment with any software cvq (like mlx5 via vDPA or cma
> > > > transport).
> > > >
> > > > Thanks
> > >
> > > Hmm. Can we differentiate between these use-cases?
> >
> > It doesn't look easy since we are a driver on the virtio bus; the
> > underlying details are hidden from virtio-net.
> >
> > Or do you have any ideas on this?
> >
> > Thanks
>
> I don't know, pass some kind of flag in struct virtqueue?
> "bool slow; /* This vq can be very slow sometimes. Don't wait for it! 
> */"
>
> ?
>

So if it's slow, sleep, otherwise poll?

I feel setting this flag might be tricky, since the driver doesn't
know whether or not it's really slow. E.g. a smartNIC vendor may allow
virtio-net emulation over PCI.

Thanks

> --
> MST
>
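
For readers following this thread, here is a minimal sketch of the
conversion under discussion, assuming a work item field and handler
name that may differ from the actual patch: ndo_set_rx_mode() is called
under the address-list spinlock, so it cannot sleep; the sleeping cvq
interaction is deferred to process context.

/* Sketch only. The spinlocked callback just schedules work; the work
 * item runs in process context, where waiting for the control
 * virtqueue to respond may sleep. */
static void virtnet_rx_mode_work(struct work_struct *work)
{
	struct virtnet_info *vi =
		container_of(work, struct virtnet_info, rx_mode_work);

	/* ... issue VIRTIO_NET_CTRL_RX/MAC commands; sleeping is OK ... */
}

static void virtnet_set_rx_mode(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);

	/* Called under netif_addr_lock: must not sleep here. */
	schedule_work(&vi->rx_mode_work);
}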


Re: [PATCH net v6] virtio_net: Fix error unwinding of XDP initialization

2023-05-14 Thread Jason Wang
On Fri, May 12, 2023 at 11:18 PM Feng Liu  wrote:
>
> When initializing XDP in virtnet_open(), some rq xdp initialization
> may hit an error causing net device open failed. However, previous
> rqs have already initialized XDP and enabled NAPI, which is not the
> expected behavior. Need to roll back the previous rq initialization
> to avoid leaks in error unwinding of init code.
>
> Also extract helper functions to disable and enable a queue pair.
> Use the newly introduced disable helper in error unwinding and in
> virtnet_close, and the enable helper in virtnet_open.
>
> Fixes: 754b8a21a96d ("virtio_net: setup xdp_rxq_info")
> Signed-off-by: Feng Liu 
> Reviewed-by: Jiri Pirko 
> Reviewed-by: William Tu 

Acked-by: Jason Wang 

Thanks

> ---
> v5 -> v6
> feedbacks from Xuan Zhuo
> - add disable_delayed_refill and cancel_delayed_work_sync
>
> v4 -> v5
> feedbacks from Michael S. Tsirkin
> - rename helper as virtnet_disable_queue_pair
> - rename helper as virtnet_enable_queue_pair
>
> v3 -> v4
> feedbacks from Jiri Pirko
> - Add symmetric helper function virtnet_enable_qp to enable queues.
> - Error handling: clean up the current queue pair in virtnet_enable_qp,
>   and complete the cleanup of the rest of the queue pairs in virtnet_open.
> - Fix coding style.
> feedbacks from Parav Pandit
> - Remove redundant debug message and white space.
>
> v2 -> v3
> feedbacks from Michael S. Tsirkin
> - Remove redundant comment.
>
> v1 -> v2
> feedbacks from Michael S. Tsirkin
> - squash two patches together.
>
> ---
>  drivers/net/virtio_net.c | 61 +---
>  1 file changed, 44 insertions(+), 17 deletions(-)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index a12ae26db0e2..56ca1d270304 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -1868,6 +1868,38 @@ static int virtnet_poll(struct napi_struct *napi, int 
> budget)
> return received;
>  }
>
> +static void virtnet_disable_queue_pair(struct virtnet_info *vi, int qp_index)
> +{
> +   virtnet_napi_tx_disable(&vi->sq[qp_index].napi);
> +   napi_disable(&vi->rq[qp_index].napi);
> +   xdp_rxq_info_unreg(&vi->rq[qp_index].xdp_rxq);
> +}
> +
> +static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index)
> +{
> +   struct net_device *dev = vi->dev;
> +   int err;
> +
> +   err = xdp_rxq_info_reg(&vi->rq[qp_index].xdp_rxq, dev, qp_index,
> +  vi->rq[qp_index].napi.napi_id);
> +   if (err < 0)
> +   return err;
> +
> +   err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> +MEM_TYPE_PAGE_SHARED, NULL);
> +   if (err < 0)
> +   goto err_xdp_reg_mem_model;
> +
> +   virtnet_napi_enable(vi->rq[qp_index].vq, &vi->rq[qp_index].napi);
> +   virtnet_napi_tx_enable(vi, vi->sq[qp_index].vq, 
> &vi->sq[qp_index].napi);
> +
> +   return 0;
> +
> +err_xdp_reg_mem_model:
> +   xdp_rxq_info_unreg(&vi->rq[qp_index].xdp_rxq);
> +   return err;
> +}
> +
>  static int virtnet_open(struct net_device *dev)
>  {
> struct virtnet_info *vi = netdev_priv(dev);
> @@ -1881,22 +1913,20 @@ static int virtnet_open(struct net_device *dev)
> if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
> schedule_delayed_work(&vi->refill, 0);
>
> -   err = xdp_rxq_info_reg(&vi->rq[i].xdp_rxq, dev, i, 
> vi->rq[i].napi.napi_id);
> +   err = virtnet_enable_queue_pair(vi, i);
> if (err < 0)
> -   return err;
> -
> -   err = xdp_rxq_info_reg_mem_model(&vi->rq[i].xdp_rxq,
> -MEM_TYPE_PAGE_SHARED, NULL);
> -   if (err < 0) {
> -   xdp_rxq_info_unreg(&vi->rq[i].xdp_rxq);
> -   return err;
> -   }
> -
> -   virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
> -   virtnet_napi_tx_enable(vi, vi->sq[i].vq, &vi->sq[i].napi);
> +   goto err_enable_qp;
> }
>
> return 0;
> +
> +err_enable_qp:
> +   disable_delayed_refill(vi);
> +   cancel_delayed_work_sync(&vi->refill);
> +
> +   for (i--; i >= 0; i--)
> +   virtnet_disable_queue_pair(vi, i);
> +   return err;
>  }
>
static int virtnet_poll_tx(struct napi_struct *napi, int budget)

Re: [PATCH vhost v8 01/12] virtio_ring: split: separate dma codes

2023-05-14 Thread Jason Wang
On Fri, May 12, 2023 at 11:27 PM Christoph Hellwig  wrote:
>
> As said before, please don't try to do weird runtime checks based
> on the scatterlist.  What you have works for now, but there are
> plans to repalce the page + offset tuple in the scatterlist with
> just a phys_addr_t.  And with that your "clever" scheme will break
> instantly.
>

Xuan, I think we probably need to go back to your original method, that
is, having a dedicated flag and helper for pre-mapped buffers.

Thanks
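
A rough sketch of what such a dedicated mechanism could look like; the
flag and helper below are hypothetical, not an existing virtio_ring
API:

/* Hypothetical sketch: the driver states explicitly that buffers for
 * this virtqueue are already DMA-mapped, so the ring code skips its
 * own dma_map/unmap calls instead of inferring the mapping state from
 * the scatterlist contents. */
struct vring_virtqueue_sketch {
	bool premapped;	/* set by the driver before adding buffers */
};

static void virtqueue_set_premapped(struct vring_virtqueue_sketch *vq)
{
	vq->premapped = true;
}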


Re: [PATCH v5 virtio 01/11] virtio: allow caller to override device id in vp_modern

2023-05-14 Thread Jason Wang


On 2023/5/4 02:12, Shannon Nelson wrote:

To add a bit of vendor flexibility with various virtio based devices,
allow the caller to check for a different device id.  This adds a function
pointer field to struct virtio_pci_modern_device to specify an override
device id check.  If defined by the driver, this function will be called
to check that the PCI device is the vendor's expected device, and will
return the found device id to be stored in mdev->id.device.  This allows
vendors with alternative vendor device ids to use this library on their
own device BAR.

Note: A lot of the diff in this is simply indenting the existing code
into an else block.

Signed-off-by: Shannon Nelson 



Acked-by: Jason Wang 

Thanks



---
  drivers/virtio/virtio_pci_modern_dev.c | 30 +++++++++++++++++++-----------
  include/linux/virtio_pci_modern.h  |  3 +++
  2 files changed, 22 insertions(+), 11 deletions(-)

diff --git a/drivers/virtio/virtio_pci_modern_dev.c 
b/drivers/virtio/virtio_pci_modern_dev.c
index 869cb46bef96..9b2d6614de67 100644
--- a/drivers/virtio/virtio_pci_modern_dev.c
+++ b/drivers/virtio/virtio_pci_modern_dev.c
@@ -218,21 +218,29 @@ int vp_modern_probe(struct virtio_pci_modern_device *mdev)
int err, common, isr, notify, device;
u32 notify_length;
u32 notify_offset;
+   int devid;
  
  	check_offsets();
  
-	/* We only own devices >= 0x1000 and <= 0x107f: leave the rest. */

-   if (pci_dev->device < 0x1000 || pci_dev->device > 0x107f)
-   return -ENODEV;
-
-   if (pci_dev->device < 0x1040) {
-   /* Transitional devices: use the PCI subsystem device id as
-* virtio device id, same as legacy driver always did.
-*/
-   mdev->id.device = pci_dev->subsystem_device;
+   if (mdev->device_id_check) {
+   devid = mdev->device_id_check(pci_dev);
+   if (devid < 0)
+   return devid;
+   mdev->id.device = devid;
} else {
-   /* Modern devices: simply use PCI device id, but start from 
0x1040. */
-   mdev->id.device = pci_dev->device - 0x1040;
+   /* We only own devices >= 0x1000 and <= 0x107f: leave the rest. 
*/
+   if (pci_dev->device < 0x1000 || pci_dev->device > 0x107f)
+   return -ENODEV;
+
+   if (pci_dev->device < 0x1040) {
+   /* Transitional devices: use the PCI subsystem device 
id as
+* virtio device id, same as legacy driver always did.
+*/
+   mdev->id.device = pci_dev->subsystem_device;
+   } else {
+   /* Modern devices: simply use PCI device id, but start 
from 0x1040. */
+   mdev->id.device = pci_dev->device - 0x1040;
+   }
}
mdev->id.vendor = pci_dev->subsystem_vendor;
  
diff --git a/include/linux/virtio_pci_modern.h b/include/linux/virtio_pci_modern.h

index c4eeb79b0139..e7b1db1dd0bb 100644
--- a/include/linux/virtio_pci_modern.h
+++ b/include/linux/virtio_pci_modern.h
@@ -38,6 +38,9 @@ struct virtio_pci_modern_device {
int modern_bars;
  
  	struct virtio_device_id id;

+
+   /* optional check for vendor virtio device, returns dev_id or -ERRNO */
+   int (*device_id_check)(struct pci_dev *pdev);
  };
  
  /*
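
A hedged usage sketch of the new hook; the vendor and device IDs below
are placeholders (the real user, pds_vdpa, appears later in this
series):

/* Illustration only; MY_VENDOR_ID and MY_DEVICE_ID are placeholders. */
static int my_device_id_check(struct pci_dev *pdev)
{
	if (pdev->vendor != MY_VENDOR_ID || pdev->device != MY_DEVICE_ID)
		return -ENODEV;

	/* The non-negative return value ends up in mdev->id.device. */
	return MY_DEVICE_ID;
}

	/* ... in the driver's probe path ... */
	mdev->device_id_check = my_device_id_check;
	err = vp_modern_probe(mdev);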



Re: [PATCH v5 virtio 02/11] virtio: allow caller to override device DMA mask in vp_modern

2023-05-14 Thread Jason Wang


On 2023/5/4 02:12, Shannon Nelson wrote:

To add a bit of vendor flexibility with various virtio based devices,
allow the caller to specify a different DMA mask.  This adds a dma_mask
field to struct virtio_pci_modern_device.  If defined by the driver,
this mask will be used in a call to dma_set_mask_and_coherent() instead
of the traditional DMA_BIT_MASK(64).  This allows limiting the DMA space
on vendor devices with address limitations.

Signed-off-by: Shannon Nelson 



Acked-by: Jason Wang 

Thanks



---
  drivers/virtio/virtio_pci_modern_dev.c | 3 ++-
  include/linux/virtio_pci_modern.h  | 3 +++
  2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/virtio/virtio_pci_modern_dev.c 
b/drivers/virtio/virtio_pci_modern_dev.c
index 9b2d6614de67..aad7d9296e77 100644
--- a/drivers/virtio/virtio_pci_modern_dev.c
+++ b/drivers/virtio/virtio_pci_modern_dev.c
@@ -268,7 +268,8 @@ int vp_modern_probe(struct virtio_pci_modern_device *mdev)
return -EINVAL;
}
  
-	err = dma_set_mask_and_coherent(&pci_dev->dev, DMA_BIT_MASK(64));

+   err = dma_set_mask_and_coherent(&pci_dev->dev,
+   mdev->dma_mask ? : DMA_BIT_MASK(64));
if (err)
err = dma_set_mask_and_coherent(&pci_dev->dev,
DMA_BIT_MASK(32));
diff --git a/include/linux/virtio_pci_modern.h 
b/include/linux/virtio_pci_modern.h
index e7b1db1dd0bb..067ac1d789bc 100644
--- a/include/linux/virtio_pci_modern.h
+++ b/include/linux/virtio_pci_modern.h
@@ -41,6 +41,9 @@ struct virtio_pci_modern_device {
  
  	/* optional check for vendor virtio device, returns dev_id or -ERRNO */

int (*device_id_check)(struct pci_dev *pdev);
+
+   /* optional mask for devices with limited DMA space */
+   u64 dma_mask;
  };
  
  /*
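
A short usage sketch; the 52-bit mask is an arbitrary example
(pds_vdpa later in this series uses DMA_BIT_MASK(PDS_CORE_ADDR_LEN)),
and leaving dma_mask at 0 keeps the default DMA_BIT_MASK(64) behavior
shown in the hunk above:

	/* Illustration: a device that can only address 52 bits of DMA
	 * space sets the mask before probing the modern virtio BARs. */
	mdev->dma_mask = DMA_BIT_MASK(52);
	err = vp_modern_probe(mdev);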



Re: [PATCH v5 virtio 04/11] pds_vdpa: move enum from common to adminq header

2023-05-14 Thread Jason Wang


On 2023/5/4 02:12, Shannon Nelson wrote:

The pds_core_logical_qtype enum and IFNAMSIZ are not needed
in the common PDS header, only needed when working with the
adminq, so move them to the adminq header.

Note: This patch might conflict with pds_vfio patches that are
   in review, depending on which patchset gets pulled first.

Signed-off-by: Shannon Nelson 



Acked-by: Jason Wang 

Thanks



---
  include/linux/pds/pds_adminq.h | 21 +++++++++++++++++++++
  include/linux/pds/pds_common.h | 21 ---------------------
  2 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/include/linux/pds/pds_adminq.h b/include/linux/pds/pds_adminq.h
index 98a60ce87b92..61b0a8634e1a 100644
--- a/include/linux/pds/pds_adminq.h
+++ b/include/linux/pds/pds_adminq.h
@@ -222,6 +222,27 @@ enum pds_core_lif_type {
PDS_CORE_LIF_TYPE_DEFAULT = 0,
  };
  
+#define PDS_CORE_IFNAMSIZ		16

+
+/**
+ * enum pds_core_logical_qtype - Logical Queue Types
+ * @PDS_CORE_QTYPE_ADMINQ:Administrative Queue
+ * @PDS_CORE_QTYPE_NOTIFYQ:   Notify Queue
+ * @PDS_CORE_QTYPE_RXQ:   Receive Queue
+ * @PDS_CORE_QTYPE_TXQ:   Transmit Queue
+ * @PDS_CORE_QTYPE_EQ:Event Queue
+ * @PDS_CORE_QTYPE_MAX:   Max queue type supported
+ */
+enum pds_core_logical_qtype {
+   PDS_CORE_QTYPE_ADMINQ  = 0,
+   PDS_CORE_QTYPE_NOTIFYQ = 1,
+   PDS_CORE_QTYPE_RXQ = 2,
+   PDS_CORE_QTYPE_TXQ = 3,
+   PDS_CORE_QTYPE_EQ  = 4,
+
+   PDS_CORE_QTYPE_MAX = 16   /* don't change - used in struct size */
+};
+
  /**
   * union pds_core_lif_config - LIF configuration
   * @state:LIF state (enum pds_core_lif_state)
diff --git a/include/linux/pds/pds_common.h b/include/linux/pds/pds_common.h
index 2a0d1669cfd0..435c8e8161c2 100644
--- a/include/linux/pds/pds_common.h
+++ b/include/linux/pds/pds_common.h
@@ -41,27 +41,6 @@ enum pds_core_vif_types {
  
  #define PDS_VDPA_DEV_NAME	PDS_CORE_DRV_NAME "." PDS_DEV_TYPE_VDPA_STR
  
-#define PDS_CORE_IFNAMSIZ		16

-
-/**
- * enum pds_core_logical_qtype - Logical Queue Types
- * @PDS_CORE_QTYPE_ADMINQ:Administrative Queue
- * @PDS_CORE_QTYPE_NOTIFYQ:   Notify Queue
- * @PDS_CORE_QTYPE_RXQ:   Receive Queue
- * @PDS_CORE_QTYPE_TXQ:   Transmit Queue
- * @PDS_CORE_QTYPE_EQ:Event Queue
- * @PDS_CORE_QTYPE_MAX:   Max queue type supported
- */
-enum pds_core_logical_qtype {
-   PDS_CORE_QTYPE_ADMINQ  = 0,
-   PDS_CORE_QTYPE_NOTIFYQ = 1,
-   PDS_CORE_QTYPE_RXQ = 2,
-   PDS_CORE_QTYPE_TXQ = 3,
-   PDS_CORE_QTYPE_EQ  = 4,
-
-   PDS_CORE_QTYPE_MAX = 16   /* don't change - used in struct size */
-};
-
  int pdsc_register_notify(struct notifier_block *nb);
  void pdsc_unregister_notify(struct notifier_block *nb);
  void *pdsc_get_pf_struct(struct pci_dev *vf_pdev);



Re: [PATCH v5 virtio 05/11] pds_vdpa: new adminq entries

2023-05-14 Thread Jason Wang
On Thu, May 4, 2023 at 2:13 AM Shannon Nelson  wrote:
>
> Add new adminq definitions in support for vDPA operations.
>
> Signed-off-by: Shannon Nelson 

Acked-by: Jason Wang 

Thanks

> ---
>  include/linux/pds/pds_adminq.h | 266 +
>  1 file changed, 266 insertions(+)
>
> diff --git a/include/linux/pds/pds_adminq.h b/include/linux/pds/pds_adminq.h
> index 61b0a8634e1a..c66ead725434 100644
> --- a/include/linux/pds/pds_adminq.h
> +++ b/include/linux/pds/pds_adminq.h
> @@ -605,6 +605,257 @@ struct pds_core_q_init_comp {
> u8 color;
>  };
>
> +/*
> + * enum pds_vdpa_cmd_opcode - vDPA Device commands
> + */
> +enum pds_vdpa_cmd_opcode {
> +   PDS_VDPA_CMD_INIT   = 48,
> +   PDS_VDPA_CMD_IDENT  = 49,
> +   PDS_VDPA_CMD_RESET  = 51,
> +   PDS_VDPA_CMD_VQ_RESET   = 52,
> +   PDS_VDPA_CMD_VQ_INIT= 53,
> +   PDS_VDPA_CMD_STATUS_UPDATE  = 54,
> +   PDS_VDPA_CMD_SET_FEATURES   = 55,
> +   PDS_VDPA_CMD_SET_ATTR   = 56,
> +   PDS_VDPA_CMD_VQ_SET_STATE   = 57,
> +   PDS_VDPA_CMD_VQ_GET_STATE   = 58,
> +};
> +
> +/**
> + * struct pds_vdpa_cmd - generic command
> + * @opcode:Opcode
> + * @vdpa_index:Index for vdpa subdevice
> + * @vf_id: VF id
> + */
> +struct pds_vdpa_cmd {
> +   u8 opcode;
> +   u8 vdpa_index;
> +   __le16 vf_id;
> +};
> +
> +/**
> + * struct pds_vdpa_init_cmd - INIT command
> + * @opcode:Opcode PDS_VDPA_CMD_INIT
> + * @vdpa_index: Index for vdpa subdevice
> + * @vf_id: VF id
> + */
> +struct pds_vdpa_init_cmd {
> +   u8 opcode;
> +   u8 vdpa_index;
> +   __le16 vf_id;
> +};
> +
> +/**
> + * struct pds_vdpa_ident - vDPA identification data
> + * @hw_features:   vDPA features supported by device
> + * @max_vqs:   max queues available (2 queues for a single queuepair)
> + * @max_qlen:  log(2) of maximum number of descriptors
> + * @min_qlen:  log(2) of minimum number of descriptors
> + *
> + * This struct is used in a DMA block that is set up for the 
> PDS_VDPA_CMD_IDENT
> + * transaction.  Set up the DMA block and send the address in the IDENT cmd
> + * data, the DSC will write the ident information, then we can remove the DMA
> + * block after reading the answer.  If the completion status is 0, then there
> + * is valid information, else there was an error and the data should be 
> invalid.
> + */
> +struct pds_vdpa_ident {
> +   __le64 hw_features;
> +   __le16 max_vqs;
> +   __le16 max_qlen;
> +   __le16 min_qlen;
> +};
> +
> +/**
> + * struct pds_vdpa_ident_cmd - IDENT command
> + * @opcode:Opcode PDS_VDPA_CMD_IDENT
> + * @rsvd:   Word boundary padding
> + * @vf_id: VF id
> + * @len:   length of ident info DMA space
> + * @ident_pa:  address for DMA of ident info (struct pds_vdpa_ident)
> + * only used for this transaction, then forgotten by DSC
> + */
> +struct pds_vdpa_ident_cmd {
> +   u8 opcode;
> +   u8 rsvd;
> +   __le16 vf_id;
> +   __le32 len;
> +   __le64 ident_pa;
> +};
> +
> +/**
> + * struct pds_vdpa_status_cmd - STATUS_UPDATE command
> + * @opcode:Opcode PDS_VDPA_CMD_STATUS_UPDATE
> + * @vdpa_index: Index for vdpa subdevice
> + * @vf_id: VF id
> + * @status:new status bits
> + */
> +struct pds_vdpa_status_cmd {
> +   u8 opcode;
> +   u8 vdpa_index;
> +   __le16 vf_id;
> +   u8 status;
> +};
> +
> +/**
> + * enum pds_vdpa_attr - List of VDPA device attributes
> + * @PDS_VDPA_ATTR_MAC:  MAC address
> + * @PDS_VDPA_ATTR_MAX_VQ_PAIRS: Max virtqueue pairs
> + */
> +enum pds_vdpa_attr {
> +   PDS_VDPA_ATTR_MAC  = 1,
> +   PDS_VDPA_ATTR_MAX_VQ_PAIRS = 2,
> +};
> +
> +/**
> + * struct pds_vdpa_setattr_cmd - SET_ATTR command
> + * @opcode:Opcode PDS_VDPA_CMD_SET_ATTR
> + * @vdpa_index:Index for vdpa subdevice
> + * @vf_id: VF id
> + * @attr:  attribute to be changed (enum pds_vdpa_attr)
> + * @pad:   Word boundary padding
> + * @mac:   new mac address to be assigned as vdpa device address
> + * @max_vq_pairs:  new limit of virtqueue pairs
> + */
> +struct pds_vdpa_setattr_cmd {
> +   u8 opcode;
> +   u8 vdpa_index;
> +   __le16 vf_id;
> +   u8 attr;
> +   u8 pad[3];
> +   union {
> +   u8 mac[6];
> +   __le16 max_vq_pairs;
> +   } __packed;

Re: [PATCH v5 virtio 07/11] pds_vdpa: virtio bar setup for vdpa

2023-05-14 Thread Jason Wang
On Thu, May 4, 2023 at 2:13 AM Shannon Nelson  wrote:
>
> Prep and use the "modern" virtio bar utilities to get our
> virtio config space ready.
>
> Signed-off-by: Shannon Nelson 

Acked-by: Jason Wang 

Thanks

> ---
>  drivers/vdpa/pds/aux_drv.c | 25 +
>  drivers/vdpa/pds/aux_drv.h |  3 +++
>  2 files changed, 28 insertions(+)
>
> diff --git a/drivers/vdpa/pds/aux_drv.c b/drivers/vdpa/pds/aux_drv.c
> index aa748cf55d2b..0c4a135b1484 100644
> --- a/drivers/vdpa/pds/aux_drv.c
> +++ b/drivers/vdpa/pds/aux_drv.c
> @@ -4,6 +4,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>
>  #include 
>  #include 
> @@ -19,12 +20,22 @@ static const struct auxiliary_device_id 
> pds_vdpa_id_table[] = {
> {},
>  };
>
> +static int pds_vdpa_device_id_check(struct pci_dev *pdev)
> +{
> +   if (pdev->device != PCI_DEVICE_ID_PENSANDO_VDPA_VF ||
> +   pdev->vendor != PCI_VENDOR_ID_PENSANDO)
> +   return -ENODEV;
> +
> +   return PCI_DEVICE_ID_PENSANDO_VDPA_VF;
> +}
> +
>  static int pds_vdpa_probe(struct auxiliary_device *aux_dev,
>   const struct auxiliary_device_id *id)
>
>  {
> struct pds_auxiliary_dev *padev =
> container_of(aux_dev, struct pds_auxiliary_dev, aux_dev);
> +   struct device *dev = &aux_dev->dev;
> struct pds_vdpa_aux *vdpa_aux;
> int err;
>
> @@ -41,8 +52,21 @@ static int pds_vdpa_probe(struct auxiliary_device *aux_dev,
> if (err)
> goto err_free_mem;
>
> +   /* Find the virtio configuration */
> +   vdpa_aux->vd_mdev.pci_dev = padev->vf_pdev;
> +   vdpa_aux->vd_mdev.device_id_check = pds_vdpa_device_id_check;
> +   vdpa_aux->vd_mdev.dma_mask = DMA_BIT_MASK(PDS_CORE_ADDR_LEN);
> +   err = vp_modern_probe(&vdpa_aux->vd_mdev);
> +   if (err) {
> +   dev_err(dev, "Unable to probe for virtio configuration: 
> %pe\n",
> +   ERR_PTR(err));
> +   goto err_free_mgmt_info;
> +   }
> +
> return 0;
>
> +err_free_mgmt_info:
> +   pci_free_irq_vectors(padev->vf_pdev);
>  err_free_mem:
> kfree(vdpa_aux);
> auxiliary_set_drvdata(aux_dev, NULL);
> @@ -55,6 +79,7 @@ static void pds_vdpa_remove(struct auxiliary_device 
> *aux_dev)
> struct pds_vdpa_aux *vdpa_aux = auxiliary_get_drvdata(aux_dev);
> struct device *dev = &aux_dev->dev;
>
> +   vp_modern_remove(&vdpa_aux->vd_mdev);
> pci_free_irq_vectors(vdpa_aux->padev->vf_pdev);
>
> kfree(vdpa_aux);
> diff --git a/drivers/vdpa/pds/aux_drv.h b/drivers/vdpa/pds/aux_drv.h
> index dcec782e79eb..99e0ff340bfa 100644
> --- a/drivers/vdpa/pds/aux_drv.h
> +++ b/drivers/vdpa/pds/aux_drv.h
> @@ -4,6 +4,8 @@
>  #ifndef _AUX_DRV_H_
>  #define _AUX_DRV_H_
>
> +#include 
> +
>  #define PDS_VDPA_DRV_DESCRIPTION"AMD/Pensando vDPA VF Device Driver"
>  #define PDS_VDPA_DRV_NAME   KBUILD_MODNAME
>
> @@ -16,6 +18,7 @@ struct pds_vdpa_aux {
>
> int vf_id;
> struct dentry *dentry;
> +   struct virtio_pci_modern_device vd_mdev;
>
> int nintrs;
>  };
> --
> 2.17.1
>


Re: [PATCH v5 virtio 10/11] pds_vdpa: subscribe to the pds_core events

2023-05-14 Thread Jason Wang
On Thu, May 4, 2023 at 2:13 AM Shannon Nelson  wrote:
>
> Register for the pds_core's notification events, primarily to
> find out when the FW has been reset so we can pass this on
> back up the chain.
>
> Signed-off-by: Shannon Nelson 
> ---
>  drivers/vdpa/pds/vdpa_dev.c | 68 -
>  drivers/vdpa/pds/vdpa_dev.h |  1 +
>  2 files changed, 68 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/vdpa/pds/vdpa_dev.c b/drivers/vdpa/pds/vdpa_dev.c
> index 9970657cdb3d..377eefc2fa1e 100644
> --- a/drivers/vdpa/pds/vdpa_dev.c
> +++ b/drivers/vdpa/pds/vdpa_dev.c
> @@ -21,6 +21,61 @@ static struct pds_vdpa_device *vdpa_to_pdsv(struct 
> vdpa_device *vdpa_dev)
> return container_of(vdpa_dev, struct pds_vdpa_device, vdpa_dev);
>  }
>
> +static int pds_vdpa_notify_handler(struct notifier_block *nb,
> +  unsigned long ecode,
> +  void *data)
> +{
> +   struct pds_vdpa_device *pdsv = container_of(nb, struct 
> pds_vdpa_device, nb);
> +   struct device *dev = &pdsv->vdpa_aux->padev->aux_dev.dev;
> +
> +   dev_dbg(dev, "%s: event code %lu\n", __func__, ecode);
> +
> +   /* Give the upper layers a hint that something interesting
> +* may have happened.  It seems that the only thing this
> +* triggers in the virtio-net drivers above us is a check
> +* of link status.
> +*
> +* We don't set the NEEDS_RESET flag for EVENT_RESET
> +* because we're likely going through a recovery or
> +* fw_update and will be back up and running soon.
> +*/
> +   if (ecode == PDS_EVENT_RESET || ecode == PDS_EVENT_LINK_CHANGE) {

The code here seems to conflict with the comment above. If we don't
set NEEDS_RESET, there's no need for the config callback?

Thanks

> +   if (pdsv->config_cb.callback)
> +   pdsv->config_cb.callback(pdsv->config_cb.private);
> +   }
> +
> +   return 0;
> +}
> +
> +static int pds_vdpa_register_event_handler(struct pds_vdpa_device *pdsv)
> +{
> +   struct device *dev = &pdsv->vdpa_aux->padev->aux_dev.dev;
> +   struct notifier_block *nb = &pdsv->nb;
> +   int err;
> +
> +   if (!nb->notifier_call) {
> +   nb->notifier_call = pds_vdpa_notify_handler;
> +   err = pdsc_register_notify(nb);
> +   if (err) {
> +   nb->notifier_call = NULL;
> +   dev_err(dev, "failed to register pds event handler: 
> %ps\n",
> +   ERR_PTR(err));
> +   return -EINVAL;
> +   }
> +   dev_dbg(dev, "pds event handler registered\n");
> +   }
> +
> +   return 0;
> +}
> +
> +static void pds_vdpa_unregister_event_handler(struct pds_vdpa_device *pdsv)
> +{
> +   if (pdsv->nb.notifier_call) {
> +   pdsc_unregister_notify(&pdsv->nb);
> +   pdsv->nb.notifier_call = NULL;
> +   }
> +}
> +
>  static int pds_vdpa_set_vq_address(struct vdpa_device *vdpa_dev, u16 qid,
>u64 desc_addr, u64 driver_addr, u64 
> device_addr)
>  {
> @@ -522,6 +577,12 @@ static int pds_vdpa_dev_add(struct vdpa_mgmt_dev *mdev, 
> const char *name,
>
> pdsv->vdpa_dev.mdev = &vdpa_aux->vdpa_mdev;
>
> +   err = pds_vdpa_register_event_handler(pdsv);
> +   if (err) {
> +   dev_err(dev, "Failed to register for PDS events: %pe\n", 
> ERR_PTR(err));
> +   goto err_unmap;
> +   }
> +
> /* We use the _vdpa_register_device() call rather than the
>  * vdpa_register_device() to avoid a deadlock because our
>  * dev_add() is called with the vdpa_dev_lock already set
> @@ -530,13 +591,15 @@ static int pds_vdpa_dev_add(struct vdpa_mgmt_dev *mdev, 
> const char *name,
> err = _vdpa_register_device(&pdsv->vdpa_dev, pdsv->num_vqs);
> if (err) {
> dev_err(dev, "Failed to register to vDPA bus: %pe\n", 
> ERR_PTR(err));
> -   goto err_unmap;
> +   goto err_unevent;
> }
>
> pds_vdpa_debugfs_add_vdpadev(vdpa_aux);
>
> return 0;
>
> +err_unevent:
> +   pds_vdpa_unregister_event_handler(pdsv);
>  err_unmap:
> put_device(&pdsv->vdpa_dev.dev);
> vdpa_aux->pdsv = NULL;
> @@ -546,8 +609,11 @@ static int pds_vdpa_dev_add(struct vdpa_mgmt_dev *mdev, 
> const char *name,
>  static void pds_vdpa_dev_del(struct vdpa_mgmt_dev *mdev,
>  struct vdpa_device *vdpa_dev)
>  {
> +   struct pds_vdpa_device *pdsv = vdpa_to_pdsv(vdpa_dev);
> struct pds_vdpa_aux *vdpa_aux;
>
> +   pds_vdpa_unregister_event_handler(pdsv);
> +
> vdpa_aux = container_of(mdev, struct pds_vdpa_aux, vdpa_mdev);
> _vdpa_unregister_device(vdpa_dev);
>
> diff --git a/drivers/vdpa/pds/vdpa_dev.h b/drivers/vdpa/pds/vdpa_dev.h
> index a21596f438c1..1650a2b08845 100644
> -

Re: [PATCH v5 virtio 08/11] pds_vdpa: add vdpa config client commands

2023-05-14 Thread Jason Wang
On Thu, May 4, 2023 at 2:13 AM Shannon Nelson  wrote:
>
> These are the adminq commands that will be needed for
> setting up and using the vDPA device.  There are a number
> of commands defined in the FW's API, but by making use of
> the FW's virtio BAR we only need a few of these commands
> for vDPA support.
>
> Signed-off-by: Shannon Nelson 

Acked-by: Jason Wang 

Thanks

> ---
>  drivers/vdpa/pds/Makefile   |   1 +
>  drivers/vdpa/pds/cmds.c | 207 
>  drivers/vdpa/pds/cmds.h |  20 
>  drivers/vdpa/pds/vdpa_dev.h |  33 +-
>  4 files changed, 260 insertions(+), 1 deletion(-)
>  create mode 100644 drivers/vdpa/pds/cmds.c
>  create mode 100644 drivers/vdpa/pds/cmds.h
>
> diff --git a/drivers/vdpa/pds/Makefile b/drivers/vdpa/pds/Makefile
> index 13b50394ec64..2e22418e3ab3 100644
> --- a/drivers/vdpa/pds/Makefile
> +++ b/drivers/vdpa/pds/Makefile
> @@ -4,6 +4,7 @@
>  obj-$(CONFIG_PDS_VDPA) := pds_vdpa.o
>
>  pds_vdpa-y := aux_drv.o \
> + cmds.o \
>   vdpa_dev.o
>
>  pds_vdpa-$(CONFIG_DEBUG_FS) += debugfs.o
> diff --git a/drivers/vdpa/pds/cmds.c b/drivers/vdpa/pds/cmds.c
> new file mode 100644
> index ..405711a0a0f8
> --- /dev/null
> +++ b/drivers/vdpa/pds/cmds.c
> @@ -0,0 +1,207 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/* Copyright(c) 2023 Advanced Micro Devices, Inc */
> +
> +#include 
> +#include 
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +#include "vdpa_dev.h"
> +#include "aux_drv.h"
> +#include "cmds.h"
> +
> +int pds_vdpa_init_hw(struct pds_vdpa_device *pdsv)
> +{
> +   struct pds_auxiliary_dev *padev = pdsv->vdpa_aux->padev;
> +   struct device *dev = &padev->aux_dev.dev;
> +   union pds_core_adminq_cmd cmd = {
> +   .vdpa_init.opcode = PDS_VDPA_CMD_INIT,
> +   .vdpa_init.vdpa_index = pdsv->vdpa_index,
> +   .vdpa_init.vf_id = cpu_to_le16(pdsv->vdpa_aux->vf_id),
> +   };
> +   union pds_core_adminq_comp comp = {};
> +   int err;
> +
> +   /* Initialize the vdpa/virtio device */
> +   err = pds_client_adminq_cmd(padev, &cmd, sizeof(cmd.vdpa_init),
> +   &comp, 0);
> +   if (err)
> +   dev_dbg(dev, "Failed to init hw, status %d: %pe\n",
> +   comp.status, ERR_PTR(err));
> +
> +   return err;
> +}
> +
> +int pds_vdpa_cmd_reset(struct pds_vdpa_device *pdsv)
> +{
> +   struct pds_auxiliary_dev *padev = pdsv->vdpa_aux->padev;
> +   struct device *dev = &padev->aux_dev.dev;
> +   union pds_core_adminq_cmd cmd = {
> +   .vdpa.opcode = PDS_VDPA_CMD_RESET,
> +   .vdpa.vdpa_index = pdsv->vdpa_index,
> +   .vdpa.vf_id = cpu_to_le16(pdsv->vdpa_aux->vf_id),
> +   };
> +   union pds_core_adminq_comp comp = {};
> +   int err;
> +
> +   err = pds_client_adminq_cmd(padev, &cmd, sizeof(cmd.vdpa), &comp, 0);
> +   if (err)
> +   dev_dbg(dev, "Failed to reset hw, status %d: %pe\n",
> +   comp.status, ERR_PTR(err));
> +
> +   return err;
> +}
> +
> +int pds_vdpa_cmd_set_mac(struct pds_vdpa_device *pdsv, u8 *mac)
> +{
> +   struct pds_auxiliary_dev *padev = pdsv->vdpa_aux->padev;
> +   struct device *dev = &padev->aux_dev.dev;
> +   union pds_core_adminq_cmd cmd = {
> +   .vdpa_setattr.opcode = PDS_VDPA_CMD_SET_ATTR,
> +   .vdpa_setattr.vdpa_index = pdsv->vdpa_index,
> +   .vdpa_setattr.vf_id = cpu_to_le16(pdsv->vdpa_aux->vf_id),
> +   .vdpa_setattr.attr = PDS_VDPA_ATTR_MAC,
> +   };
> +   union pds_core_adminq_comp comp = {};
> +   int err;
> +
> +   ether_addr_copy(cmd.vdpa_setattr.mac, mac);
> +   err = pds_client_adminq_cmd(padev, &cmd, sizeof(cmd.vdpa_setattr),
> +   &comp, 0);
> +   if (err)
> +   dev_dbg(dev, "Failed to set mac address %pM, status %d: 
> %pe\n",
> +   mac, comp.status, ERR_PTR(err));
> +
> +   return err;
> +}
> +
> +int pds_vdpa_cmd_set_max_vq_pairs(struct pds_vdpa_device *pdsv, u16 max_vqp)
> +{
> +   struct pds_auxiliary_dev *padev = pdsv->vdpa_aux->padev;
> +   struct device *dev = &padev->aux_dev.dev;
> +   union pds_core_adminq_cmd cmd = {
> +   .vdpa_setattr.opcode = PDS_VDPA_CMD_SET_ATTR,
> +   .vdpa_setattr.vdpa_index =

Re: [PATCH v5 virtio 09/11] pds_vdpa: add support for vdpa and vdpamgmt interfaces

2023-05-14 Thread Jason Wang
On Thu, May 4, 2023 at 2:13 AM Shannon Nelson  wrote:
>
> This is the vDPA device support, where we advertise that we can
> support the virtio queues and deal with the configuration work
> through the pds_core's adminq.
>
> Signed-off-by: Shannon Nelson 

Acked-by: Jason Wang 

Thanks

> ---
>  drivers/vdpa/pds/aux_drv.c  |  15 +
>  drivers/vdpa/pds/aux_drv.h  |   1 +
>  drivers/vdpa/pds/debugfs.c  | 261 ++
>  drivers/vdpa/pds/debugfs.h  |   5 +
>  drivers/vdpa/pds/vdpa_dev.c | 532 +++-
>  5 files changed, 813 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/vdpa/pds/aux_drv.c b/drivers/vdpa/pds/aux_drv.c
> index 0c4a135b1484..186e9ee22eb1 100644
> --- a/drivers/vdpa/pds/aux_drv.c
> +++ b/drivers/vdpa/pds/aux_drv.c
> @@ -63,8 +63,21 @@ static int pds_vdpa_probe(struct auxiliary_device *aux_dev,
> goto err_free_mgmt_info;
> }
>
> +   /* Let vdpa know that we can provide devices */
> +   err = vdpa_mgmtdev_register(&vdpa_aux->vdpa_mdev);
> +   if (err) {
> +   dev_err(dev, "%s: Failed to initialize vdpa_mgmt interface: 
> %pe\n",
> +   __func__, ERR_PTR(err));
> +   goto err_free_virtio;
> +   }
> +
> +   pds_vdpa_debugfs_add_pcidev(vdpa_aux);
> +   pds_vdpa_debugfs_add_ident(vdpa_aux);
> +
> return 0;
>
> +err_free_virtio:
> +   vp_modern_remove(&vdpa_aux->vd_mdev);
>  err_free_mgmt_info:
> pci_free_irq_vectors(padev->vf_pdev);
>  err_free_mem:
> @@ -79,9 +92,11 @@ static void pds_vdpa_remove(struct auxiliary_device 
> *aux_dev)
> struct pds_vdpa_aux *vdpa_aux = auxiliary_get_drvdata(aux_dev);
> struct device *dev = &aux_dev->dev;
>
> +   vdpa_mgmtdev_unregister(&vdpa_aux->vdpa_mdev);
> vp_modern_remove(&vdpa_aux->vd_mdev);
> pci_free_irq_vectors(vdpa_aux->padev->vf_pdev);
>
> +   pds_vdpa_debugfs_del_vdpadev(vdpa_aux);
> kfree(vdpa_aux);
> auxiliary_set_drvdata(aux_dev, NULL);
>
> diff --git a/drivers/vdpa/pds/aux_drv.h b/drivers/vdpa/pds/aux_drv.h
> index 99e0ff340bfa..26b75344156e 100644
> --- a/drivers/vdpa/pds/aux_drv.h
> +++ b/drivers/vdpa/pds/aux_drv.h
> @@ -13,6 +13,7 @@ struct pds_vdpa_aux {
> struct pds_auxiliary_dev *padev;
>
> struct vdpa_mgmt_dev vdpa_mdev;
> +   struct pds_vdpa_device *pdsv;
>
> struct pds_vdpa_ident ident;
>
> diff --git a/drivers/vdpa/pds/debugfs.c b/drivers/vdpa/pds/debugfs.c
> index d91dceb07380..0ecd0e2ec6b9 100644
> --- a/drivers/vdpa/pds/debugfs.c
> +++ b/drivers/vdpa/pds/debugfs.c
> @@ -10,6 +10,7 @@
>  #include 
>
>  #include "aux_drv.h"
> +#include "vdpa_dev.h"
>  #include "debugfs.h"
>
>  static struct dentry *dbfs_dir;
> @@ -24,3 +25,263 @@ void pds_vdpa_debugfs_destroy(void)
> debugfs_remove_recursive(dbfs_dir);
> dbfs_dir = NULL;
>  }
> +
> +#define PRINT_SBIT_NAME(__seq, __f, __name) \
> +   do {\
> +   if ((__f) & (__name))   \
> +   seq_printf(__seq, " %s", &#__name[16]); \
> +   } while (0)
> +
> +static void print_status_bits(struct seq_file *seq, u8 status)
> +{
> +   seq_puts(seq, "status:");
> +   PRINT_SBIT_NAME(seq, status, VIRTIO_CONFIG_S_ACKNOWLEDGE);
> +   PRINT_SBIT_NAME(seq, status, VIRTIO_CONFIG_S_DRIVER);
> +   PRINT_SBIT_NAME(seq, status, VIRTIO_CONFIG_S_DRIVER_OK);
> +   PRINT_SBIT_NAME(seq, status, VIRTIO_CONFIG_S_FEATURES_OK);
> +   PRINT_SBIT_NAME(seq, status, VIRTIO_CONFIG_S_NEEDS_RESET);
> +   PRINT_SBIT_NAME(seq, status, VIRTIO_CONFIG_S_FAILED);
> +   seq_puts(seq, "\n");
> +}
> +
> +static void print_feature_bits_all(struct seq_file *seq, u64 features)
> +{
> +   int i;
> +
> +   seq_puts(seq, "features:");
> +
> +   for (i = 0; i < (sizeof(u64) * 8); i++) {
> +   u64 mask = BIT_ULL(i);
> +
> +   switch (features & mask) {
> +   case BIT_ULL(VIRTIO_NET_F_CSUM):
> +   seq_puts(seq, " VIRTIO_NET_F_CSUM");
> +   break;
> +   case BIT_ULL(VIRTIO_NET_F_GUEST_CSUM):
> +   seq_puts(seq, " VIRTIO_NET_F_GUEST_CSUM");
> +   break;
> +   case BIT_ULL(VIRTIO_NET_F_CTRL_GUEST_OFFLOADS):
> +   seq_puts(seq, " VIRTIO_NET_F_CTRL_GUEST_OFFLOADS");

Re: [PATCH v5 virtio 11/11] pds_vdpa: pds_vdps.rst and Kconfig

2023-05-14 Thread Jason Wang
On Thu, May 4, 2023 at 2:13 AM Shannon Nelson  wrote:
>
> Add the documentation and Kconfig entry for pds_vdpa driver.
>
> Signed-off-by: Shannon Nelson 
> ---
>  .../device_drivers/ethernet/amd/pds_vdpa.rst  | 85 +++
>  .../device_drivers/ethernet/index.rst |  1 +
>  MAINTAINERS   |  4 +
>  drivers/vdpa/Kconfig  |  8 ++
>  4 files changed, 98 insertions(+)
>  create mode 100644 
> Documentation/networking/device_drivers/ethernet/amd/pds_vdpa.rst
>
> diff --git 
> a/Documentation/networking/device_drivers/ethernet/amd/pds_vdpa.rst 
> b/Documentation/networking/device_drivers/ethernet/amd/pds_vdpa.rst
> new file mode 100644
> index ..587927d3de92
> --- /dev/null
> +++ b/Documentation/networking/device_drivers/ethernet/amd/pds_vdpa.rst
> @@ -0,0 +1,85 @@
> +.. SPDX-License-Identifier: GPL-2.0+
> +.. note: can be edited and viewed with /usr/bin/formiko-vim
> +
> +==
> +PCI vDPA driver for the AMD/Pensando(R) DSC adapter family
> +==
> +
> +AMD/Pensando vDPA VF Device Driver
> +
> +Copyright(c) 2023 Advanced Micro Devices, Inc
> +
> +Overview
> +
> +
> +The ``pds_vdpa`` driver is an auxiliary bus driver that supplies
> +a vDPA device for use by the virtio network stack.  It is used with
> +the Pensando Virtual Function devices that offer vDPA and virtio queue
> +services.  It depends on the ``pds_core`` driver and hardware for the PF
> +and VF PCI handling as well as for device configuration services.
> +
> +Using the device
> +
> +
> +The ``pds_vdpa`` device is enabled via multiple configuration steps and
> +depends on the ``pds_core`` driver to create and enable SR-IOV Virtual
> +Function devices.  After the VFs are enabled, we enable the vDPA service
> +in the ``pds_core`` device to create the auxiliary devices used by pds_vdpa.
> +
> +Example steps:
> +
> +.. code-block:: bash
> +
> +  #!/bin/bash
> +
> +  modprobe pds_core
> +  modprobe vdpa
> +  modprobe pds_vdpa
> +
> +  PF_BDF=`ls /sys/module/pds_core/drivers/pci\:pds_core/*/sriov_numvfs | awk 
> -F / '{print $7}'`
> +
> +  # Enable vDPA VF auxiliary device(s) in the PF
> +  devlink dev param set pci/$PF_BDF name enable_vnet cmode runtime value true
> +
> +  # Create a VF for vDPA use
> +  echo 1 > /sys/bus/pci/drivers/pds_core/$PF_BDF/sriov_numvfs
> +
> +  # Find the vDPA services/devices available
> +  PDS_VDPA_MGMT=`vdpa mgmtdev show | grep vDPA | head -1 | cut -d: -f1`
> +
> +  # Create a vDPA device for use in virtio network configurations
> +  vdpa dev add name vdpa1 mgmtdev $PDS_VDPA_MGMT mac 00:11:22:33:44:55
> +
> +  # Set up an ethernet interface on the vdpa device
> +  modprobe virtio_vdpa
> +
> +
> +
> +Enabling the driver
> +===
> +
> +The driver is enabled via the standard kernel configuration system,
> +using the make command::
> +
> +  make oldconfig/menuconfig/etc.
> +
> +The driver is located in the menu structure at:
> +
> +  -> Device Drivers
> +-> Network device support (NETDEVICES [=y])
> +  -> Ethernet driver support
> +-> Pensando devices
> +  -> Pensando Ethernet PDS_VDPA Support
> +
> +Support
> +===
> +
> +For general Linux networking support, please use the netdev mailing
> +list, which is monitored by Pensando personnel::
> +
> +  net...@vger.kernel.org
> +
> +For more specific support needs, please use the Pensando driver support
> +email::
> +
> +  driv...@pensando.io
> diff --git a/Documentation/networking/device_drivers/ethernet/index.rst 
> b/Documentation/networking/device_drivers/ethernet/index.rst
> index 417ca514a4d0..94ecb67c0885 100644
> --- a/Documentation/networking/device_drivers/ethernet/index.rst
> +++ b/Documentation/networking/device_drivers/ethernet/index.rst
> @@ -15,6 +15,7 @@ Contents:
> amazon/ena
> altera/altera_tse
> amd/pds_core
> +   amd/pds_vdpa
> aquantia/atlantic
> chelsio/cxgb
> cirrus/cs89x0
> diff --git a/MAINTAINERS b/MAINTAINERS
> index ebd26b3ca90e..c565b71ce56f 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -22200,6 +22200,10 @@ SNET DPU VIRTIO DATA PATH ACCELERATOR
>  R: Alvaro Karsz 
>  F: drivers/vdpa/solidrun/
>
> +PDS DSC VIRTIO DATA PATH ACCELERATOR
> +R: Shannon Nelson 
> +F: drivers/vdpa/pds/
> +
>  VIRTIO BALLOON
>  M: "Michael S. Tsirkin" 
>  M: David Hildenbrand 
> diff --git a/drivers/vdpa/Kconfig b/drivers/vdpa/Kconfig
> index cd6ad92f3f05..2ee1b288691d 100644
> --- a/drivers/vdpa/Kconfig
> +++ b/drivers/vdpa/Kconfig
> @@ -116,4 +116,12 @@ config ALIBABA_ENI_VDPA
>   This driver includes a HW monitor device that
>   reads health values from the DPU.
>
> +config PDS_VDPA
> +   tristate "vDPA driver for AMD/Pensando DSC devices"
> +   depends on PDS_CORE

Need to select VIRTIO_PCI_LIB?

Thanks

> +   help
> + vDPA network 
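
A sketch of the Kconfig entry with this suggestion applied; whether
VIRTIO_PCI_LIB is the only symbol needed is for the author to confirm:

config PDS_VDPA
	tristate "vDPA driver for AMD/Pensando DSC devices"
	depends on PDS_CORE
	select VIRTIO_PCI_LIB
	help
	  ...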

Re: [PATCH net-next V2 1/2] virtio-net: convert rx mode setting to use workqueue

2023-05-14 Thread Jason Wang
On Mon, May 15, 2023 at 12:45 PM Michael S. Tsirkin  wrote:
>
> On Mon, May 15, 2023 at 09:05:54AM +0800, Jason Wang wrote:
> > On Wed, May 10, 2023 at 1:33 PM Michael S. Tsirkin  wrote:
> > >
> > > On Mon, Apr 17, 2023 at 11:40:58AM +0800, Jason Wang wrote:
> > > > On Fri, Apr 14, 2023 at 3:21 PM Michael S. Tsirkin  
> > > > wrote:
> > > > >
> > > > > On Fri, Apr 14, 2023 at 01:04:15PM +0800, Jason Wang wrote:
> > > > > > Forget to cc netdev, adding.
> > > > > >
> > > > > > On Fri, Apr 14, 2023 at 12:25 AM Michael S. Tsirkin 
> > > > > >  wrote:
> > > > > > >
> > > > > > > On Thu, Apr 13, 2023 at 02:40:26PM +0800, Jason Wang wrote:
> > > > > > > > This patch convert rx mode setting to be done in a workqueue, 
> > > > > > > > this is
> > > > > > > > a must for allow to sleep when waiting for the cvq command to
> > > > > > > > response since current code is executed under addr spin lock.
> > > > > > > >
> > > > > > > > Signed-off-by: Jason Wang 
> > > > > > >
> > > > > > > I don't like this frankly. This means that setting RX mode which 
> > > > > > > would
> > > > > > > previously be reliable, now becomes unreliable.
> > > > > >
> > > > > > It is "unreliable" by design:
> > > > > >
> > > > > >   void(*ndo_set_rx_mode)(struct net_device 
> > > > > > *dev);
> > > > > >
> > > > > > > - first of all configuration is no longer immediate
> > > > > >
> > > > > > Is immediate a hard requirement? I can see a workqueue is used at 
> > > > > > least:
> > > > > >
> > > > > > mlx5e, ipoib, efx, ...
> > > > > >
> > > > > > >   and there is no way for driver to find out when
> > > > > > >   it actually took effect
> > > > > >
> > > > > > But we know rx mode is best effort e.g it doesn't support vhost and 
> > > > > > we
> > > > > > survive from this for years.
> > > > > >
> > > > > > > - second, if device fails command, this is also not
> > > > > > >   propagated to driver, again no way for driver to find out
> > > > > > >
> > > > > > > VDUSE needs to be fixed to do tricks to fix this
> > > > > > > without breaking normal drivers.
> > > > > >
> > > > > > It's not specific to VDUSE. For example, when using virtio-net in 
> > > > > > the
> > > > > > UP environment with any software cvq (like mlx5 via vDPA or cma
> > > > > > transport).
> > > > > >
> > > > > > Thanks
> > > > >
> > > > > Hmm. Can we differentiate between these use-cases?
> > > >
> > > > It doesn't look easy since we are drivers for virtio bus. Underlayer
> > > > details were hidden from virtio-net.
> > > >
> > > > Or do you have any ideas on this?
> > > >
> > > > Thanks
> > >
> > > I don't know, pass some kind of flag in struct virtqueue?
> > > "bool slow; /* This vq can be very slow sometimes. Don't wait for 
> > > it! */"
> > >
> > > ?
> > >
> >
> > So if it's slow, sleep, otherwise poll?
> >
> > I feel setting this flag might be tricky, since the driver doesn't
> > know whether or not it's really slow. E.g smartNIC vendor may allow
> > virtio-net emulation over PCI.
> >
> > Thanks
>
> driver will have the choice, depending on whether
> vq is deterministic or not.

Ok, but the problem is that such booleans are only useful for the virtio
ring code. But in this case, virtio-net knows what to do for the cvq, so
I'm not sure who the user would be.

Thanks

>
>
> > > --
> > > MST
> > >
>


Re: [PATCH net-next V2 1/2] virtio-net: convert rx mode setting to use workqueue

2023-05-15 Thread Jason Wang
On Mon, May 15, 2023 at 6:17 PM Michael S. Tsirkin  wrote:
>
> On Mon, May 15, 2023 at 01:13:33PM +0800, Jason Wang wrote:
> > On Mon, May 15, 2023 at 12:45 PM Michael S. Tsirkin  wrote:
> > >
> > > On Mon, May 15, 2023 at 09:05:54AM +0800, Jason Wang wrote:
> > > > On Wed, May 10, 2023 at 1:33 PM Michael S. Tsirkin  
> > > > wrote:
> > > > >
> > > > > On Mon, Apr 17, 2023 at 11:40:58AM +0800, Jason Wang wrote:
> > > > > > On Fri, Apr 14, 2023 at 3:21 PM Michael S. Tsirkin 
> > > > > >  wrote:
> > > > > > >
> > > > > > > On Fri, Apr 14, 2023 at 01:04:15PM +0800, Jason Wang wrote:
> > > > > > > > Forget to cc netdev, adding.
> > > > > > > >
> > > > > > > > On Fri, Apr 14, 2023 at 12:25 AM Michael S. Tsirkin 
> > > > > > > >  wrote:
> > > > > > > > >
> > > > > > > > > On Thu, Apr 13, 2023 at 02:40:26PM +0800, Jason Wang wrote:
> > > > > > > > > > This patch convert rx mode setting to be done in a 
> > > > > > > > > > workqueue, this is
> > > > > > > > > > a must for allow to sleep when waiting for the cvq command 
> > > > > > > > > > to
> > > > > > > > > > response since current code is executed under addr spin 
> > > > > > > > > > lock.
> > > > > > > > > >
> > > > > > > > > > Signed-off-by: Jason Wang 
> > > > > > > > >
> > > > > > > > > I don't like this frankly. This means that setting RX mode 
> > > > > > > > > which would
> > > > > > > > > previously be reliable, now becomes unreliable.
> > > > > > > >
> > > > > > > > It is "unreliable" by design:
> > > > > > > >
> > > > > > > >   void(*ndo_set_rx_mode)(struct 
> > > > > > > > net_device *dev);
> > > > > > > >
> > > > > > > > > - first of all configuration is no longer immediate
> > > > > > > >
> > > > > > > > Is immediate a hard requirement? I can see a workqueue is used 
> > > > > > > > at least:
> > > > > > > >
> > > > > > > > mlx5e, ipoib, efx, ...
> > > > > > > >
> > > > > > > > >   and there is no way for driver to find out when
> > > > > > > > >   it actually took effect
> > > > > > > >
> > > > > > > > But we know rx mode is best effort e.g it doesn't support vhost 
> > > > > > > > and we
> > > > > > > > survive from this for years.
> > > > > > > >
> > > > > > > > > - second, if device fails command, this is also not
> > > > > > > > >   propagated to driver, again no way for driver to find out
> > > > > > > > >
> > > > > > > > > VDUSE needs to be fixed to do tricks to fix this
> > > > > > > > > without breaking normal drivers.
> > > > > > > >
> > > > > > > > It's not specific to VDUSE. For example, when using virtio-net 
> > > > > > > > in the
> > > > > > > > UP environment with any software cvq (like mlx5 via vDPA or cma
> > > > > > > > transport).
> > > > > > > >
> > > > > > > > Thanks
> > > > > > >
> > > > > > > Hmm. Can we differentiate between these use-cases?
> > > > > >
> > > > > > It doesn't look easy since we are drivers for virtio bus. Underlayer
> > > > > > details were hidden from virtio-net.
> > > > > >
> > > > > > Or do you have any ideas on this?
> > > > > >
> > > > > > Thanks
> > > > >
> > > > > I don't know, pass some kind of flag in struct virtqueue?
> > > > > "bool slow; /* This vq can be very slow sometimes. Don't wait 
> > > > > for it! */"
> > > > >
> > > > > ?
> > > > >
> > > >
> > > > So if it's slow, sleep, otherwise poll?
> > > >
> > > > I feel setting this flag might be tricky, since the driver doesn't
> > > > know whether or not it's really slow. E.g smartNIC vendor may allow
> > > > virtio-net emulation over PCI.
> > > >
> > > > Thanks
> > >
> > > driver will have the choice, depending on whether
> > > vq is deterministic or not.
> >
> > Ok, but the problem is, such booleans are only useful for virtio ring
> > codes. But in this case, virtio-net knows what to do for cvq. So I'm
> > not sure who the user is.
> >
> > Thanks
>
> Circling back, what exactly does the architecture you are trying
> to fix look like? Who is going to introduce unbounded latency?
> The hypervisor?

The hypervisor is one of the possible reasons; we have many more:

Hardware device that provides virtio-pci emulation.
Userspace devices like VDUSE.

> If so, do we maybe want a new feature bit that documents this?
> The hypervisor can then detect old guests that spin and decide what
> to do, e.g. prioritise the cvq more, or fail FEATURES_OK.

We suffer from this on bare metal as well.

But the question is: what's wrong with the approach used in this
patch? I've explained that set_rx_mode is not reliable by design, so
it should be fine to use a workqueue. Apart from this, is there
anything else that worries you?

Thanks

>
> > >
> > >
> > > > > --
> > > > > MST
> > > > >
> > >
>
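
A minimal sketch of the deferral pattern under discussion, with
hypothetical names (the point of the patch is that ndo_set_rx_mode runs
under a BH spinlock and must not sleep, so the cvq commands have to be
sent from a work item running in process context):

#include <linux/netdevice.h>
#include <linux/workqueue.h>
#include <linux/rtnetlink.h>

/* Sketch only: names and fields are illustrative, not the actual patch. */
struct my_vnet {
	struct net_device *dev;
	struct work_struct rx_mode_work;
	bool rx_mode_work_enabled;	/* toggled under a lock in real code */
};

/* ndo_set_rx_mode runs under netif_addr_lock_bh(), so it must not sleep. */
static void my_vnet_set_rx_mode(struct net_device *dev)
{
	struct my_vnet *vi = netdev_priv(dev);

	if (vi->rx_mode_work_enabled)
		schedule_work(&vi->rx_mode_work);
}

/* Process context: free to sleep while waiting for the cvq response. */
static void my_vnet_rx_mode_work(struct work_struct *work)
{
	struct my_vnet *vi = container_of(work, struct my_vnet, rx_mode_work);

	rtnl_lock();
	/* Read dev->flags and the address lists, then send cvq commands. */
	rtnl_unlock();
}

The trade-off debated in this thread follows directly from this shape:
the configuration takes effect asynchronously, and any device-side
failure is invisible to the caller of ndo_set_rx_mode.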


Re: [PATCH v6 virtio 11/11] pds_vdpa: pds_vdps.rst and Kconfig

2023-05-15 Thread Jason Wang
On Tue, May 16, 2023 at 10:56 AM Shannon Nelson  wrote:
>
> Add the documentation and Kconfig entry for the pds_vdpa driver.
>
> Signed-off-by: Shannon Nelson 

Acked-by: Jason Wang 

Thanks

> ---
>  .../device_drivers/ethernet/amd/pds_vdpa.rst  | 85 +++
>  .../device_drivers/ethernet/index.rst |  1 +
>  MAINTAINERS   |  4 +
>  drivers/vdpa/Kconfig  | 10 +++
>  4 files changed, 100 insertions(+)
>  create mode 100644 
> Documentation/networking/device_drivers/ethernet/amd/pds_vdpa.rst
>
> diff --git 
> a/Documentation/networking/device_drivers/ethernet/amd/pds_vdpa.rst 
> b/Documentation/networking/device_drivers/ethernet/amd/pds_vdpa.rst
> new file mode 100644
> index ..587927d3de92
> --- /dev/null
> +++ b/Documentation/networking/device_drivers/ethernet/amd/pds_vdpa.rst
> @@ -0,0 +1,85 @@
> +.. SPDX-License-Identifier: GPL-2.0+
> +.. note: can be edited and viewed with /usr/bin/formiko-vim
> +
> +==
> +PCI vDPA driver for the AMD/Pensando(R) DSC adapter family
> +==
> +
> +AMD/Pensando vDPA VF Device Driver
> +
> +Copyright(c) 2023 Advanced Micro Devices, Inc
> +
> +Overview
> +
> +
> +The ``pds_vdpa`` driver is an auxiliary bus driver that supplies
> +a vDPA device for use by the virtio network stack.  It is used with
> +the Pensando Virtual Function devices that offer vDPA and virtio queue
> +services.  It depends on the ``pds_core`` driver and hardware for the PF
> +and VF PCI handling as well as for device configuration services.
> +
> +Using the device
> +
> +
> +The ``pds_vdpa`` device is enabled via multiple configuration steps and
> +depends on the ``pds_core`` driver to create and enable SR-IOV Virtual
> +Function devices.  After the VFs are enabled, we enable the vDPA service
> +in the ``pds_core`` device to create the auxiliary devices used by pds_vdpa.
> +
> +Example steps:
> +
> +.. code-block:: bash
> +
> +  #!/bin/bash
> +
> +  modprobe pds_core
> +  modprobe vdpa
> +  modprobe pds_vdpa
> +
> +  PF_BDF=`ls /sys/module/pds_core/drivers/pci\:pds_core/*/sriov_numvfs | awk 
> -F / '{print $7}'`
> +
> +  # Enable vDPA VF auxiliary device(s) in the PF
> +  devlink dev param set pci/$PF_BDF name enable_vnet cmode runtime value true
> +
> +  # Create a VF for vDPA use
> +  echo 1 > /sys/bus/pci/drivers/pds_core/$PF_BDF/sriov_numvfs
> +
> +  # Find the vDPA services/devices available
> +  PDS_VDPA_MGMT=`vdpa mgmtdev show | grep vDPA | head -1 | cut -d: -f1`
> +
> +  # Create a vDPA device for use in virtio network configurations
> +  vdpa dev add name vdpa1 mgmtdev $PDS_VDPA_MGMT mac 00:11:22:33:44:55
> +
> +  # Set up an ethernet interface on the vdpa device
> +  modprobe virtio_vdpa
> +
> +
> +
> +Enabling the driver
> +===
> +
> +The driver is enabled via the standard kernel configuration system,
> +using the make command::
> +
> +  make oldconfig/menuconfig/etc.
> +
> +The driver is located in the menu structure at:
> +
> +  -> Device Drivers
> +-> Network device support (NETDEVICES [=y])
> +  -> Ethernet driver support
> +-> Pensando devices
> +  -> Pensando Ethernet PDS_VDPA Support
> +
> +Support
> +===
> +
> +For general Linux networking support, please use the netdev mailing
> +list, which is monitored by Pensando personnel::
> +
> +  net...@vger.kernel.org
> +
> +For more specific support needs, please use the Pensando driver support
> +email::
> +
> +  driv...@pensando.io
> diff --git a/Documentation/networking/device_drivers/ethernet/index.rst 
> b/Documentation/networking/device_drivers/ethernet/index.rst
> index 417ca514a4d0..94ecb67c0885 100644
> --- a/Documentation/networking/device_drivers/ethernet/index.rst
> +++ b/Documentation/networking/device_drivers/ethernet/index.rst
> @@ -15,6 +15,7 @@ Contents:
> amazon/ena
> altera/altera_tse
> amd/pds_core
> +   amd/pds_vdpa
> aquantia/atlantic
> chelsio/cxgb
> cirrus/cs89x0
> diff --git a/MAINTAINERS b/MAINTAINERS
> index e2fd64c2ebdc..c3f509eeaf1d 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -22296,6 +22296,10 @@ F: include/linux/vringh.h
>  F: include/uapi/linux/virtio_*.h
>  F: tools/virtio/
>
> +PDS DSC VIRTIO DATA PATH ACCELERATOR
> +R: Shannon Nelson 
> +F: drivers/vdpa/pds/
> +
>  VIRTIO CRYPTO DRIVER
>  M: Gonglei 
>  L: virtualization@lists.linux-foundation.org
> diff --

Re: [PATCH v6 virtio 10/11] pds_vdpa: subscribe to the pds_core events

2023-05-15 Thread Jason Wang
On Tue, May 16, 2023 at 10:56 AM Shannon Nelson  wrote:
>
> Register for the pds_core's notification events, primarily to
> find out when the FW has been reset, so we can pass this back
> up the chain.
>
> Signed-off-by: Shannon Nelson 

Acked-by: Jason Wang 

Thanks

> ---
>  drivers/vdpa/pds/vdpa_dev.c | 59 -
>  drivers/vdpa/pds/vdpa_dev.h |  1 +
>  2 files changed, 59 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/vdpa/pds/vdpa_dev.c b/drivers/vdpa/pds/vdpa_dev.c
> index 07b98dff5701..9afa803c4f21 100644
> --- a/drivers/vdpa/pds/vdpa_dev.c
> +++ b/drivers/vdpa/pds/vdpa_dev.c
> @@ -23,6 +23,52 @@ static struct pds_vdpa_device *vdpa_to_pdsv(struct 
> vdpa_device *vdpa_dev)
> return container_of(vdpa_dev, struct pds_vdpa_device, vdpa_dev);
>  }
>
> +static int pds_vdpa_notify_handler(struct notifier_block *nb,
> +  unsigned long ecode,
> +  void *data)
> +{
> +   struct pds_vdpa_device *pdsv = container_of(nb, struct 
> pds_vdpa_device, nb);
> +   struct device *dev = &pdsv->vdpa_aux->padev->aux_dev.dev;
> +
> +   dev_dbg(dev, "%s: event code %lu\n", __func__, ecode);
> +
> +   if (ecode == PDS_EVENT_RESET || ecode == PDS_EVENT_LINK_CHANGE) {
> +   if (pdsv->config_cb.callback)
> +   pdsv->config_cb.callback(pdsv->config_cb.private);
> +   }
> +
> +   return 0;
> +}
> +
> +static int pds_vdpa_register_event_handler(struct pds_vdpa_device *pdsv)
> +{
> +   struct device *dev = &pdsv->vdpa_aux->padev->aux_dev.dev;
> +   struct notifier_block *nb = &pdsv->nb;
> +   int err;
> +
> +   if (!nb->notifier_call) {
> +   nb->notifier_call = pds_vdpa_notify_handler;
> +   err = pdsc_register_notify(nb);
> +   if (err) {
> +   nb->notifier_call = NULL;
> +   dev_err(dev, "failed to register pds event handler: 
> %ps\n",
> +   ERR_PTR(err));
> +   return -EINVAL;
> +   }
> +   dev_dbg(dev, "pds event handler registered\n");
> +   }
> +
> +   return 0;
> +}
> +
> +static void pds_vdpa_unregister_event_handler(struct pds_vdpa_device *pdsv)
> +{
> +   if (pdsv->nb.notifier_call) {
> +   pdsc_unregister_notify(&pdsv->nb);
> +   pdsv->nb.notifier_call = NULL;
> +   }
> +}
> +
>  static int pds_vdpa_set_vq_address(struct vdpa_device *vdpa_dev, u16 qid,
>u64 desc_addr, u64 driver_addr, u64 
> device_addr)
>  {
> @@ -594,6 +640,12 @@ static int pds_vdpa_dev_add(struct vdpa_mgmt_dev *mdev, 
> const char *name,
>
> pdsv->vdpa_dev.mdev = &vdpa_aux->vdpa_mdev;
>
> +   err = pds_vdpa_register_event_handler(pdsv);
> +   if (err) {
> +   dev_err(dev, "Failed to register for PDS events: %pe\n", 
> ERR_PTR(err));
> +   goto err_unmap;
> +   }
> +
> /* We use the _vdpa_register_device() call rather than the
>  * vdpa_register_device() to avoid a deadlock because our
>  * dev_add() is called with the vdpa_dev_lock already set
> @@ -602,13 +654,15 @@ static int pds_vdpa_dev_add(struct vdpa_mgmt_dev *mdev, 
> const char *name,
> err = _vdpa_register_device(&pdsv->vdpa_dev, pdsv->num_vqs);
> if (err) {
> dev_err(dev, "Failed to register to vDPA bus: %pe\n", 
> ERR_PTR(err));
> -   goto err_unmap;
> +   goto err_unevent;
> }
>
> pds_vdpa_debugfs_add_vdpadev(vdpa_aux);
>
> return 0;
>
> +err_unevent:
> +   pds_vdpa_unregister_event_handler(pdsv);
>  err_unmap:
> put_device(&pdsv->vdpa_dev.dev);
> vdpa_aux->pdsv = NULL;
> @@ -618,8 +672,11 @@ static int pds_vdpa_dev_add(struct vdpa_mgmt_dev *mdev, 
> const char *name,
>  static void pds_vdpa_dev_del(struct vdpa_mgmt_dev *mdev,
>  struct vdpa_device *vdpa_dev)
>  {
> +   struct pds_vdpa_device *pdsv = vdpa_to_pdsv(vdpa_dev);
> struct pds_vdpa_aux *vdpa_aux;
>
> +   pds_vdpa_unregister_event_handler(pdsv);
> +
> vdpa_aux = container_of(mdev, struct pds_vdpa_aux, vdpa_mdev);
> _vdpa_unregister_device(vdpa_dev);
>
> diff --git a/drivers/vdpa/pds/vdpa_dev.h b/drivers/vdpa/pds/vdpa_dev.h
> index 25c1d192f0ef..a1bc37de9537 100644
> --- a/drivers/vd

Re: [PATCH net-next V2 1/2] virtio-net: convert rx mode setting to use workqueue

2023-05-15 Thread Jason Wang
On Tue, May 16, 2023 at 12:13 PM Michael S. Tsirkin  wrote:
>
> On Tue, May 16, 2023 at 10:44:45AM +0800, Jason Wang wrote:
> > On Mon, May 15, 2023 at 6:17 PM Michael S. Tsirkin  wrote:
> > >
> > > On Mon, May 15, 2023 at 01:13:33PM +0800, Jason Wang wrote:
> > > > On Mon, May 15, 2023 at 12:45 PM Michael S. Tsirkin  
> > > > wrote:
> > > > >
> > > > > On Mon, May 15, 2023 at 09:05:54AM +0800, Jason Wang wrote:
> > > > > > On Wed, May 10, 2023 at 1:33 PM Michael S. Tsirkin 
> > > > > >  wrote:
> > > > > > >
> > > > > > > On Mon, Apr 17, 2023 at 11:40:58AM +0800, Jason Wang wrote:
> > > > > > > > On Fri, Apr 14, 2023 at 3:21 PM Michael S. Tsirkin 
> > > > > > > >  wrote:
> > > > > > > > >
> > > > > > > > > On Fri, Apr 14, 2023 at 01:04:15PM +0800, Jason Wang wrote:
> > > > > > > > > > Forgot to cc netdev, adding.
> > > > > > > > > >
> > > > > > > > > > On Fri, Apr 14, 2023 at 12:25 AM Michael S. Tsirkin 
> > > > > > > > > >  wrote:
> > > > > > > > > > >
> > > > > > > > > > > On Thu, Apr 13, 2023 at 02:40:26PM +0800, Jason Wang 
> > > > > > > > > > > wrote:
> > > > > > > > > > > > This patch converts rx mode setting to be done in a
> > > > > > > > > > > > workqueue; this is a must to allow sleeping while
> > > > > > > > > > > > waiting for the cvq command to respond, since the
> > > > > > > > > > > > current code is executed under the addr spin lock.
> > > > > > > > > > > >
> > > > > > > > > > > > Signed-off-by: Jason Wang 
> > > > > > > > > > >
> > > > > > > > > > > I don't like this frankly. This means that setting RX 
> > > > > > > > > > > mode which would
> > > > > > > > > > > previously be reliable, now becomes unreliable.
> > > > > > > > > >
> > > > > > > > > > It is "unreliable" by design:
> > > > > > > > > >
> > > > > > > > > >   void(*ndo_set_rx_mode)(struct 
> > > > > > > > > > net_device *dev);
> > > > > > > > > >
> > > > > > > > > > > - first of all configuration is no longer immediate
> > > > > > > > > >
> > > > > > > > > > Is immediate a hard requirement? I can see a workqueue is 
> > > > > > > > > > used at least:
> > > > > > > > > >
> > > > > > > > > > mlx5e, ipoib, efx, ...
> > > > > > > > > >
> > > > > > > > > > >   and there is no way for driver to find out when
> > > > > > > > > > >   it actually took effect
> > > > > > > > > >
> > > > > > > > > > But we know rx mode is best effort, e.g. it doesn't
> > > > > > > > > > support vhost, and we have survived this for years.
> > > > > > > > > >
> > > > > > > > > > > - second, if device fails command, this is also not
> > > > > > > > > > >   propagated to driver, again no way for driver to find 
> > > > > > > > > > > out
> > > > > > > > > > >
> > > > > > > > > > > VDUSE needs to be fixed to do tricks to fix this
> > > > > > > > > > > without breaking normal drivers.
> > > > > > > > > >
> > > > > > > > > > It's not specific to VDUSE. For example, when using 
> > > > > > > > > > virtio-net in the
> > > > > > > > > > UP environment with any software cvq (like mlx5 via vDPA or 
> > > > > > > > > > cma
> >

Re: [PATCH net-next V2 2/2] virtio-net: sleep instead of busy waiting for cvq command

2023-05-16 Thread Jason Wang
On Wed, May 17, 2023 at 4:54 AM Michael S. Tsirkin  wrote:
>
> On Thu, Apr 13, 2023 at 02:40:27PM +0800, Jason Wang wrote:
> > We used to busy wait on the cvq command; this tends to be
> > problematic since there is no way to schedule another process which
> > may service the control virtqueue. This might be the case when the
> > control virtqueue is emulated by software. This patch switches to
> > using a completion to allow the CPU to sleep instead of busy waiting
> > for the cvq command.
> >
> > Signed-off-by: Jason Wang 
> > ---
> > Changes since V1:
> > - use completion for simplicity
> > - don't try to harden the CVQ command which requires more thought
> > Changes since RFC:
> > - break the device when timeout
> > - get buffer manually since the virtio core check more_used() instead
> > ---
> >  drivers/net/virtio_net.c | 21 ++---
> >  1 file changed, 14 insertions(+), 7 deletions(-)
> >
> > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > index 2e56bbf86894..d3eb8fd6c9dc 100644
> > --- a/drivers/net/virtio_net.c
> > +++ b/drivers/net/virtio_net.c
> > @@ -19,6 +19,7 @@
> >  #include 
> >  #include 
> >  #include 
> > +#include 
> >  #include 
> >  #include 
> >  #include 
> > @@ -295,6 +296,8 @@ struct virtnet_info {
> >
> >   /* failover when STANDBY feature enabled */
> >   struct failover *failover;
> > +
> > + struct completion completion;
> >  };
> >
> >  struct padded_vnet_hdr {
> > @@ -1709,6 +1712,13 @@ static bool try_fill_recv(struct virtnet_info *vi, 
> > struct receive_queue *rq,
> >   return !oom;
> >  }
> >
> > +static void virtnet_cvq_done(struct virtqueue *cvq)
> > +{
> > + struct virtnet_info *vi = cvq->vdev->priv;
> > +
> > + complete(&vi->completion);
> > +}
> > +
> >  static void skb_recv_done(struct virtqueue *rvq)
> >  {
> >   struct virtnet_info *vi = rvq->vdev->priv;
> > @@ -2169,12 +2179,8 @@ static bool virtnet_send_command(struct virtnet_info 
> > *vi, u8 class, u8 cmd,
> >   if (unlikely(!virtqueue_kick(vi->cvq)))
> >   return vi->ctrl->status == VIRTIO_NET_OK;
> >
> > - /* Spin for a response, the kick causes an ioport write, trapping
> > -  * into the hypervisor, so the request should be handled immediately.
> > -  */
> > - while (!virtqueue_get_buf(vi->cvq, &tmp) &&
> > -!virtqueue_is_broken(vi->cvq))
> > - cpu_relax();
> > + wait_for_completion(&vi->completion);
> > + virtqueue_get_buf(vi->cvq, &tmp);
> >
> >   return vi->ctrl->status == VIRTIO_NET_OK;
>
> This seems to break surprise removal and other
> situations where vq gets broken since callbacks
> aren't usually invoked then.

Yes, so I think I can go back to the original idea by simply adding
cond_resched() here.

>
>
> >  }
> > @@ -3672,7 +3678,7 @@ static int virtnet_find_vqs(struct virtnet_info *vi)
> >
> >   /* Parameters for control virtqueue, if any */
> >   if (vi->has_cvq) {
> > - callbacks[total_vqs - 1] = NULL;
> > + callbacks[total_vqs - 1] = virtnet_cvq_done;
> >   names[total_vqs - 1] = "control";
> >   }
> >
>
> There is a cost to this, in that we are burning an extra MSI vector
> for the slow-path cvq. If the device has only 3 vectors, suddenly we
> can't allocate vectors for rx and tx, which is a big problem.
>
> So I'm afraid we need to pass a new flag that will share
> the config changed interrupt and cvq.

See above; it looks to me like a simple cond_resched() is sufficient,
so we don't need a new vector.

Thanks

>
>
>
> > @@ -4122,6 +4128,7 @@ static int virtnet_probe(struct virtio_device *vdev)
> >   if (vi->has_rss || vi->has_rss_hash_report)
> >   virtnet_init_default_rss(vi);
> >
> > + init_completion(&vi->completion);
> >   enable_rx_mode_work(vi);
> >
> >   /* serialize netdev register + virtio_device_ready() with ndo_open() 
> > */
> > --
> > 2.25.1
>
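
For reference, the cond_resched() alternative Jason proposes above
would keep the polling loop from the quoted hunk but yield the CPU
between polls, roughly as follows (a sketch against the quoted code,
not the final patch):

	/* Spin for a response, but let other tasks (including whatever
	 * services a software cvq) run between polls.
	 */
	while (!virtqueue_get_buf(vi->cvq, &tmp) &&
	       !virtqueue_is_broken(vi->cvq))
		cond_resched();

Unlike the completion-based version, this needs no cvq callback, so it
sidesteps the extra-MSI-vector problem raised above, at the cost of
still consuming some CPU time while waiting.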


Re: [PATCH 13/14] vhost: allow userspace to create workers

2023-05-16 Thread Jason Wang
On Sat, Apr 29, 2023 at 12:32 AM Mike Christie
 wrote:
>
> For vhost-scsi with 3 vqs or more and a workload that tries to use
> them in parallel like:
>
> fio --filename=/dev/sdb  --direct=1 --rw=randrw --bs=4k \
> --ioengine=libaio --iodepth=128  --numjobs=3
>
> the single vhost worker thread will become a bottleneck and we are stuck
> at around 500K IOPs no matter how many jobs, virtqueues, and CPUs are
> used.
>
> To better utilize virtqueues and available CPUs, this patch allows
> userspace to create workers and bind them to vqs. You can have N workers
> per dev and also share N workers with M vqs on that dev.
>
> This patch adds the interface related code and the next patch will hook
> vhost-scsi into it. The patches do not try to hook net and vsock into
> the interface because:
>
> 1. multiple workers don't seem to help vsock. The problem is that with
> only 2 virtqueues we never fully use the existing worker when doing
> bidirectional tests. This seems to match vhost-scsi where we don't see
> the worker as a bottleneck until 3 virtqueues are used.
>
> 2. net already has a way to use multiple workers.
>
> Signed-off-by: Mike Christie 
> ---
>  drivers/vhost/vhost.c| 145 ++-
>  drivers/vhost/vhost.h|   3 +
>  include/uapi/linux/vhost.h   |  33 +++
>  include/uapi/linux/vhost_types.h |  16 
>  4 files changed, 196 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 4b0b82292379..e8f829f35814 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -630,6 +630,80 @@ static struct vhost_worker *vhost_worker_create(struct 
> vhost_dev *dev)
> return NULL;
>  }
>
> +/* Caller must have device mutex */
> +static void __vhost_vq_attach_worker(struct vhost_virtqueue *vq,
> +struct vhost_worker *worker)
> +{
> +   if (vq->worker)
> +   vq->worker->attachment_cnt--;
> +   worker->attachment_cnt++;
> +   vq->worker = worker;
> +}
> +
> +/**
> + * vhost_vq_attach_worker - set a virtqueue's worker from an ioctl command
> + * @vq: the virtqueue we will set the worker for
> + * @info: the worker userspace has requested us to use
> + *
> + * We only allow userspace to set a virtqueue's worker if it's not active and
> + * polling is not enabled.

I wonder if we can mandate this in the code, e.g. by checking the vq
backend in vhost_vq_work_queue().

> + * We also assume drivers supporting this will not be
> + * internally queueing works directly or via calls like vhost_dev_flush at
> + * this time.
> + *
> + * Caller must have device and virtqueue mutex.
> + */
> +static int vhost_vq_attach_worker(struct vhost_virtqueue *vq,
> + struct vhost_vring_worker *info)
> +{
> +   unsigned long index = info->worker_id;
> +   struct vhost_dev *dev = vq->dev;
> +   struct vhost_worker *worker;
> +
> +   if (!dev->use_worker)
> +   return -EINVAL;
> +
> +   if (vhost_vq_get_backend(vq) || vq->kick)

It might be worthwhile to have a comment to explain why we need to
check vq->kick here.

This also means the device should not queue work when the backend is NULL.

But I found this is probably not the case for vsock: it calls
vhost_poll_queue() in vhost_transport_cancel_pkt(), but
vhost_vsock_stop() doesn't wait before doing vhost_vq_set_backend(vq,
NULL).

Net seems to be fine since it waits for ubufs to be completed in
vhost_net_set_backend().

Can we make things easier by migrating the work_list? I also worry
that there are other corner cases, which makes me wonder how hard it
would be to just support those ioctls after the backend is set.


> +   return -EBUSY;
> +
> +   worker = xa_find(&dev->worker_xa, &index, UINT_MAX, XA_PRESENT);
> +   if (!worker || worker->id != info->worker_id)
> +   return -ENODEV;
> +
> +   __vhost_vq_attach_worker(vq, worker);
> +   return 0;
> +}
> +
> +/* Caller must have device mutex */
> +static int vhost_new_worker(struct vhost_dev *dev,
> +   struct vhost_worker_state *info)
> +{
> +   struct vhost_worker *worker;
> +
> +   worker = vhost_worker_create(dev);
> +   if (!worker)
> +   return -ENOMEM;
> +
> +   info->worker_id = worker->id;
> +   return 0;
> +}
> +
> +static int vhost_free_worker(struct vhost_dev *dev,
> +struct vhost_worker_state *info)
> +{
> +   unsigned long index = info->worker_id;
> +   struct vhost_worker *worker;
> +
> +   worker = xa_find(&dev->worker_xa, &index, UINT_MAX, XA_PRESENT);
> +   if (!worker || worker->id != info->worker_id)
> +   return -ENODEV;
> +
> +   if (worker->attachment_cnt)
> +   return -EBUSY;
> +
> +   vhost_worker_destroy(dev, worker);
> +   return 0;
> +}
> +
>  static int vhost_get_vq_from_user(struct vhost_dev *dev, void __user *argp,
>   

Re: [PATCH v1] virtio_pci: Optimize virtio_pci_device structure size

2023-05-16 Thread Jason Wang
On Tue, May 16, 2023 at 9:55 PM Feng Liu  wrote:
>
> Improve the size of the virtio_pci_device structure, which is commonly
> used to represent a virtio PCI device. A given virtio PCI device can
> be either of legacy type or modern type, with the
> struct virtio_pci_legacy_device occupying 32 bytes and the
> struct virtio_pci_modern_device occupying 88 bytes. Make them a union,
> thereby saving 32 bytes of memory as shown by the pahole tool. This
> improvement is particularly beneficial when dealing with numerous
> devices, as it helps conserve memory resources.
>
> Before the modification, pahole tool reported the following:
> struct virtio_pci_device {
> [...]
> struct virtio_pci_legacy_device ldev;/*   824  32 */
> /* --- cacheline 13 boundary (832 bytes) was 24 bytes ago --- */
> struct virtio_pci_modern_device mdev;/*   856  88 */
>
> /* XXX last struct has 4 bytes of padding */
> [...]
> /* size: 1056, cachelines: 17, members: 19 */
> [...]
> };
>
> After the modification, pahole tool reported the following:
> struct virtio_pci_device {
> [...]
> union {
> struct virtio_pci_legacy_device ldev;/*   824  32 */
> struct virtio_pci_modern_device mdev;/*   824  88 */
> };   /*   824  88 */
> [...]
> /* size: 1024, cachelines: 16, members: 18 */
> [...]
> };
>
> Signed-off-by: Feng Liu 
> Reviewed-by: Jiri Pirko 

Acked-by: Jason Wang 

Thanks

> ---
>  drivers/virtio/virtio_pci_common.h | 7 ---
>  1 file changed, 4 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/virtio/virtio_pci_common.h 
> b/drivers/virtio/virtio_pci_common.h
> index 23112d84218f..4b773bd7c58c 100644
> --- a/drivers/virtio/virtio_pci_common.h
> +++ b/drivers/virtio/virtio_pci_common.h
> @@ -45,9 +45,10 @@ struct virtio_pci_vq_info {
>  struct virtio_pci_device {
> struct virtio_device vdev;
> struct pci_dev *pci_dev;
> -   struct virtio_pci_legacy_device ldev;
> -   struct virtio_pci_modern_device mdev;
> -
> +   union {
> +   struct virtio_pci_legacy_device ldev;
> +   struct virtio_pci_modern_device mdev;
> +   };
> bool is_legacy;
>
> /* Where to read and clear interrupt */
> --
> 2.37.1 (Apple Git-137.1)
>
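
The saving follows from a union being as large as its largest member
rather than the sum of both; a standalone sketch of the same idea, with
stand-in sizes matching the commit message:

#include <stdio.h>

struct legacy { char regs[32]; };	/* stand-in for the 32-byte struct */
struct modern { char regs[88]; };	/* stand-in for the 88-byte struct */

struct dev_separate { struct legacy l; struct modern m; };
struct dev_union    { union { struct legacy l; struct modern m; }; };

int main(void)
{
	/* Prints "120 88": the union saves sizeof(struct legacy). */
	printf("%zu %zu\n", sizeof(struct dev_separate),
	       sizeof(struct dev_union));
	return 0;
}

The union is safe here because a given device is either legacy or
modern, never both, which the existing is_legacy flag already tracks.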


Re: [PATCH] vdpa/mlx5: Fix hang when cvq commands are triggered during device unregister

2023-05-16 Thread Jason Wang
>  [ 2263.129355]  ? process_one_work+0x3c0/0x3c0
>  [ 2263.129766]  worker_thread+0x4d/0x3c0
>  [ 2263.130140]  ? process_one_work+0x3c0/0x3c0
>  [ 2263.130548]  kthread+0xb9/0xe0
>  [ 2263.130895]  ? kthread_complete_and_exit+0x20/0x20
>  [ 2263.131349]  ret_from_fork+0x1f/0x30
>  [ 2263.131717]  
>
> The fix is to disable and destroy the workqueue after the device
> unregister. It is expected that vhost will not trigger kicks after
> the unregister. But even if it did, the wq is already disabled by
> setting the pointer to NULL (as done in the referenced commit).
>
> Fixes: ad6dc1daaf29 ("vdpa/mlx5: Avoid processing works if workqueue was 
> destroyed")
> Signed-off-by: Dragos Tatulea 

Acked-by: Jason Wang 

Thanks

> ---
>  drivers/vdpa/mlx5/net/mlx5_vnet.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
> b/drivers/vdpa/mlx5/net/mlx5_vnet.c
> index e29e32b306ad..279ac6a558d2 100644
> --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
> +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
> @@ -3349,10 +3349,10 @@ static void mlx5_vdpa_dev_del(struct vdpa_mgmt_dev 
> *v_mdev, struct vdpa_device *
> mlx5_vdpa_remove_debugfs(ndev->debugfs);
> ndev->debugfs = NULL;
> unregister_link_notifier(ndev);
> +   _vdpa_unregister_device(dev);
> wq = mvdev->wq;
> mvdev->wq = NULL;
> destroy_workqueue(wq);
> -   _vdpa_unregister_device(dev);
> mgtdev->ndev = NULL;
>  }
>
> --
> 2.40.1
>
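
The reasoning behind the reordering, restated as a sketch: a kick
arriving after destroy_workqueue() but before the device unregister
could queue work on freed memory, so the device (the source of kicks)
must be unregistered first, and the NULL assignment covers any
stragglers; names mirror the quoted driver code:

	unregister_link_notifier(ndev);
	_vdpa_unregister_device(dev);	/* stop the source of new kicks first */
	wq = mvdev->wq;
	mvdev->wq = NULL;		/* late kicks observe NULL and bail out */
	destroy_workqueue(wq);		/* now safe: nothing can queue new work */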


Re: [PATCH v2 2/3] vhost: support PACKED when setting-getting vring_base

2023-05-16 Thread Jason Wang
On Wed, May 17, 2023 at 2:26 AM Shannon Nelson  wrote:
>
> On 5/16/23 12:49 AM, Stefano Garzarella wrote:
> > On Mon, May 15, 2023 at 01:41:12PM -0700, Shannon Nelson wrote:
> >> On 5/9/23 1:46 AM, Stefano Garzarella wrote:
> >>> On Mon, Apr 24, 2023 at 03:50:30PM -0700, Shannon Nelson via
> >>> Virtualization wrote:
>  Use the right structs for PACKED or split vqs when setting and
>  getting the vring base.
> 
>  Signed-off-by: Shannon Nelson 
>  ---
>  drivers/vhost/vhost.c | 18 +-
>  drivers/vhost/vhost.h |  8 ++--
>  2 files changed, 19 insertions(+), 7 deletions(-)
> 
>  diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
>  index f11bdbe4c2c5..f64efda48f21 100644
>  --- a/drivers/vhost/vhost.c
>  +++ b/drivers/vhost/vhost.c
>  @@ -1633,17 +1633,25 @@ long vhost_vring_ioctl(struct vhost_dev
>  *d, unsigned int ioctl, void __user *arg
>    r = -EFAULT;
>    break;
>    }
>  -  if (s.num > 0x) {
>  -  r = -EINVAL;
>  -  break;
>  +  if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED)) {
>  +  vq->last_avail_idx = s.num & 0x;
>  +  vq->last_used_idx = (s.num >> 16) & 0x;
>  +  } else {
>  +  if (s.num > 0x) {
>  +  r = -EINVAL;
>  +  break;
>  +  }
>  +  vq->last_avail_idx = s.num;
>    }
>  -  vq->last_avail_idx = s.num;
>    /* Forget the cached index value. */
>    vq->avail_idx = vq->last_avail_idx;
>    break;
>    case VHOST_GET_VRING_BASE:
>    s.index = idx;
>  -  s.num = vq->last_avail_idx;
>  +  if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
>  +  s.num = (u32)vq->last_avail_idx |
>  ((u32)vq->last_used_idx << 16);
>  +  else
>  +  s.num = vq->last_avail_idx;
> >>>
> >>> The changes LGTM, but since we are changing the UAPI, should we
> >>> update the documentation of VHOST_SET_VRING_BASE and
> >>> VHOST_GET_VRING_BASE in include/uapi/linux/vhost.h?
> >>
> >> Correct me if I'm wrong, but I don't think we're changing anything in
> >> the UAPI here, just fixing code to work correctly with what is already
> >> happening.
> >
> > IIUC before this patch VHOST_GET_VRING_BASE and VHOST_SET_VRING_BASE
> > never worked with packed virtqueue, since we were only handling
> > last_avail_idx. Now we are supporting packed virtqueue, handling
> > in vhost_vring_state.num both last_avail_idx and last_used_idx (with
> > wrap counters).
> >
> > For example for VHOST_GET_VRING_BASE where is documented that the first
> > 15 bits are last_avail_idx, the 16th the avail_wrap_counter, and the
> > others are last_used_idx and used_wrap_counter?
> >
> > Maybe I missed something, but since this is UAPI, IMHO we should
> > document the parameters of ioctls at least in
> > include/uapi/linux/vhost.h.
>
> Perhaps Jason already has something written up that could be put in here
> from when he first added the wrap_counter a couple of years ago?

If you meant the virtio driver support for packed, I think that's
different from the context here, which is vhost.

I agree with Stefano that we need to update the comments around
GET_VRING_BASE and SET_VRING_BASE, then we are fine.

Thanks

>
> sln
>
> >
> > Thanks,
> > Stefano
> >
>
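
For readers following the thread, the layout implied by the patch (and
the documentation Stefano is asking for) packs both ring positions and
wrap counters into vhost_vring_state.num; a sketch with hypothetical
helper names:

#include <linux/types.h>

/* Packed-vq encoding of vhost_vring_state.num:
 *   bits  0..14  last_avail_idx
 *   bit   15     avail wrap counter
 *   bits 16..30  last_used_idx
 *   bit   31     used wrap counter
 */
static inline __u32 pack_vring_base(__u16 avail_idx, int avail_wrap,
				    __u16 used_idx, int used_wrap)
{
	return ((__u32)avail_idx & 0x7fff) | ((__u32)!!avail_wrap << 15) |
	       (((__u32)used_idx & 0x7fff) << 16) | ((__u32)!!used_wrap << 31);
}

static inline void unpack_vring_base(__u32 num, __u16 *avail_idx,
				     int *avail_wrap, __u16 *used_idx,
				     int *used_wrap)
{
	*avail_idx  = num & 0x7fff;
	*avail_wrap = !!(num & (1U << 15));
	*used_idx   = (num >> 16) & 0x7fff;
	*used_wrap  = !!(num & (1U << 31));
}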


Re: [PATCH] vdpa: consume device_features parameter

2023-05-16 Thread Jason Wang
On Sat, May 13, 2023 at 12:42 AM Shannon Nelson  wrote:
>
> From: Allen Hubbe 
>
> Consume the parameter to device_features when parsing command line
> options.  Otherwise the parameter may be used again as an option name.
>
>  # vdpa dev add ... device_features 0xdeadbeef mac 00:11:22:33:44:55
>  Unknown option "0xdeadbeef"
>
> Fixes: a4442ce58ebb ("vdpa: allow provisioning device features")
> Signed-off-by: Allen Hubbe 
> Reviewed-by: Shannon Nelson 

Acked-by: Jason Wang 

Thanks

> ---
>  vdpa/vdpa.c | 2 ++
>  1 file changed, 2 insertions(+)
>
> diff --git a/vdpa/vdpa.c b/vdpa/vdpa.c
> index 27647d73d498..8a2fca8647b6 100644
> --- a/vdpa/vdpa.c
> +++ b/vdpa/vdpa.c
> @@ -353,6 +353,8 @@ static int vdpa_argv_parse(struct vdpa *vdpa, int argc, 
> char **argv,
> &opts->device_features);
> if (err)
> return err;
> +
> +   NEXT_ARG_FWD();
> o_found |= VDPA_OPT_VDEV_FEATURES;
> } else {
> fprintf(stderr, "Unknown option \"%s\"\n", *argv);
> --
> 2.17.1
>
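
The bug class is easy to reproduce: a parser that reads an option's
value without advancing its cursor re-reads the value as the next
option name. A minimal sketch of the pattern (illustrative C, not the
actual iproute2 code; NEXT_ARG_FWD() in the fix does the advancing):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int parse_opts(int argc, char **argv, unsigned long long *features)
{
	while (argc > 0) {
		if (!strcmp(*argv, "device_features")) {
			argc--; argv++;		/* step to the value */
			if (argc <= 0)
				return -1;
			*features = strtoull(*argv, NULL, 0);
			argc--; argv++;		/* the advance the fix adds */
		} else {
			/* Without the advance above, "0xdeadbeef" lands here. */
			fprintf(stderr, "Unknown option \"%s\"\n", *argv);
			return -1;
		}
	}
	return 0;
}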


Re: [RESEND PATCH] vdpa: solidrun: constify pointers to hwmon_channel_info

2023-05-16 Thread Jason Wang
On Fri, May 12, 2023 at 1:54 AM Krzysztof Kozlowski
 wrote:
>
> Statically allocated array of pointers to hwmon_channel_info can be made
> const for safety.
>
> Acked-by: Michael S. Tsirkin 
> Reviewed-by: Alvaro Karsz 
> Signed-off-by: Krzysztof Kozlowski 

Acked-by: Jason Wang 

Thanks

> ---
>  drivers/vdpa/solidrun/snet_hwmon.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/vdpa/solidrun/snet_hwmon.c 
> b/drivers/vdpa/solidrun/snet_hwmon.c
> index 42c87387a0f1..af531a339082 100644
> --- a/drivers/vdpa/solidrun/snet_hwmon.c
> +++ b/drivers/vdpa/solidrun/snet_hwmon.c
> @@ -159,7 +159,7 @@ static const struct hwmon_ops snet_hwmon_ops = {
> .read_string = snet_hwmon_read_string
>  };
>
> -static const struct hwmon_channel_info *snet_hwmon_info[] = {
> +static const struct hwmon_channel_info * const snet_hwmon_info[] = {
> HWMON_CHANNEL_INFO(temp, HWMON_T_INPUT | HWMON_T_MAX | HWMON_T_CRIT | 
> HWMON_T_LABEL,
>HWMON_T_INPUT | HWMON_T_CRIT | HWMON_T_LABEL),
> HWMON_CHANNEL_INFO(power, HWMON_P_INPUT | HWMON_P_LABEL),
> --
> 2.34.1
>
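
The second const is the point of the patch: it makes the array slots
themselves read-only, not just the pointed-to data, so the whole table
can be placed in read-only memory. The distinction in a small sketch:

struct hwmon_channel_info;	/* opaque for the sketch */

/* Array of pointers to const data: the pointees are protected, but
 * info_a[0] = NULL; would still compile and rewrite the table.
 */
static const struct hwmon_channel_info *info_a[] = { NULL };

/* Array of const pointers to const data: neither the pointees nor the
 * slots are writable, so the array itself can live in rodata.
 */
static const struct hwmon_channel_info *const info_b[] = { NULL };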


Re: [PATCH v2 2/3] vhost: support PACKED when setting-getting vring_base

2023-05-17 Thread Jason Wang
On Wed, May 17, 2023 at 3:00 PM Stefano Garzarella  wrote:
>
> On Wed, May 17, 2023 at 7:26 AM Jason Wang  wrote:
> >
> > On Wed, May 17, 2023 at 2:26 AM Shannon Nelson  
> > wrote:
> > >
> > > On 5/16/23 12:49 AM, Stefano Garzarella wrote:
> > > > On Mon, May 15, 2023 at 01:41:12PM -0700, Shannon Nelson wrote:
> > > >> On 5/9/23 1:46 AM, Stefano Garzarella wrote:
> > > >>> On Mon, Apr 24, 2023 at 03:50:30PM -0700, Shannon Nelson via
> > > >>> Virtualization wrote:
> > > >>>> Use the right structs for PACKED or split vqs when setting and
> > > >>>> getting the vring base.
> > > >>>>
> > > >>>> Signed-off-by: Shannon Nelson 
> > > >>>> ---
> > > >>>> drivers/vhost/vhost.c | 18 +-
> > > >>>> drivers/vhost/vhost.h |  8 ++--
> > > >>>> 2 files changed, 19 insertions(+), 7 deletions(-)
> > > >>>>
> > > >>>> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> > > >>>> index f11bdbe4c2c5..f64efda48f21 100644
> > > >>>> --- a/drivers/vhost/vhost.c
> > > >>>> +++ b/drivers/vhost/vhost.c
> > > >>>> @@ -1633,17 +1633,25 @@ long vhost_vring_ioctl(struct vhost_dev
> > > >>>> *d, unsigned int ioctl, void __user *arg
> > > >>>>   r = -EFAULT;
> > > >>>>   break;
> > > >>>>   }
> > > >>>> -  if (s.num > 0x) {
> > > >>>> -  r = -EINVAL;
> > > >>>> -  break;
> > > >>>> +  if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED)) {
> > > >>>> +  vq->last_avail_idx = s.num & 0x;
> > > >>>> +  vq->last_used_idx = (s.num >> 16) & 0x;
> > > >>>> +  } else {
> > > >>>> +  if (s.num > 0x) {
> > > >>>> +  r = -EINVAL;
> > > >>>> +  break;
> > > >>>> +  }
> > > >>>> +  vq->last_avail_idx = s.num;
> > > >>>>   }
> > > >>>> -  vq->last_avail_idx = s.num;
> > > >>>>   /* Forget the cached index value. */
> > > >>>>   vq->avail_idx = vq->last_avail_idx;
> > > >>>>   break;
> > > >>>>   case VHOST_GET_VRING_BASE:
> > > >>>>   s.index = idx;
> > > >>>> -  s.num = vq->last_avail_idx;
> > > >>>> +  if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
> > > >>>> +  s.num = (u32)vq->last_avail_idx |
> > > >>>> ((u32)vq->last_used_idx << 16);
> > > >>>> +  else
> > > >>>> +  s.num = vq->last_avail_idx;
> > > >>>
> > > >>> The changes LGTM, but since we are changing the UAPI, should we
> > > >>> update the documentation of VHOST_SET_VRING_BASE and
> > > >>> VHOST_GET_VRING_BASE in include/uapi/linux/vhost.h?
> > > >>
> > > >> Correct me if I'm wrong, but I don't think we're changing anything in
> > > >> the UAPI here, just fixing code to work correctly with what is already
> > > >> happening.
> > > >
> > > > IIUC before this patch VHOST_GET_VRING_BASE and VHOST_SET_VRING_BASE
> > > > never worked with packed virtqueue, since we were only handling
> > > > last_avail_idx. Now we are supporting packed virtqueue, handling
> > > > in vhost_vring_state.num both last_avail_idx and last_used_idx (with
> > > > wrap counters).
> > > >
> > > > For example for VHOST_GET_VRING_BASE where is documented that the first
> > > > 15 bits are last_avail_idx, the 16th the avail_wrap_counter, and the
> > > > others are last_used_idx and used_wrap_counter?
> > > >
> > > > Maybe I missed something, but since this is UAPI, IMHO we should
> > > > document the parameters of ioctls at least in
> > > > include/uapi/linux/vhost.h.
> > >
> > > Perhaps Jason already has something written up that could be put in here
> > > from when he first added the wrap_counter a couple of years ago?
> >
> > If you meant the virtio driver support for packed, I think it's
> > different from the context which is vhost here.
> >
> > I agree with Stefano that we need to update the comments around
> > GET_VRING_BASE and SET_VRING_BASE, then we are fine.
>
> I'm wondering if we should also add a new VHOST_BACKEND_F_RING_PACKED
> feature (or something similar) to inform userspace that we are now
> able to handle packed virtqueues through the vhost IOCTLs; otherwise,
> how can userspace know whether it is supported?

I probably understand this, but I think it should be done via
VHOST_GET_FEATURES. It would be a burden if we maintained duplicated
features.

Thanks

>
> Thanks,
> Stefano
>
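
A sketch of the discovery path Jason suggests: userspace checks
VIRTIO_F_RING_PACKED in the bits returned by VHOST_GET_FEATURES instead
of relying on a new, duplicated backend feature (illustrative only):

#include <sys/ioctl.h>
#include <linux/types.h>
#include <linux/vhost.h>
#include <linux/virtio_config.h>	/* VIRTIO_F_RING_PACKED */

static int vhost_supports_packed(int vhost_fd)
{
	__u64 features;

	if (ioctl(vhost_fd, VHOST_GET_FEATURES, &features) < 0)
		return 0;
	/* Once this feature is offered and negotiated, the combined
	 * encoding of vhost_vring_state.num for
	 * VHOST_{GET,SET}_VRING_BASE applies.
	 */
	return !!(features & (1ULL << VIRTIO_F_RING_PACKED));
}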


Re: [PATCH vhost v9 01/12] virtio_ring: put mapping error check in vring_map_one_sg

2023-05-17 Thread Jason Wang
On Wed, May 17, 2023 at 10:23 AM Xuan Zhuo  wrote:
>
> This patch puts the dma addr error check in vring_map_one_sg().
>
> The benefits of doing this:
>
> 1. makes vring_map_one_sg() simpler, without needing to call
>    vring_mapping_error() to check the return value.
> 2. saves one check of vq->use_dma_api.

Code looks fine, but it's better to explain how it relates to (or
simplifies anything in) this series.

Thanks


>
> Signed-off-by: Xuan Zhuo 
> ---
>  drivers/virtio/virtio_ring.c | 37 +---
>  1 file changed, 22 insertions(+), 15 deletions(-)
>
> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> index c5310eaf8b46..c563215be6b9 100644
> --- a/drivers/virtio/virtio_ring.c
> +++ b/drivers/virtio/virtio_ring.c
> @@ -355,9 +355,8 @@ static struct device *vring_dma_dev(const struct 
> vring_virtqueue *vq)
>  }
>
>  /* Map one sg entry. */
> -static dma_addr_t vring_map_one_sg(const struct vring_virtqueue *vq,
> -  struct scatterlist *sg,
> -  enum dma_data_direction direction)
> +static int vring_map_one_sg(const struct vring_virtqueue *vq, struct 
> scatterlist *sg,
> +   enum dma_data_direction direction, static 
> dma_addr_t *addr)
>  {
> if (!vq->use_dma_api) {
> /*
> @@ -366,7 +365,8 @@ static dma_addr_t vring_map_one_sg(const struct 
> vring_virtqueue *vq,
>  * depending on the direction.
>  */
> kmsan_handle_dma(sg_page(sg), sg->offset, sg->length, 
> direction);
> -   return (dma_addr_t)sg_phys(sg);
> +   *addr = (dma_addr_t)sg_phys(sg);
> +   return 0;
> }
>
> /*
> @@ -374,9 +374,14 @@ static dma_addr_t vring_map_one_sg(const struct 
> vring_virtqueue *vq,
>  * the way it expects (we don't guarantee that the scatterlist
>  * will exist for the lifetime of the mapping).
>  */
> -   return dma_map_page(vring_dma_dev(vq),
> +   *addr = dma_map_page(vring_dma_dev(vq),
> sg_page(sg), sg->offset, sg->length,
> direction);
> +
> +   if (dma_mapping_error(vring_dma_dev(vq), *addr))
> +   return -ENOMEM;
> +
> +   return 0;
>  }
>
>  static dma_addr_t vring_map_single(const struct vring_virtqueue *vq,
> @@ -588,8 +593,9 @@ static inline int virtqueue_add_split(struct virtqueue 
> *_vq,
>
> for (n = 0; n < out_sgs; n++) {
> for (sg = sgs[n]; sg; sg = sg_next(sg)) {
> -   dma_addr_t addr = vring_map_one_sg(vq, sg, 
> DMA_TO_DEVICE);
> -   if (vring_mapping_error(vq, addr))
> +   dma_addr_t addr;
> +
> +   if (vring_map_one_sg(vq, sg, DMA_TO_DEVICE, &addr))
> goto unmap_release;
>
> prev = i;
> @@ -603,8 +609,9 @@ static inline int virtqueue_add_split(struct virtqueue 
> *_vq,
> }
> for (; n < (out_sgs + in_sgs); n++) {
> for (sg = sgs[n]; sg; sg = sg_next(sg)) {
> -   dma_addr_t addr = vring_map_one_sg(vq, sg, 
> DMA_FROM_DEVICE);
> -   if (vring_mapping_error(vq, addr))
> +   dma_addr_t addr;
> +
> +   if (vring_map_one_sg(vq, sg, DMA_FROM_DEVICE, &addr))
> goto unmap_release;
>
> prev = i;
> @@ -1279,9 +1286,8 @@ static int virtqueue_add_indirect_packed(struct 
> vring_virtqueue *vq,
>
> for (n = 0; n < out_sgs + in_sgs; n++) {
> for (sg = sgs[n]; sg; sg = sg_next(sg)) {
> -   addr = vring_map_one_sg(vq, sg, n < out_sgs ?
> -   DMA_TO_DEVICE : DMA_FROM_DEVICE);
> -   if (vring_mapping_error(vq, addr))
> +   if (vring_map_one_sg(vq, sg, n < out_sgs ?
> +DMA_TO_DEVICE : DMA_FROM_DEVICE, 
> &addr))
> goto unmap_release;
>
> desc[i].flags = cpu_to_le16(n < out_sgs ?
> @@ -1426,9 +1432,10 @@ static inline int virtqueue_add_packed(struct 
> virtqueue *_vq,
> c = 0;
> for (n = 0; n < out_sgs + in_sgs; n++) {
> for (sg = sgs[n]; sg; sg = sg_next(sg)) {
> -   dma_addr_t addr = vring_map_one_sg(vq, sg, n < 
> out_sgs ?
> -   DMA_TO_DEVICE : DMA_FROM_DEVICE);
> -   if (vring_mapping_error(vq, addr))
> +   dma_addr_t addr;
> +
> +   if (vring_map_one_sg(vq, sg, n < out_sgs ?
> +DMA_TO_DEVICE : DMA_FROM_DEVICE, 
> &addr))
> goto unmap_release;
>
> flags = cpu_to_le16(vq->packed.av

Re: [PATCH vhost v9 02/12] virtio_ring: simplify the reference of desc state inside detach_buf_split()

2023-05-17 Thread Jason Wang
On Wed, May 17, 2023 at 10:23 AM Xuan Zhuo  wrote:
>
> The purpose of this is to simplify the references to the descriptor
> state. It is convenient for subsequent commits.

It's better to be more verbose, e.g. explain how it simplifies the
following patches.

Thanks


>
> Signed-off-by: Xuan Zhuo 
> ---
>  drivers/virtio/virtio_ring.c | 12 +++-
>  1 file changed, 7 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> index c563215be6b9..479203346c36 100644
> --- a/drivers/virtio/virtio_ring.c
> +++ b/drivers/virtio/virtio_ring.c
> @@ -744,11 +744,14 @@ static bool virtqueue_kick_prepare_split(struct 
> virtqueue *_vq)
>  static void detach_buf_split(struct vring_virtqueue *vq, unsigned int head,
>  void **ctx)
>  {
> +   struct vring_desc_state_split *state;
> unsigned int i, j;
> __virtio16 nextflag = cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_NEXT);
>
> +   state = &vq->split.desc_state[head];
> +
> /* Clear data ptr. */
> -   vq->split.desc_state[head].data = NULL;
> +   state->data = NULL;
>
> /* Put back on free list: unmap first-level descriptors and find end 
> */
> i = head;
> @@ -767,8 +770,7 @@ static void detach_buf_split(struct vring_virtqueue *vq, 
> unsigned int head,
> vq->vq.num_free++;
>
> if (vq->indirect) {
> -   struct vring_desc *indir_desc =
> -   vq->split.desc_state[head].indir_desc;
> +   struct vring_desc *indir_desc = state->indir_desc;
> u32 len;
>
> /* Free the indirect table, if any, now that it's unmapped. */
> @@ -785,9 +787,9 @@ static void detach_buf_split(struct vring_virtqueue *vq, 
> unsigned int head,
> vring_unmap_one_split_indirect(vq, &indir_desc[j]);
>
> kfree(indir_desc);
> -   vq->split.desc_state[head].indir_desc = NULL;
> +   state->indir_desc = NULL;
> } else if (ctx) {
> -   *ctx = vq->split.desc_state[head].indir_desc;
> +   *ctx = state->indir_desc;
> }
>  }
>
> --
> 2.32.0.3.g01195cf9f
>


Re: [PATCH vhost v9 03/12] virtio_ring: check use_dma_api before unmap desc for indirect

2023-05-17 Thread Jason Wang
On Wed, May 17, 2023 at 10:23 AM Xuan Zhuo  wrote:
>
> Inside detach_buf_split(), if use_dma_api is false,
> vring_unmap_one_split_indirect() will be called many times, but
> actually nothing is done. So this patch checks use_dma_api first.
>
> Signed-off-by: Xuan Zhuo 

Acked-by: Jason Wang 

Thanks


> ---
>  drivers/virtio/virtio_ring.c | 6 --
>  1 file changed, 4 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> index 479203346c36..1ffab1eb40c0 100644
> --- a/drivers/virtio/virtio_ring.c
> +++ b/drivers/virtio/virtio_ring.c
> @@ -783,8 +783,10 @@ static void detach_buf_split(struct vring_virtqueue *vq, 
> unsigned int head,
> VRING_DESC_F_INDIRECT));
> BUG_ON(len == 0 || len % sizeof(struct vring_desc));
>
> -   for (j = 0; j < len / sizeof(struct vring_desc); j++)
> -   vring_unmap_one_split_indirect(vq, &indir_desc[j]);
> +   if (vq->use_dma_api) {
> +   for (j = 0; j < len / sizeof(struct vring_desc); j++)
> +   vring_unmap_one_split_indirect(vq, 
> &indir_desc[j]);
> +   }
>
> kfree(indir_desc);
> state->indir_desc = NULL;
> --
> 2.32.0.3.g01195cf9f
>


Re: [PATCH vhost v9 04/12] virtio_ring: virtqueue_add() support premapped

2023-05-17 Thread Jason Wang
On Wed, May 17, 2023 at 10:23 AM Xuan Zhuo  wrote:
>
> virtqueue_add() adds a premapped parameter.

I wonder if this patch is oversimplified. Maybe it can be squashed
with the patch that implements the premapped logic.

Thanks


>
> Signed-off-by: Xuan Zhuo 
> ---
>  drivers/virtio/virtio_ring.c | 9 +
>  1 file changed, 5 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> index 1ffab1eb40c0..e2fc50c05bec 100644
> --- a/drivers/virtio/virtio_ring.c
> +++ b/drivers/virtio/virtio_ring.c
> @@ -2135,6 +2135,7 @@ static inline int virtqueue_add(struct virtqueue *_vq,
> unsigned int in_sgs,
> void *data,
> void *ctx,
> +   bool premapped,
> gfp_t gfp)
>  {
> struct vring_virtqueue *vq = to_vvq(_vq);
> @@ -2176,7 +2177,7 @@ int virtqueue_add_sgs(struct virtqueue *_vq,
> total_sg++;
> }
> return virtqueue_add(_vq, sgs, total_sg, out_sgs, in_sgs,
> -data, NULL, gfp);
> +data, NULL, false, gfp);
>  }
>  EXPORT_SYMBOL_GPL(virtqueue_add_sgs);
>
> @@ -2198,7 +2199,7 @@ int virtqueue_add_outbuf(struct virtqueue *vq,
>  void *data,
>  gfp_t gfp)
>  {
> -   return virtqueue_add(vq, &sg, num, 1, 0, data, NULL, gfp);
> +   return virtqueue_add(vq, &sg, num, 1, 0, data, NULL, false, gfp);
>  }
>  EXPORT_SYMBOL_GPL(virtqueue_add_outbuf);
>
> @@ -2220,7 +2221,7 @@ int virtqueue_add_inbuf(struct virtqueue *vq,
> void *data,
> gfp_t gfp)
>  {
> -   return virtqueue_add(vq, &sg, num, 0, 1, data, NULL, gfp);
> +   return virtqueue_add(vq, &sg, num, 0, 1, data, NULL, false, gfp);
>  }
>  EXPORT_SYMBOL_GPL(virtqueue_add_inbuf);
>
> @@ -2244,7 +2245,7 @@ int virtqueue_add_inbuf_ctx(struct virtqueue *vq,
> void *ctx,
> gfp_t gfp)
>  {
> -   return virtqueue_add(vq, &sg, num, 0, 1, data, ctx, gfp);
> +   return virtqueue_add(vq, &sg, num, 0, 1, data, ctx, false, gfp);
>  }
>  EXPORT_SYMBOL_GPL(virtqueue_add_inbuf_ctx);
>
> --
> 2.32.0.3.g01195cf9f
>


Re: [PATCH vhost v9 05/12] virtio_ring: split: virtqueue_add_split() support premapped

2023-05-17 Thread Jason Wang
On Wed, May 17, 2023 at 10:23 AM Xuan Zhuo  wrote:
>
> virtqueue_add_split() only supports virtual addresses; DMA mapping is
> done inside virtqueue_add_split().
>
> In some scenarios (such as the AF_XDP scenario), the memory is allocated
> and DMA is completed in advance, so it is necessary for us to support
> passing the DMA address to virtqueue_add_split().
>
> Record this information in desc_state so that we can skip the unmap
> based on it when executing the DMA unmap.

I would also suggest documenting why per-descriptor metadata is
needed instead of a per-virtqueue flag.

>
> Signed-off-by: Xuan Zhuo 
> ---
>  drivers/virtio/virtio_ring.c | 38 +++-
>  1 file changed, 29 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> index e2fc50c05bec..bd5e84afab37 100644
> --- a/drivers/virtio/virtio_ring.c
> +++ b/drivers/virtio/virtio_ring.c
> @@ -70,6 +70,7 @@
>  struct vring_desc_state_split {
> void *data; /* Data for callback. */
> struct vring_desc *indir_desc;  /* Indirect descriptor, if any. */
> +   bool premapped; /* DMA mapping is done by driver. */

Going back to the original discussion around where this should be
placed: I wonder if we can find a common place to store this, since it
is unrelated to the virtqueue layout. Maybe desc_extra? And it would
be even better if we could avoid stressing the cache as above.

>  };
>
>  struct vring_desc_state_packed {
> @@ -356,8 +357,14 @@ static struct device *vring_dma_dev(const struct 
> vring_virtqueue *vq)
>
>  /* Map one sg entry. */
>  static int vring_map_one_sg(const struct vring_virtqueue *vq, struct 
> scatterlist *sg,
> -   enum dma_data_direction direction, static 
> dma_addr_t *addr)
> +   enum dma_data_direction direction,
> +   bool premapped, dma_addr_t *addr)

having things like:

int func(bool do)
{
if (!do)
return;
}

is a hint that the check needs to be done by the caller?

And this change should work for both packed and split. I think we need
to squash the packed changes here.

Looking at how packed virtqueue uses this in this patch, I don't think
this patch can even be built. I will wait for a new version and
continue the review from there.

Thanks



>  {
> +   if (premapped) {
> +   *addr = sg_dma_address(sg);
> +   return 0;
> +   }
> +
> if (!vq->use_dma_api) {
> /*
>  * If DMA is not used, KMSAN doesn't know that the scatterlist
> @@ -445,7 +452,7 @@ static void vring_unmap_one_split_indirect(const struct 
> vring_virtqueue *vq,
>  }
>
>  static unsigned int vring_unmap_one_split(const struct vring_virtqueue *vq,
> - unsigned int i)
> + unsigned int i, bool premapped)
>  {
> struct vring_desc_extra *extra = vq->split.desc_extra;
> u16 flags;
> @@ -462,6 +469,9 @@ static unsigned int vring_unmap_one_split(const struct 
> vring_virtqueue *vq,
>  (flags & VRING_DESC_F_WRITE) ?
>  DMA_FROM_DEVICE : DMA_TO_DEVICE);
> } else {
> +   if (premapped)
> +   goto out;
> +
> dma_unmap_page(vring_dma_dev(vq),
>extra[i].addr,
>extra[i].len,
> @@ -532,6 +542,7 @@ static inline int virtqueue_add_split(struct virtqueue 
> *_vq,
>   unsigned int in_sgs,
>   void *data,
>   void *ctx,
> + bool premapped,
>   gfp_t gfp)
>  {
> struct vring_virtqueue *vq = to_vvq(_vq);
> @@ -595,7 +606,7 @@ static inline int virtqueue_add_split(struct virtqueue 
> *_vq,
> for (sg = sgs[n]; sg; sg = sg_next(sg)) {
> dma_addr_t addr;
>
> -   if (vring_map_one_sg(vq, sg, DMA_TO_DEVICE, &addr))
> +   if (vring_map_one_sg(vq, sg, DMA_TO_DEVICE, 
> premapped, &addr))
> goto unmap_release;
>
> prev = i;
> @@ -611,7 +622,7 @@ static inline int virtqueue_add_split(struct virtqueue 
> *_vq,
> for (sg = sgs[n]; sg; sg = sg_next(sg)) {
> dma_addr_t addr;
>
> -   if (vring_map_one_sg(vq, sg, DMA_FROM_DEVICE, &addr))
> +   if (vring_map_one_sg(vq, sg, DMA_FROM_DEVICE, 
> premapped, &addr))
> goto unmap_release;
>
> prev = i;
> @@ -657,6 +668,7 @@ static inline int virtqueue_add_split(struct virtqueue 
> *_vq,
>
> /* Store token and indirect buffer state. */
> vq->split.
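
A driver-side sketch of the flow this series enables: the driver maps
the buffer itself, records the DMA address in the scatterlist, and asks
the ring to neither map nor unmap it. The wrapper name below is
hypothetical; in the series the flag is plumbed as the premapped
parameter of virtqueue_add_*():

/* Sketch: caller-managed DMA, as in the AF_XDP case described above. */
static int queue_premapped_buf(struct virtqueue *vq, struct device *dma_dev,
			       void *buf, size_t len, void *data)
{
	struct scatterlist sg;
	dma_addr_t addr;

	addr = dma_map_single(dma_dev, buf, len, DMA_TO_DEVICE);
	if (dma_mapping_error(dma_dev, addr))
		return -ENOMEM;

	sg_init_one(&sg, buf, len);
	sg_dma_address(&sg) = addr;	/* consumed via sg_dma_address() */
	sg_dma_len(&sg) = len;

	/* Hypothetical premapped entry point: the ring records the
	 * address but skips mapping, and skips dma_unmap on detach too.
	 */
	return virtqueue_add_outbuf_premapped(vq, &sg, 1, data);
}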

Re: [PATCH v2 2/3] vhost: support PACKED when setting-getting vring_base

2023-05-18 Thread Jason Wang
On Thu, May 18, 2023 at 3:34 PM Stefano Garzarella  wrote:
>
> On Thu, May 18, 2023 at 7:24 AM Jason Wang  wrote:
> >
> > On Wed, May 17, 2023 at 3:00 PM Stefano Garzarella  
> > wrote:
> > >
> > > On Wed, May 17, 2023 at 7:26 AM Jason Wang  wrote:
> > > >
> > > > On Wed, May 17, 2023 at 2:26 AM Shannon Nelson  
> > > > wrote:
> > > > >
> > > > > On 5/16/23 12:49 AM, Stefano Garzarella wrote:
> > > > > > On Mon, May 15, 2023 at 01:41:12PM -0700, Shannon Nelson wrote:
> > > > > >> On 5/9/23 1:46 AM, Stefano Garzarella wrote:
> > > > > >>> On Mon, Apr 24, 2023 at 03:50:30PM -0700, Shannon Nelson via
> > > > > >>> Virtualization wrote:
> > > > > >>>> Use the right structs for PACKED or split vqs when setting and
> > > > > >>>> getting the vring base.
> > > > > >>>>
> > > > > >>>> Signed-off-by: Shannon Nelson 
> > > > > >>>> ---
> > > > > >>>> drivers/vhost/vhost.c | 18 +-
> > > > > >>>> drivers/vhost/vhost.h |  8 ++--
> > > > > >>>> 2 files changed, 19 insertions(+), 7 deletions(-)
> > > > > >>>>
> > > > > >>>> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> > > > > >>>> index f11bdbe4c2c5..f64efda48f21 100644
> > > > > >>>> --- a/drivers/vhost/vhost.c
> > > > > >>>> +++ b/drivers/vhost/vhost.c
> > > > > >>>> @@ -1633,17 +1633,25 @@ long vhost_vring_ioctl(struct vhost_dev
> > > > > >>>> *d, unsigned int ioctl, void __user *arg
> > > > > >>>>   r = -EFAULT;
> > > > > >>>>   break;
> > > > > >>>>   }
> > > > > >>>> -  if (s.num > 0x) {
> > > > > >>>> -  r = -EINVAL;
> > > > > >>>> -  break;
> > > > > >>>> +  if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED)) {
> > > > > >>>> +  vq->last_avail_idx = s.num & 0x;
> > > > > >>>> +  vq->last_used_idx = (s.num >> 16) & 
> > > > > >>>> 0x;
> > > > > >>>> +  } else {
> > > > > >>>> +  if (s.num > 0x) {
> > > > > >>>> +  r = -EINVAL;
> > > > > >>>> +  break;
> > > > > >>>> +  }
> > > > > >>>> +  vq->last_avail_idx = s.num;
> > > > > >>>>   }
> > > > > >>>> -  vq->last_avail_idx = s.num;
> > > > > >>>>   /* Forget the cached index value. */
> > > > > >>>>   vq->avail_idx = vq->last_avail_idx;
> > > > > >>>>   break;
> > > > > >>>>   case VHOST_GET_VRING_BASE:
> > > > > >>>>   s.index = idx;
> > > > > >>>> -  s.num = vq->last_avail_idx;
> > > > > >>>> +  if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
> > > > > >>>> +  s.num = (u32)vq->last_avail_idx |
> > > > > >>>> ((u32)vq->last_used_idx << 16);
> > > > > >>>> +  else
> > > > > >>>> +  s.num = vq->last_avail_idx;
> > > > > >>>
> > > > > >>> The changes LGTM, but since we are changing the UAPI, should we
> > > > > >>> update the documentation of VHOST_SET_VRING_BASE and
> > > > > >>> VHOST_GET_VRING_BASE in include/uapi/linux/vhost.h?
> > > > > >>
> > > > > >> Correct me if I'm wrong, but I don't think we're changing anything 
> > > > > >> in
> > > > > >> the UAPI here, just fixing code to work correctly with what is 
> > > &

Re: [PATCH vhost v9 05/12] virtio_ring: split: virtqueue_add_split() support premapped

2023-05-18 Thread Jason Wang
On Thu, May 18, 2023 at 3:41 PM Xuan Zhuo  wrote:
>
> On Thu, 18 May 2023 03:11:25 -0400, "Michael S. Tsirkin"  
> wrote:
> > On Thu, May 18, 2023 at 02:51:57PM +0800, Jason Wang wrote:
> > > On Wed, May 17, 2023 at 10:23 AM Xuan Zhuo  
> > > wrote:
> > > >
> > > > virtqueue_add_split() only supports virtual addresses, dma is completed
> > > > in virtqueue_add_split().
> > > >
> > > > In some scenarios (such as the AF_XDP scenario), the memory is allocated
> > > > and DMA is completed in advance, so it is necessary for us to support
> > > > passing the DMA address to virtqueue_add_split().
> > > >
> > > > Record this information in desc_state, we can skip unmap based on this
> > > > when executing dma unmap.
> > >
> > > I would also suggest documenting why a per descriptor metadata is
> > > needed instead of a per virtqueue one.
> >
> > I think we could make it per virtqueue. That would mean all code in
> > virtio net would have to change to do dma mapping itself instead of
> > relying on virtio core though.  Which is maybe a good idea? Definitely a
> > very intrusive change though, will need a lot of performance testing
> > to make sure we don't break anything.
>
> In fact, we have tried this idea.
>
> The problem is the detach and unmap.
>
> We need to get all DMA addresses back from the virtio ring to unmap.
> Currently, it does not support returning the DMA address,

I'm not sure I get this, but we've already stored the DMA address in desc_extra?

> and for SKB, we need to get multiple DMA
> addresses at one time.

Could you elaborate on this?

Thanks

>
> This needs changes to the virtio-ring detach logic. Besides this, I
> also agree with this idea.
>
> Thanks.
>
>
> >
> >
> >
> >
> > > >
> > > > Signed-off-by: Xuan Zhuo 
> > > > ---
> > > >  drivers/virtio/virtio_ring.c | 38 +++-
> > > >  1 file changed, 29 insertions(+), 9 deletions(-)
> > > >
> > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > > index e2fc50c05bec..bd5e84afab37 100644
> > > > --- a/drivers/virtio/virtio_ring.c
> > > > +++ b/drivers/virtio/virtio_ring.c
> > > > @@ -70,6 +70,7 @@
> > > >  struct vring_desc_state_split {
> > > > void *data; /* Data for callback. */
> > > > struct vring_desc *indir_desc;  /* Indirect descriptor, if any. 
> > > > */
> > > > +   bool premapped; /* DMA mapping is done by 
> > > > driver. */
> > >
> > > Going back to the original discussion around where this should be
> > > placed. I wonder if we can find a common place to store this since it
> > > has nothing related to virtqueue layout. Maybe desc_extra? And it
> > > would be even better if we can avoid stressing the cache like above.
> > >
> > > >  };
> > > >
> > > >  struct vring_desc_state_packed {
> > > > @@ -356,8 +357,14 @@ static struct device *vring_dma_dev(const struct 
> > > > vring_virtqueue *vq)
> > > >
> > > >  /* Map one sg entry. */
> > > >  static int vring_map_one_sg(const struct vring_virtqueue *vq, struct 
> > > > scatterlist *sg,
> > > > -   enum dma_data_direction direction, static 
> > > > dma_addr_t *addr)
> > > > +   enum dma_data_direction direction,
> > > > +   bool premapped, dma_addr_t *addr)
> > >
> > > having things like:
> > >
> > > int func(bool do)
> > > {
> > > if (!do)
> > > return;
> > > }
> > >
> > > is a hint that the check needs to be done by the caller?
> > >
> > > And this change should work for both packed and split. I think we need
> > > to squash the packed changes here.
> > >
> > > Looking at how packed virtqueue uses this in this patch, I don't think
> > > this patch can even be built. I will wait for a new version and
> > > continue the review from there.
> > >
> > > Thanks
> > >
> > >
> > >
> > > >  {
> > > > +   if (premapped) {
> > > > +   *addr = sg_dma_address(sg);
> > > > +   return 0;
> > > > +
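
The detach problem Xuan describes, sketched as an API gap: if the
driver rather than the ring owns the mappings, detach must be able to
hand back every DMA address of a multi-descriptor buffer (e.g. an skb
with frags) so the driver can unmap them. Hypothetical shape, not an
existing virtio_ring function:

struct vring_dma_info {
	dma_addr_t addr;
	u32 len;
};

/* Hypothetical get_buf variant: returns the token as usual, and fills
 * up to *num_dma entries with the DMA addresses the ring recorded in
 * desc_extra for this buffer, so a premapped-only driver can unmap.
 */
void *virtqueue_get_buf_dma(struct virtqueue *vq, unsigned int *len,
			    struct vring_dma_info *dma, int *num_dma);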

Re: [PATCH vhost v9 05/12] virtio_ring: split: virtqueue_add_split() support premapped

2023-05-18 Thread Jason Wang
On Thu, May 18, 2023 at 4:29 PM Michael S. Tsirkin  wrote:
>
> On Thu, May 18, 2023 at 03:33:52PM +0800, Xuan Zhuo wrote:
> > On Thu, 18 May 2023 03:11:25 -0400, "Michael S. Tsirkin"  
> > wrote:
> > > On Thu, May 18, 2023 at 02:51:57PM +0800, Jason Wang wrote:
> > > > On Wed, May 17, 2023 at 10:23 AM Xuan Zhuo  
> > > > wrote:
> > > > >
> > > > > virtqueue_add_split() only supports virtual addresses; the DMA mapping
> > > > > is done inside virtqueue_add_split().
> > > > >
> > > > > In some scenarios (such as the AF_XDP scenario), the memory is allocated
> > > > > and DMA-mapped in advance, so it is necessary for us to support
> > > > > passing the DMA address to virtqueue_add_split().
> > > > >
> > > > > Record this information in desc_state so that we can skip the unmap
> > > > > based on it when executing DMA unmap.
> > > >
> > > > I would also suggest documenting why a per descriptor metadata is
> > > > needed instead of a per virtqueue one.
> > >
> > > I think we could make it per virtqueue. That would mean all code in
> > > virtio net would have to change to do dma mapping itself instead of
> > > relying on virtio core though.  Which is maybe a good idea? Definitely a
> > > very intrusive change though, will need a lot of performance testing
> > > to make sure we don't break anything.
> >
> > In fact, we have tried this idea.
> >
> > The problem is the detach and unmap.
> >
> > We need to get all DMA addresses from the virtio ring to unmap. Currently,
> > it does not support returning the DMA address, and for SKB we need to get
> > multiple DMA addresses at one time.
> >
> > This needs changes to the virtio-ring detach logic. Besides this, I also
> > agree with this idea.
> >
> > Thanks.
>
> Well you can have a version of get_buf that returns them ... but
> it is not clear to me all this is worth it unless you want
> to do unsafe tricks like leaving them mapped.

Some high speed NIC drivers use this trick for better performance.

> I'd leave that
> for another day maybe.
>
> For marking desc as premapped I think we can use a bit from
> desc_extra->flags, either reusing one of NEXT,AVAIL,USED, or stealing
> another one.

Probably.
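
A minimal sketch of that direction, assuming a currently-unused bit in
desc_extra->flags can be stolen (the bit name and value here are
hypothetical):

    /* hypothetical: a flags bit not used by NEXT/AVAIL/USED */
    #define VRING_DESC_F_PREMAPPED (1 << 4)

    /* add path: the driver did the DMA mapping itself */
    vq->split.desc_extra[i].flags |= VRING_DESC_F_PREMAPPED;

    /* unmap path: skip descriptors the driver mapped itself */
    if (extra->flags & VRING_DESC_F_PREMAPPED)
            return;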

Thanks

>
>
>
> >
> > >
> > >
> > >
> > >
> > > > >
> > > > > Signed-off-by: Xuan Zhuo 
> > > > > ---
> > > > >  drivers/virtio/virtio_ring.c | 38 
> > > > > +++-
> > > > >  1 file changed, 29 insertions(+), 9 deletions(-)
> > > > >
> > > > > diff --git a/drivers/virtio/virtio_ring.c 
> > > > > b/drivers/virtio/virtio_ring.c
> > > > > index e2fc50c05bec..bd5e84afab37 100644
> > > > > --- a/drivers/virtio/virtio_ring.c
> > > > > +++ b/drivers/virtio/virtio_ring.c
> > > > > @@ -70,6 +70,7 @@
> > > > >  struct vring_desc_state_split {
> > > > > void *data; /* Data for callback. */
> > > > > struct vring_desc *indir_desc;  /* Indirect descriptor, if 
> > > > > any. */
> > > > > +   bool premapped; /* DMA mapping is done by 
> > > > > driver. */
> > > >
> > > > Going back to the original discussion around where this should be
> > > > placed. I wonder if we can find a common place to store this since it
> > > > has nothing related to virtqueue layout. Maybe desc_extra? And it
> > > > would be even better if we can avoid stressing the cache like above.
> > > >
> > > > >  };
> > > > >
> > > > >  struct vring_desc_state_packed {
> > > > > @@ -356,8 +357,14 @@ static struct device *vring_dma_dev(const struct 
> > > > > vring_virtqueue *vq)
> > > > >
> > > > >  /* Map one sg entry. */
> > > > > >  static int vring_map_one_sg(const struct vring_virtqueue *vq, struct scatterlist *sg,
> > > > > > -   enum dma_data_direction direction, dma_addr_t *addr)
> > > > > +   enum dma_data_direction direction,
> > > > > + 

Re: [PATCH vhost v9 05/12] virtio_ring: split: virtqueue_add_split() support premapped

2023-05-18 Thread Jason Wang
On Thu, May 18, 2023 at 3:57 PM Xuan Zhuo  wrote:
>
> On Thu, 18 May 2023 15:54:09 +0800, Jason Wang  wrote:
> > On Thu, May 18, 2023 at 3:41 PM Xuan Zhuo  
> > wrote:
> > >
> > > On Thu, 18 May 2023 03:11:25 -0400, "Michael S. Tsirkin" 
> > >  wrote:
> > > > On Thu, May 18, 2023 at 02:51:57PM +0800, Jason Wang wrote:
> > > > > On Wed, May 17, 2023 at 10:23 AM Xuan Zhuo 
> > > > >  wrote:
> > > > > >
> > > > > > virtqueue_add_split() only supports virtual addresses; the DMA
> > > > > > mapping is done inside virtqueue_add_split().
> > > > > >
> > > > > > In some scenarios (such as the AF_XDP scenario), the memory is
> > > > > > allocated and DMA-mapped in advance, so it is necessary for us to
> > > > > > support passing the DMA address to virtqueue_add_split().
> > > > > >
> > > > > > Record this information in desc_state so that we can skip the unmap
> > > > > > based on it when executing DMA unmap.
> > > > >
> > > > > I would also suggest documenting why a per descriptor metadata is
> > > > > needed instead of a per virtqueue one.
> > > >
> > > > I think we could make it per virtqueue. That would mean all code in
> > > > virtio net would have to change to do dma mapping itself instead of
> > > > relying on virtio core though.  Which is maybe a good idea? Definitely a
> > > > very intrusive change though, will need a lot of performance testing
> > > > to make sure we don't break anything.
> > >
> > > In fact, we have tried this idea.
> > >
> > > The problem is the detach and unmap.
> > >
> > > We need to get all DMA addresses from the virtio ring to unmap.
> > > Currently, it does not support returning the DMA address,
> >
> > I'm not sure I got here, but we've already stored the DMA address in 
> > desc_extra?
>
>
> I mean we need to get the DMA address from the virtio core into virtio-net.
>

It probably just requires a new helper.
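
Something along these lines, perhaps (the helper name and signature are
hypothetical, sketching the idea rather than an existing API):

    /* like virtqueue_get_buf(), but also report the DMA address/length
     * the core recorded at add time, so the driver can unmap by itself */
    void *virtqueue_get_buf_dma(struct virtqueue *vq, unsigned int *len,
                                dma_addr_t *addr, u32 *dma_len);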

Thanks

> Thanks.
>
>
> >
> > > and for SKB, we need to get multiple DMA
> > > Addresses at one time.
> >
> > Could you elaborate on this?
> >
> > Thanks
> >
> > >
> > > This needs changes to the virtio-ring detach logic. Besides this, I also
> > > agree with this idea.
> > >
> > > Thanks.
> > >
> > >
> > > >
> > > >
> > > >
> > > >
> > > > > >
> > > > > > Signed-off-by: Xuan Zhuo 
> > > > > > ---
> > > > > >  drivers/virtio/virtio_ring.c | 38 
> > > > > > +++-
> > > > > >  1 file changed, 29 insertions(+), 9 deletions(-)
> > > > > >
> > > > > > diff --git a/drivers/virtio/virtio_ring.c 
> > > > > > b/drivers/virtio/virtio_ring.c
> > > > > > index e2fc50c05bec..bd5e84afab37 100644
> > > > > > --- a/drivers/virtio/virtio_ring.c
> > > > > > +++ b/drivers/virtio/virtio_ring.c
> > > > > > @@ -70,6 +70,7 @@
> > > > > >  struct vring_desc_state_split {
> > > > > > void *data; /* Data for callback. */
> > > > > > struct vring_desc *indir_desc;  /* Indirect descriptor, if 
> > > > > > any. */
> > > > > > +   bool premapped; /* DMA mapping is done by 
> > > > > > driver. */
> > > > >
> > > > > Going back to the original discussion around where this should be
> > > > > placed. I wonder if we can find a common place to store this since it
> > > > > has nothing related to virtqueue layout. Maybe desc_extra? And it
> > > > > would be even better if we can avoid stressing the cache like above.
> > > > >
> > > > > >  };
> > > > > >
> > > > > >  struct vring_desc_state_packed {
> > > > > > @@ -356,8 +357,14 @@ static struct device *vring_dma_dev(const 
> > > > > > struct vring_virtqueue *vq)
> > > > > >
> > > > > >  /* Map one sg entry. */
> 

Re: [PATCH v2 2/3] vhost: support PACKED when setting-getting vring_base

2023-05-18 Thread Jason Wang
On Thu, May 18, 2023 at 4:38 PM Michael S. Tsirkin  wrote:
>
> On Thu, May 18, 2023 at 03:52:10PM +0800, Jason Wang wrote:
> > On Thu, May 18, 2023 at 3:34 PM Stefano Garzarella  
> > wrote:
> > >
> > > On Thu, May 18, 2023 at 7:24 AM Jason Wang  wrote:
> > > >
> > > > On Wed, May 17, 2023 at 3:00 PM Stefano Garzarella 
> > > >  wrote:
> > > > >
> > > > > On Wed, May 17, 2023 at 7:26 AM Jason Wang  
> > > > > wrote:
> > > > > >
> > > > > > On Wed, May 17, 2023 at 2:26 AM Shannon Nelson 
> > > > > >  wrote:
> > > > > > >
> > > > > > > On 5/16/23 12:49 AM, Stefano Garzarella wrote:
> > > > > > > > On Mon, May 15, 2023 at 01:41:12PM -0700, Shannon Nelson wrote:
> > > > > > > >> On 5/9/23 1:46 AM, Stefano Garzarella wrote:
> > > > > > > >>> On Mon, Apr 24, 2023 at 03:50:30PM -0700, Shannon Nelson via
> > > > > > > >>> Virtualization wrote:
> > > > > > > >>>> Use the right structs for PACKED or split vqs when setting 
> > > > > > > >>>> and
> > > > > > > >>>> getting the vring base.
> > > > > > > >>>>
> > > > > > > >>>> Signed-off-by: Shannon Nelson 
> > > > > > > >>>> ---
> > > > > > > >>>> drivers/vhost/vhost.c | 18 +-
> > > > > > > >>>> drivers/vhost/vhost.h |  8 ++--
> > > > > > > >>>> 2 files changed, 19 insertions(+), 7 deletions(-)
> > > > > > > >>>>
> > > > > > > >>>> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> > > > > > > >>>> index f11bdbe4c2c5..f64efda48f21 100644
> > > > > > > >>>> --- a/drivers/vhost/vhost.c
> > > > > > > >>>> +++ b/drivers/vhost/vhost.c
> > > > > > > >>>> @@ -1633,17 +1633,25 @@ long vhost_vring_ioctl(struct 
> > > > > > > >>>> vhost_dev
> > > > > > > >>>> *d, unsigned int ioctl, void __user *arg
> > > > > > > >>>>   r = -EFAULT;
> > > > > > > >>>>   break;
> > > > > > > >>>>   }
> > > > > > > >>>> -  if (s.num > 0xffff) {
> > > > > > > >>>> -  r = -EINVAL;
> > > > > > > >>>> -  break;
> > > > > > > >>>> +  if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED)) {
> > > > > > > >>>> +  vq->last_avail_idx = s.num & 0xffff;
> > > > > > > >>>> +  vq->last_used_idx = (s.num >> 16) & 0xffff;
> > > > > > > >>>> +  } else {
> > > > > > > >>>> +  if (s.num > 0xffff) {
> > > > > > > >>>> +  r = -EINVAL;
> > > > > > > >>>> +  break;
> > > > > > > >>>> +  }
> > > > > > > >>>> +  vq->last_avail_idx = s.num;
> > > > > > > >>>>   }
> > > > > > > >>>> -  vq->last_avail_idx = s.num;
> > > > > > > >>>>   /* Forget the cached index value. */
> > > > > > > >>>>   vq->avail_idx = vq->last_avail_idx;
> > > > > > > >>>>   break;
> > > > > > > >>>>   case VHOST_GET_VRING_BASE:
> > > > > > > >>>>   s.index = idx;
> > > > > > > >>>> -  s.num = vq->last_avail_idx;
> > > > > > > >>>> +  if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
> > > > &
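
For reference, the encoding implied by the hunk above simply packs both
16-bit indexes of the packed ring into the single 32-bit s.num (variable
names here are illustrative):

    /* VHOST_SET_VRING_BASE payload for a packed ring */
    s.num = last_avail_idx | ((__u32)last_used_idx << 16);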

Re: [RFC PATCH v2 3/3] PCI: endpoint: Add EP function driver to provide virtio-console functionality

2023-05-18 Thread Jason Wang
On Thu, May 18, 2023 at 5:54 PM Shunsuke Mie  wrote:
>
> Gentle ping ...
>
>
> Thanks,
>
> Shunsuke.
>
> On 2023/05/10 12:17, Shunsuke Mie wrote:
> > Hi Jason,
> > 2023年5月8日(月) 13:03 Jason Wang :
> >> On Thu, Apr 27, 2023 at 6:44 PM Shunsuke Mie  wrote:
> >>> Add a new PCIe endpoint function driver that works as a PCI virtio-console
> >>> device. The console connects to the endpoint-side console, enabling
> >>> communication between the PCIe host and the endpoint.
> >>>
> >>> The architecture is as follows:
> >>>
> >>>   ┌┐ ┌──┬┐
> >>>   │virtio  │ │  │virtio  │
> >>>   │console drv │ ├───┐  │console drv │
> >>>   ├┤ │(virtio console│  ├┤
> >>>   │ virtio bus │ │ device)   │◄►│ virtio bus │
> >>>   ├┤ ├---┤  └┤
> >>>   ││ │ pci ep virtio │   │
> >>>   │  pci bus   │ │  console drv  │   │
> >>>   ││  pcie   ├───┤   │
> >>>   ││ ◄─► │  pci ep Bus   │   │
> >>>   └┘ └───┴───┘
> >>> PCIe Root  PCIe Endpoint
> >>>
> >> I think it might only work for peer devices like:
> >>
> >> net, console or vsock.
> > Could you tell me what "peer devices" means?

I meant that, for example, in the case of virtio-net, TX can talk
directly with the RX belonging to another device.

But this is not the case for other devices like virtio-blk.

> >
> >> So there're many choices here, I'd like to know what's the reason for
> >> you to implement a mediation.
> >>
> >> An alternative is to implement a dedicated net, console and vsock
> >> driver for vringh (CAIF somehow works like this). This would have
> >> better performance.
> > Does it mean that the driver also functions as a network driver directly?

I meant, e.g. in the case of networking, you can have a dedicated
driver with two vringh rings on the endpoint side.

The benefit is the performance, no need for the (datapath) mediation.

But if we don't care about the performance, this proposal seems to be fine.

Thanks

> >>
> >>> This driver has two roles. The first is as a PCIe endpoint virtio console
> >>> function, which is implemented using the PCIe endpoint framework and PCIe
> >>> EP virtio helpers. The second is as a virtual virtio console device
> >>> connected to the virtio bus on PCIe endpoint Linux.
> >>>
> >>> Communication between the two is achieved by copying the virtqueue data
> >>> between the PCIe root and the endpoint.
> >>>
> >>> This is a simple implementation and does not include features of
> >>> virtio-console such as MULTIPORT, EMERG_WRITE, etc. As a result, each
> >>> virtio console driver only displays /dev/hvc0.
> >>>
> >>> As an example of usage, by setting getty to /dev/hvc0, it is possible to
> >>> login to another host.
> >>>
> >>> Signed-off-by: Shunsuke Mie 
> >>> ---
> >>> Changes from v2:
> >>> - Change to use copy functions between kiovs of pci-epf-virtio.
> >>>
> >>>   drivers/pci/endpoint/functions/Kconfig|  12 +
> >>>   drivers/pci/endpoint/functions/Makefile   |   1 +
> >>>   drivers/pci/endpoint/functions/pci-epf-vcon.c | 596 ++
> >>>   3 files changed, 609 insertions(+)
> >>>   create mode 100644 drivers/pci/endpoint/functions/pci-epf-vcon.c
> >>>
> >>> diff --git a/drivers/pci/endpoint/functions/Kconfig 
> >>> b/drivers/pci/endpoint/functions/Kconfig
> >>> index fa1a6a569a8f..9ce2698b67e1 100644
> >>> --- a/drivers/pci/endpoint/functions/Kconfig
> >>> +++ b/drivers/pci/endpoint/functions/Kconfig
> >>> @@ -44,3 +44,15 @@ config PCI_EPF_VIRTIO
> >>>  select VHOST_RING_IOMEM
> >>>  help
> >>>Helpers to implement PCI virtio Endpoint function
> >>> +
> >>> +config PCI_EPF_VCON
> >>> +   tristate "PCI Endpoint virtio-console driver"
> >>> +   depends on PCI_ENDPOINT
> >>> +   select VHOST_RING
> 

Re: [PATCH vhost v9 05/12] virtio_ring: split: virtqueue_add_split() support premapped

2023-05-18 Thread Jason Wang


On 2023/5/18 17:49, Michael S. Tsirkin wrote:

On Thu, May 18, 2023 at 05:14:03PM +0800, Xuan Zhuo wrote:

On Thu, 18 May 2023 16:57:37 +0800, Jason Wang  wrote:

On Thu, May 18, 2023 at 4:29 PM Michael S. Tsirkin  wrote:

On Thu, May 18, 2023 at 03:33:52PM +0800, Xuan Zhuo wrote:

On Thu, 18 May 2023 03:11:25 -0400, "Michael S. Tsirkin"  
wrote:

On Thu, May 18, 2023 at 02:51:57PM +0800, Jason Wang wrote:

On Wed, May 17, 2023 at 10:23 AM Xuan Zhuo  wrote:

virtqueue_add_split() only supports virtual addresses; the DMA mapping is done
inside virtqueue_add_split().

In some scenarios (such as the AF_XDP scenario), the memory is allocated and
DMA-mapped in advance, so it is necessary for us to support passing the DMA
address to virtqueue_add_split().

Record this information in desc_state so that we can skip the unmap based on it
when executing DMA unmap.
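
A sketch of the intended caller side under this scheme (illustrative only;
how a premapped buffer is signalled to the core is exactly what is being
discussed in this thread):

    /* the driver has already DMA-mapped the buffer (e.g. an AF_XDP frame) */
    sg_init_table(sg, 1);
    sg_dma_address(sg) = dma_addr;  /* filled by the driver, not the core */
    sg_dma_len(sg) = len;
    /* virtqueue_add_split() then skips vring_map_one_sg() for this entry */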

I would also suggest documenting why a per descriptor metadata is
needed instead of a per virtqueue one.

I think we could make it per virtqueue. That would mean all code in
virtio net would have to change to do dma mapping itself instead of
relying on virtio core though.  Which is maybe a good idea? Definitely a
very intrusive change though, will need a lot of performance testing
to make sure we don't break anything.

In fact, we have tried this idea.

The problem is the detach and unmap.

We need to get all DMA addresses from the virtio ring to unmap. Currently, it
does not support returning the DMA address, and for SKB we need to get multiple
DMA addresses at one time.

This needs changes to the virtio-ring detach logic. Besides this, I also agree
with this idea.

Thanks.

Well you can have a version of get_buf that returns them ... but
it is not clear to me all this is worth it unless you want
to do unsafe tricks like leaving them mapped.

Some high speed NIC drivers use this trick for better performance.


Interesting, this is the first time I've heard of this. Is there any problem?

It depends. If you are relying on the IOMMU then yes: malicious hardware
can steal guest secrets or corrupt memory, since it's a hack not properly
integrated with Linux, and there's no real control preventing Linux from
reusing this memory for something unrelated.



The pages are pre-allocated/mapped buffers for RX. So it should be fine.
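
For context, the trick being referred to usually looks roughly like this (a
generic sketch, not code from any particular driver):

    /* map the RX buffer once, when the ring is set up */
    addr = dma_map_page(dev, page, 0, PAGE_SIZE, DMA_FROM_DEVICE);

    /* per packet: transfer ownership with syncs only, no unmap/remap */
    dma_sync_single_for_cpu(dev, addr, len, DMA_FROM_DEVICE);
    /* ... CPU consumes the packet, then the buffer is recycled ... */
    dma_sync_single_for_device(dev, addr, PAGE_SIZE, DMA_FROM_DEVICE);

    /* unmap only at ring teardown */
    dma_unmap_page(dev, addr, PAGE_SIZE, DMA_FROM_DEVICE);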

Thanks



If instead you are using something like bounce buffers then no, but OTOH
bounce buffers are already expensive so you might not see a lot
of benefit.


So, is having virtio-net handle the DMA operations by itself the right way?

Thanks

I am fine with the approach taken for now. We can look at reducing the
cost of DMA map/unmap later.




I'd leave that
for another day maybe.

For marking desc as premapped I think we can use a bit from
desc_extra->flags, either reusing one of NEXT,AVAIL,USED, or stealing
another one.

Probably.

Thanks









Signed-off-by: Xuan Zhuo 
---
  drivers/virtio/virtio_ring.c | 38 +++-
  1 file changed, 29 insertions(+), 9 deletions(-)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index e2fc50c05bec..bd5e84afab37 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -70,6 +70,7 @@
  struct vring_desc_state_split {
 void *data; /* Data for callback. */
 struct vring_desc *indir_desc;  /* Indirect descriptor, if any. */
+   bool premapped; /* DMA mapping is done by driver. */

Going back to the original discussion around where this should be
placed. I wonder if we can find a common place to store this since it
has nothing related to virtqueue layout. Maybe desc_extra? And it
would be even better if we can avoid stressing the cache like above.


  };

  struct vring_desc_state_packed {
@@ -356,8 +357,14 @@ static struct device *vring_dma_dev(const struct 
vring_virtqueue *vq)

  /* Map one sg entry. */
  static int vring_map_one_sg(const struct vring_virtqueue *vq, struct scatterlist *sg,
-   enum dma_data_direction direction, dma_addr_t *addr)
+   enum dma_data_direction direction,
+   bool premapped, dma_addr_t *addr)

having things like:

int func(bool do)
{
if (!do)
 return;
}

is a hint that the check needs to be done by the caller?
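
In other words, the add path could do the check itself, along these lines (a
sketch of the suggested restructuring, not the final code):

    /* the caller decides; the map helper keeps its original signature */
    if (premapped)
            addr = sg_dma_address(sg);
    else if (vring_map_one_sg(vq, sg, direction, &addr))
            goto unmap_release;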

And this change should work for both packed and split. I think we need
to squash the packed changes here.

Looking at how packed virtqueue uses this in this patch, I don't think
this patch can even be built. I will wait for a new version and
continue the review from there.

Thanks




  {
+   if (premapped) {
+   *addr = sg_dma_address(sg);
+   return 0;
+   }
+
 if (!vq->use_dma_api) {
 /*
  * If DMA is not used, KMSAN doesn't know that the scatterlist
@@ -445,7 +452,7 @@ static void vring_unmap_one_split_indirect(const struct 
vring_virtqueue *vq,
  }

  s

Re: [PATCH vhost v9 05/12] virtio_ring: split: virtqueue_add_split() support premapped

2023-05-18 Thread Jason Wang
On Fri, May 19, 2023 at 11:33 AM Xuan Zhuo  wrote:
>
> On Thu, 18 May 2023 13:12:49 -0400, "Michael S. Tsirkin"  
> wrote:
> > On Thu, May 18, 2023 at 08:22:14PM +0800, Xuan Zhuo wrote:
> > > On Thu, 18 May 2023 05:49:46 -0400, "Michael S. Tsirkin" 
> > >  wrote:
> > > > On Thu, May 18, 2023 at 05:14:03PM +0800, Xuan Zhuo wrote:
> > > > > On Thu, 18 May 2023 16:57:37 +0800, Jason Wang  
> > > > > wrote:
> > > > > > On Thu, May 18, 2023 at 4:29 PM Michael S. Tsirkin 
> > > > > >  wrote:
> > > > > > >
> > > > > > > On Thu, May 18, 2023 at 03:33:52PM +0800, Xuan Zhuo wrote:
> > > > > > > > On Thu, 18 May 2023 03:11:25 -0400, "Michael S. Tsirkin" 
> > > > > > > >  wrote:
> > > > > > > > > On Thu, May 18, 2023 at 02:51:57PM +0800, Jason Wang wrote:
> > > > > > > > > > On Wed, May 17, 2023 at 10:23 AM Xuan Zhuo 
> > > > > > > > > >  wrote:
> > > > > > > > > > >
> > > > > > > > > > > virtqueue_add_split() only supports virtual addresses;
> > > > > > > > > > > the DMA mapping is done inside virtqueue_add_split().
> > > > > > > > > > >
> > > > > > > > > > > In some scenarios (such as the AF_XDP scenario), the
> > > > > > > > > > > memory is allocated and DMA-mapped in advance, so it is
> > > > > > > > > > > necessary for us to support passing the DMA address to
> > > > > > > > > > > virtqueue_add_split().
> > > > > > > > > > >
> > > > > > > > > > > Record this information in desc_state so that we can skip
> > > > > > > > > > > the unmap based on it when executing DMA unmap.
> > > > > > > > > >
> > > > > > > > > > I would also suggest documenting why a per descriptor 
> > > > > > > > > > metadata is
> > > > > > > > > > needed instead of a per virtqueue one.
> > > > > > > > >
> > > > > > > > > I think we could make it per virtqueue. That would mean all 
> > > > > > > > > code in
> > > > > > > > > virtio net would have to change to do dma mapping itself 
> > > > > > > > > instead of
> > > > > > > > > relying on virtio core though.  Which is maybe a good idea? 
> > > > > > > > > Definitely a
> > > > > > > > > very intrusive change though, will need a lot of performance 
> > > > > > > > > testing
> > > > > > > > > to make sure we don't break anything.
> > > > > > > >
> > > > > > > > In fact, we have tried this idea.
> > > > > > > >
> > > > > > > > The problem is the detach and unmap.
> > > > > > > >
> > > > > > > > We need to get all DMA addresses from the virtio ring to unmap.
> > > > > > > > Currently, it does not support returning the DMA address, and
> > > > > > > > for SKB we need to get multiple DMA addresses at one time.
> > > > > > > >
> > > > > > > > This needs changes to the virtio-ring detach logic. Besides
> > > > > > > > this, I also agree with this idea.
> > > > > > > >
> > > > > > > > Thanks.
> > > > > > >
> > > > > > > Well you can have a version of get_buf that returns them ... but
> > > > > > > it is not clear to me all this is worth it unless you want
> > > > > > > to do unsafe tricks like leaving them mapped.
> > > > > >
> > > > > > Some high speed NIC drivers use this trick for better performance.
> > > > >
> > > > >
> > > > > Interesting, this is the first time I've heard of this. Is there any problem?
> &

Re: [PATCH V2 1/5] vDPA/ifcvf: virt queue ops take immediate actions

2023-05-23 Thread Jason Wang
On Mon, May 8, 2023 at 6:05 PM Zhu Lingshan  wrote:
>
> In this commit, virtqueue operations including:
> set_vq_num(), set_vq_address(), set_vq_ready()
> and get_vq_ready() access PCI registers directly
> to take immediate actions.
>
> Signed-off-by: Zhu Lingshan 

Acked-by: Jason Wang 

Thanks

> ---
>  drivers/vdpa/ifcvf/ifcvf_base.c | 58 -
>  drivers/vdpa/ifcvf/ifcvf_base.h | 10 +++---
>  drivers/vdpa/ifcvf/ifcvf_main.c | 16 +++--
>  3 files changed, 45 insertions(+), 39 deletions(-)
>
> diff --git a/drivers/vdpa/ifcvf/ifcvf_base.c b/drivers/vdpa/ifcvf/ifcvf_base.c
> index 5563b3a773c7..6c5650f73007 100644
> --- a/drivers/vdpa/ifcvf/ifcvf_base.c
> +++ b/drivers/vdpa/ifcvf/ifcvf_base.c
> @@ -329,31 +329,49 @@ int ifcvf_set_vq_state(struct ifcvf_hw *hw, u16 qid, 
> u16 num)
> return 0;
>  }
>
> -static int ifcvf_hw_enable(struct ifcvf_hw *hw)
> +void ifcvf_set_vq_num(struct ifcvf_hw *hw, u16 qid, u32 num)
>  {
> -   struct virtio_pci_common_cfg __iomem *cfg;
> -   u32 i;
> +   struct virtio_pci_common_cfg __iomem *cfg = hw->common_cfg;
>
> -   cfg = hw->common_cfg;
> -   for (i = 0; i < hw->nr_vring; i++) {
> -   if (!hw->vring[i].ready)
> -   break;
> +   vp_iowrite16(qid, &cfg->queue_select);
> +   vp_iowrite16(num, &cfg->queue_size);
> +}
>
> -   vp_iowrite16(i, &cfg->queue_select);
> -   vp_iowrite64_twopart(hw->vring[i].desc, &cfg->queue_desc_lo,
> -&cfg->queue_desc_hi);
> -   vp_iowrite64_twopart(hw->vring[i].avail, &cfg->queue_avail_lo,
> - &cfg->queue_avail_hi);
> -   vp_iowrite64_twopart(hw->vring[i].used, &cfg->queue_used_lo,
> -&cfg->queue_used_hi);
> -   vp_iowrite16(hw->vring[i].size, &cfg->queue_size);
> -   ifcvf_set_vq_state(hw, i, hw->vring[i].last_avail_idx);
> -   vp_iowrite16(1, &cfg->queue_enable);
> -   }
> +int ifcvf_set_vq_address(struct ifcvf_hw *hw, u16 qid, u64 desc_area,
> +u64 driver_area, u64 device_area)
> +{
> +   struct virtio_pci_common_cfg __iomem *cfg = hw->common_cfg;
> +
> +   vp_iowrite16(qid, &cfg->queue_select);
> +   vp_iowrite64_twopart(desc_area, &cfg->queue_desc_lo,
> +&cfg->queue_desc_hi);
> +   vp_iowrite64_twopart(driver_area, &cfg->queue_avail_lo,
> +&cfg->queue_avail_hi);
> +   vp_iowrite64_twopart(device_area, &cfg->queue_used_lo,
> +&cfg->queue_used_hi);
>
> return 0;
>  }
>
> +bool ifcvf_get_vq_ready(struct ifcvf_hw *hw, u16 qid)
> +{
> +   struct virtio_pci_common_cfg __iomem *cfg = hw->common_cfg;
> +   u16 queue_enable;
> +
> +   vp_iowrite16(qid, &cfg->queue_select);
> +   queue_enable = vp_ioread16(&cfg->queue_enable);
> +
> +   return (bool)queue_enable;
> +}
> +
> +void ifcvf_set_vq_ready(struct ifcvf_hw *hw, u16 qid, bool ready)
> +{
> +   struct virtio_pci_common_cfg __iomem *cfg = hw->common_cfg;
> +
> +   vp_iowrite16(qid, &cfg->queue_select);
> +   vp_iowrite16(ready, &cfg->queue_enable);
> +}
> +
>  static void ifcvf_hw_disable(struct ifcvf_hw *hw)
>  {
> u32 i;
> @@ -366,16 +384,12 @@ static void ifcvf_hw_disable(struct ifcvf_hw *hw)
>
>  int ifcvf_start_hw(struct ifcvf_hw *hw)
>  {
> -   ifcvf_reset(hw);
> ifcvf_add_status(hw, VIRTIO_CONFIG_S_ACKNOWLEDGE);
> ifcvf_add_status(hw, VIRTIO_CONFIG_S_DRIVER);
>
> if (ifcvf_config_features(hw) < 0)
> return -EINVAL;
>
> -   if (ifcvf_hw_enable(hw) < 0)
> -   return -EINVAL;
> -
> ifcvf_add_status(hw, VIRTIO_CONFIG_S_DRIVER_OK);
>
> return 0;
> diff --git a/drivers/vdpa/ifcvf/ifcvf_base.h b/drivers/vdpa/ifcvf/ifcvf_base.h
> index c20d1c40214e..d545a9411143 100644
> --- a/drivers/vdpa/ifcvf/ifcvf_base.h
> +++ b/drivers/vdpa/ifcvf/ifcvf_base.h
> @@ -47,12 +47,7 @@
>  #define MSIX_VECTOR_DEV_SHARED 3
>
>  struct vring_info {
> -   u64 desc;
> -   u64 avail;
> -   u64 used;
> -   u16 size;
> u16 last_avail_idx;
> -   bool ready;
> void __iomem *notify_addr;
> phys_addr_t notify_pa;
> u32 irq;
> @@ -137,4 +132,9 @@ int ifcvf_probed_virtio_

Re: [PATCH V2 2/5] vDPA/ifcvf: get_driver_features from virtio registers

2023-05-23 Thread Jason Wang
On Mon, May 8, 2023 at 6:05 PM Zhu Lingshan  wrote:
>
> This commit implements a new function ifcvf_get_driver_feature()
> which reads driver_features from the virtio registers.
>
> To be less ambiguous, ifcvf_set_features() is renamed to
> ifcvf_set_driver_features(), and ifcvf_get_features()
> is renamed to ifcvf_get_dev_features() which returns
> the provisioned vDPA device features.
>
> Signed-off-by: Zhu Lingshan 

Acked-by: Jason Wang 

Thanks

> ---
>  drivers/vdpa/ifcvf/ifcvf_base.c | 38 +
>  drivers/vdpa/ifcvf/ifcvf_base.h |  5 +++--
>  drivers/vdpa/ifcvf/ifcvf_main.c |  9 +---
>  3 files changed, 29 insertions(+), 23 deletions(-)
>
> diff --git a/drivers/vdpa/ifcvf/ifcvf_base.c b/drivers/vdpa/ifcvf/ifcvf_base.c
> index 6c5650f73007..546e923bcd16 100644
> --- a/drivers/vdpa/ifcvf/ifcvf_base.c
> +++ b/drivers/vdpa/ifcvf/ifcvf_base.c
> @@ -204,11 +204,29 @@ u64 ifcvf_get_hw_features(struct ifcvf_hw *hw)
> return features;
>  }
>
> -u64 ifcvf_get_features(struct ifcvf_hw *hw)
> +/* return provisioned vDPA dev features */
> +u64 ifcvf_get_dev_features(struct ifcvf_hw *hw)
>  {
> return hw->dev_features;
>  }
>
> +u64 ifcvf_get_driver_features(struct ifcvf_hw *hw)
> +{
> +   struct virtio_pci_common_cfg __iomem *cfg = hw->common_cfg;
> +   u32 features_lo, features_hi;
> +   u64 features;
> +
> +   vp_iowrite32(0, &cfg->device_feature_select);
> +   features_lo = vp_ioread32(&cfg->guest_feature);
> +
> +   vp_iowrite32(1, &cfg->device_feature_select);
> +   features_hi = vp_ioread32(&cfg->guest_feature);
> +
> +   features = ((u64)features_hi << 32) | features_lo;
> +
> +   return features;
> +}
> +
>  int ifcvf_verify_min_features(struct ifcvf_hw *hw, u64 features)
>  {
> if (!(features & BIT_ULL(VIRTIO_F_ACCESS_PLATFORM)) && features) {
> @@ -275,7 +293,7 @@ void ifcvf_write_dev_config(struct ifcvf_hw *hw, u64 
> offset,
> vp_iowrite8(*p++, hw->dev_cfg + offset + i);
>  }
>
> -static void ifcvf_set_features(struct ifcvf_hw *hw, u64 features)
> +void ifcvf_set_driver_features(struct ifcvf_hw *hw, u64 features)
>  {
> struct virtio_pci_common_cfg __iomem *cfg = hw->common_cfg;
>
> @@ -286,19 +304,6 @@ static void ifcvf_set_features(struct ifcvf_hw *hw, u64 
> features)
> vp_iowrite32(features >> 32, &cfg->guest_feature);
>  }
>
> -static int ifcvf_config_features(struct ifcvf_hw *hw)
> -{
> -   ifcvf_set_features(hw, hw->req_features);
> -   ifcvf_add_status(hw, VIRTIO_CONFIG_S_FEATURES_OK);
> -
> -   if (!(ifcvf_get_status(hw) & VIRTIO_CONFIG_S_FEATURES_OK)) {
> -   IFCVF_ERR(hw->pdev, "Failed to set FEATURES_OK status\n");
> -   return -EIO;
> -   }
> -
> -   return 0;
> -}
> -
>  u16 ifcvf_get_vq_state(struct ifcvf_hw *hw, u16 qid)
>  {
> struct ifcvf_lm_cfg __iomem *ifcvf_lm;
> @@ -387,9 +392,6 @@ int ifcvf_start_hw(struct ifcvf_hw *hw)
> ifcvf_add_status(hw, VIRTIO_CONFIG_S_ACKNOWLEDGE);
> ifcvf_add_status(hw, VIRTIO_CONFIG_S_DRIVER);
>
> -   if (ifcvf_config_features(hw) < 0)
> -   return -EINVAL;
> -
> ifcvf_add_status(hw, VIRTIO_CONFIG_S_DRIVER_OK);
>
> return 0;
> diff --git a/drivers/vdpa/ifcvf/ifcvf_base.h b/drivers/vdpa/ifcvf/ifcvf_base.h
> index d545a9411143..cb19196c3ece 100644
> --- a/drivers/vdpa/ifcvf/ifcvf_base.h
> +++ b/drivers/vdpa/ifcvf/ifcvf_base.h
> @@ -69,7 +69,6 @@ struct ifcvf_hw {
> phys_addr_t notify_base_pa;
> u32 notify_off_multiplier;
> u32 dev_type;
> -   u64 req_features;
> u64 hw_features;
> /* provisioned device features */
> u64 dev_features;
> @@ -122,7 +121,7 @@ u8 ifcvf_get_status(struct ifcvf_hw *hw);
>  void ifcvf_set_status(struct ifcvf_hw *hw, u8 status);
>  void io_write64_twopart(u64 val, u32 *lo, u32 *hi);
>  void ifcvf_reset(struct ifcvf_hw *hw);
> -u64 ifcvf_get_features(struct ifcvf_hw *hw);
> +u64 ifcvf_get_dev_features(struct ifcvf_hw *hw);
>  u64 ifcvf_get_hw_features(struct ifcvf_hw *hw);
>  int ifcvf_verify_min_features(struct ifcvf_hw *hw, u64 features);
>  u16 ifcvf_get_vq_state(struct ifcvf_hw *hw, u16 qid);
> @@ -137,4 +136,6 @@ int ifcvf_set_vq_address(struct ifcvf_hw *hw, u16 qid, 
> u64 desc_area,
>  u64 driver_area, u64 device_area);
>  bool ifcvf_get_vq_ready(struct ifcvf_hw *hw, u16 qid);
>  void ifcvf_set_vq_ready(struct ifcvf_hw *hw, u16 qid, bool ready);
> +void ifcvf_set_driver_features(struct ifc

Re: [PATCH V2 4/5] vDPA/ifcvf: synchronize irqs in the reset routine

2023-05-24 Thread Jason Wang
On Mon, May 8, 2023 at 6:05 PM Zhu Lingshan  wrote:
>
> This commit synchronizes the irqs of the virtqueues
> and the config space in the reset routine.
> Thus ifcvf_stop_hw() and reset() are refactored as well.
>
> Signed-off-by: Zhu Lingshan 
> ---
>  drivers/vdpa/ifcvf/ifcvf_base.c | 41 +
>  drivers/vdpa/ifcvf/ifcvf_base.h |  1 +
>  drivers/vdpa/ifcvf/ifcvf_main.c | 46 +
>  3 files changed, 38 insertions(+), 50 deletions(-)
>
> diff --git a/drivers/vdpa/ifcvf/ifcvf_base.c b/drivers/vdpa/ifcvf/ifcvf_base.c
> index 79e313c5e10e..1f39290baa38 100644
> --- a/drivers/vdpa/ifcvf/ifcvf_base.c
> +++ b/drivers/vdpa/ifcvf/ifcvf_base.c
> @@ -170,12 +170,9 @@ void ifcvf_set_status(struct ifcvf_hw *hw, u8 status)
>
>  void ifcvf_reset(struct ifcvf_hw *hw)
>  {
> -   hw->config_cb.callback = NULL;
> -   hw->config_cb.private = NULL;
> -
> ifcvf_set_status(hw, 0);
> -   /* flush set_status, make sure VF is stopped, reset */
> -   ifcvf_get_status(hw);
> +   while (ifcvf_get_status(hw))
> +   msleep(1);
>  }
>
>  u64 ifcvf_get_hw_features(struct ifcvf_hw *hw)
> @@ -368,20 +365,42 @@ void ifcvf_set_vq_ready(struct ifcvf_hw *hw, u16 qid, 
> bool ready)
> vp_iowrite16(ready, &cfg->queue_enable);
>  }
>
> -static void ifcvf_hw_disable(struct ifcvf_hw *hw)
> +static void ifcvf_reset_vring(struct ifcvf_hw *hw)
>  {
> -   u32 i;
> +   u16 qid;
> +
> +   for (qid = 0; qid < hw->nr_vring; qid++) {
> +   hw->vring[qid].cb.callback = NULL;
> +   hw->vring[qid].cb.private = NULL;
> +   ifcvf_set_vq_vector(hw, qid, VIRTIO_MSI_NO_VECTOR);
> +   }
> +}
>
> +static void ifcvf_reset_config_handler(struct ifcvf_hw *hw)
> +{
> +   hw->config_cb.callback = NULL;
> +   hw->config_cb.private = NULL;
> ifcvf_set_config_vector(hw, VIRTIO_MSI_NO_VECTOR);
> -   for (i = 0; i < hw->nr_vring; i++) {
> -   ifcvf_set_vq_vector(hw, i, VIRTIO_MSI_NO_VECTOR);
> +}
> +
> +static void ifcvf_synchronize_irq(struct ifcvf_hw *hw)
> +{
> +   u32 nvectors = hw->num_msix_vectors;
> +   struct pci_dev *pdev = hw->pdev;
> +   int i, irq;
> +
> +   for (i = 0; i < nvectors; i++) {
> +   irq = pci_irq_vector(pdev, i);
> +   if (irq >= 0)
> +   synchronize_irq(irq);
> }
>  }
>
>  void ifcvf_stop_hw(struct ifcvf_hw *hw)
>  {
> -   ifcvf_hw_disable(hw);
> -   ifcvf_reset(hw);
> +   ifcvf_synchronize_irq(hw);
> +   ifcvf_reset_vring(hw);
> +   ifcvf_reset_config_handler(hw);

Nit:

So the name of this function is kind of misleading, since irq
synchronization and the virtqueue/config handlers do not belong to
the hardware?

Maybe it would be better to call it ifcvf_stop().

Thanks

>  }
>
>  void ifcvf_notify_queue(struct ifcvf_hw *hw, u16 qid)
> diff --git a/drivers/vdpa/ifcvf/ifcvf_base.h b/drivers/vdpa/ifcvf/ifcvf_base.h
> index d34d3bc0dbf4..7430f80779be 100644
> --- a/drivers/vdpa/ifcvf/ifcvf_base.h
> +++ b/drivers/vdpa/ifcvf/ifcvf_base.h
> @@ -82,6 +82,7 @@ struct ifcvf_hw {
> int vqs_reused_irq;
> u16 nr_vring;
> /* VIRTIO_PCI_CAP_DEVICE_CFG size */
> +   u32 num_msix_vectors;
> u32 cap_dev_config_size;
> struct pci_dev *pdev;
>  };
> diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c
> index 968687159e44..3401b9901dd2 100644
> --- a/drivers/vdpa/ifcvf/ifcvf_main.c
> +++ b/drivers/vdpa/ifcvf/ifcvf_main.c
> @@ -125,6 +125,7 @@ static void ifcvf_free_irq(struct ifcvf_hw *vf)
> ifcvf_free_vq_irq(vf);
> ifcvf_free_config_irq(vf);
> ifcvf_free_irq_vectors(pdev);
> +   vf->num_msix_vectors = 0;
>  }
>
>  /* ifcvf MSIX vectors allocator, this helper tries to allocate
> @@ -343,36 +344,11 @@ static int ifcvf_request_irq(struct ifcvf_hw *vf)
> if (ret)
> return ret;
>
> -   return 0;
> -}
> -
> -static int ifcvf_stop_datapath(struct ifcvf_adapter *adapter)
> -{
> -   struct ifcvf_hw *vf = adapter->vf;
> -   int i;
> -
> -   for (i = 0; i < vf->nr_vring; i++)
> -   vf->vring[i].cb.callback = NULL;
> -
> -   ifcvf_stop_hw(vf);
> +   vf->num_msix_vectors = nvectors;
>
> return 0;
>  }
>
> -static void ifcvf_reset_vring(struct ifcvf_adapter *adapter)
> -{
> -   struct ifcvf_hw *vf = adapter->vf;
> -   int i;
> -
> -   for (i = 0; i < vf->nr_vring; i++) {
> -   vf->vring[i].last_avail_idx = 0;
> -   vf->vring[i].cb.callback = NULL;
> -   vf->vring[i].cb.private = NULL;
> -   }
> -
> -   ifcvf_reset(vf);
> -}
> -
>  static struct ifcvf_adapter *vdpa_to_adapter(struct vdpa_device *vdpa_dev)
>  {
> return container_of(vdpa_dev, struct ifcvf_adapter, vdpa);
> @@ -462,23 +438,15 @@ static void ifcvf_vdpa_set_status(struct vdpa_device 
> *vdpa_dev, u8 status)
>
>  static int ifcvf_vdpa_reset(stru

[PATCH V3 net-next 0/2] virtio-net: don't busy poll for cvq command

2023-05-24 Thread Jason Wang
Hi all:

The code busy polls for cvq commands, which turns out to have
several side effects:

1) infinite poll for buggy devices
2) bad interaction with scheduler

So this series tries to use cond_resched() in the waiting loop. Before
doing this we first need to make sure the cvq command is not executed in
an atomic environment, so we first convert rx mode handling to a
workqueue.

Please review.

Thanks

Changes since V2:

- Don't use interrupt but cond_resched()

Changes since V1:

- use RTNL to synchronize rx mode worker
- use completion for simplicity
- don't try to harden CVQ command

Changes since RFC:

- switch to use BAD_RING in virtio_break_device()
- check virtqueue_is_broken() after being woken up
- use more_used() instead of virtqueue_get_buf() to allow caller to
  get buffers afterwards
  - break the virtio-net device when timeout
  - get buffer manually since the virtio core check more_used() instead

Jason Wang (2):
  virtio-net: convert rx mode setting to use workqueue
  virtio-net: add cond_resched() to the command waiting loop

 drivers/net/virtio_net.c | 59 +---
 1 file changed, 55 insertions(+), 4 deletions(-)

-- 
2.25.1



[PATCH V3 net-next 1/2] virtio-net: convert rx mode setting to use workqueue

2023-05-24 Thread Jason Wang
This patch converts rx mode setting to be done in a workqueue; this is
a must to allow sleeping while waiting for the cvq command to
respond, since the current code is executed under the addr spin lock.

Signed-off-by: Jason Wang 
---
Changes since V1:
- use RTNL to synchronize rx mode worker
---
 drivers/net/virtio_net.c | 55 +---
 1 file changed, 52 insertions(+), 3 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 56ca1d270304..5d2f1da4eaa0 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -265,6 +265,12 @@ struct virtnet_info {
/* Work struct for config space updates */
struct work_struct config_work;
 
+   /* Work struct for config rx mode */
+   struct work_struct rx_mode_work;
+
+   /* Is rx mode work enabled? */
+   bool rx_mode_work_enabled;
+
/* Does the affinity hint is set for virtqueues? */
bool affinity_hint_set;
 
@@ -388,6 +394,20 @@ static void disable_delayed_refill(struct virtnet_info *vi)
spin_unlock_bh(&vi->refill_lock);
 }
 
+static void enable_rx_mode_work(struct virtnet_info *vi)
+{
+   rtnl_lock();
+   vi->rx_mode_work_enabled = true;
+   rtnl_unlock();
+}
+
+static void disable_rx_mode_work(struct virtnet_info *vi)
+{
+   rtnl_lock();
+   vi->rx_mode_work_enabled = false;
+   rtnl_unlock();
+}
+
 static void virtqueue_napi_schedule(struct napi_struct *napi,
struct virtqueue *vq)
 {
@@ -2341,9 +2361,11 @@ static int virtnet_close(struct net_device *dev)
return 0;
 }
 
-static void virtnet_set_rx_mode(struct net_device *dev)
+static void virtnet_rx_mode_work(struct work_struct *work)
 {
-   struct virtnet_info *vi = netdev_priv(dev);
+   struct virtnet_info *vi =
+   container_of(work, struct virtnet_info, rx_mode_work);
+   struct net_device *dev = vi->dev;
struct scatterlist sg[2];
struct virtio_net_ctrl_mac *mac_data;
struct netdev_hw_addr *ha;
@@ -2356,6 +2378,8 @@ static void virtnet_set_rx_mode(struct net_device *dev)
if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_RX))
return;
 
+   rtnl_lock();
+
vi->ctrl->promisc = ((dev->flags & IFF_PROMISC) != 0);
vi->ctrl->allmulti = ((dev->flags & IFF_ALLMULTI) != 0);
 
@@ -2373,14 +2397,19 @@ static void virtnet_set_rx_mode(struct net_device *dev)
dev_warn(&dev->dev, "Failed to %sable allmulti mode.\n",
 vi->ctrl->allmulti ? "en" : "dis");
 
+   netif_addr_lock_bh(dev);
+
uc_count = netdev_uc_count(dev);
mc_count = netdev_mc_count(dev);
/* MAC filter - use one buffer for both lists */
buf = kzalloc(((uc_count + mc_count) * ETH_ALEN) +
  (2 * sizeof(mac_data->entries)), GFP_ATOMIC);
mac_data = buf;
-   if (!buf)
+   if (!buf) {
+   netif_addr_unlock_bh(dev);
+   rtnl_unlock();
return;
+   }
 
sg_init_table(sg, 2);
 
@@ -2401,6 +2430,8 @@ static void virtnet_set_rx_mode(struct net_device *dev)
netdev_for_each_mc_addr(ha, dev)
memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN);
 
+   netif_addr_unlock_bh(dev);
+
sg_set_buf(&sg[1], mac_data,
   sizeof(mac_data->entries) + (mc_count * ETH_ALEN));
 
@@ -2408,9 +2439,19 @@ static void virtnet_set_rx_mode(struct net_device *dev)
  VIRTIO_NET_CTRL_MAC_TABLE_SET, sg))
dev_warn(&dev->dev, "Failed to set MAC filter table.\n");
 
+   rtnl_unlock();
+
kfree(buf);
 }
 
+static void virtnet_set_rx_mode(struct net_device *dev)
+{
+   struct virtnet_info *vi = netdev_priv(dev);
+
+   if (vi->rx_mode_work_enabled)
+   schedule_work(&vi->rx_mode_work);
+}
+
 static int virtnet_vlan_rx_add_vid(struct net_device *dev,
   __be16 proto, u16 vid)
 {
@@ -3181,6 +3222,8 @@ static void virtnet_freeze_down(struct virtio_device 
*vdev)
 
/* Make sure no work handler is accessing the device */
flush_work(&vi->config_work);
+   disable_rx_mode_work(vi);
+   flush_work(&vi->rx_mode_work);
 
netif_tx_lock_bh(vi->dev);
netif_device_detach(vi->dev);
@@ -3203,6 +3246,7 @@ static int virtnet_restore_up(struct virtio_device *vdev)
virtio_device_ready(vdev);
 
enable_delayed_refill(vi);
+   enable_rx_mode_work(vi);
 
if (netif_running(vi->dev)) {
err = virtnet_open(vi->dev);
@@ -4002,6 +4046,7 @@ static int virtnet_probe(struct virtio_device *vdev)
vdev->priv = vi;
 
INIT_WORK(&vi->config_work, virtnet_config_changed_work);
+  

[PATCH V3 net-next 2/2] virtio-net: add cond_resched() to the command waiting loop

2023-05-24 Thread Jason Wang
Add cond_resched() to the command waiting loop for better cooperation
with the scheduler. This gives the CPU a chance to run other tasks
(e.g. a workqueue) instead of busy looping when preemption is not
allowed and the device's CVQ might be slow.

Signed-off-by: Jason Wang 
---
 drivers/net/virtio_net.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 5d2f1da4eaa0..de498dbbf0d4 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -2207,8 +2207,10 @@ static bool virtnet_send_command(struct virtnet_info 
*vi, u8 class, u8 cmd,
 * into the hypervisor, so the request should be handled immediately.
 */
while (!virtqueue_get_buf(vi->cvq, &tmp) &&
-  !virtqueue_is_broken(vi->cvq))
+  !virtqueue_is_broken(vi->cvq)) {
+   cond_resched();
cpu_relax();
+   }
 
return vi->ctrl->status == VIRTIO_NET_OK;
 }
-- 
2.25.1



Re: [PATCH V2 4/5] vDPA/ifcvf: synchronize irqs in the reset routine

2023-05-24 Thread Jason Wang
On Wed, May 24, 2023 at 4:03 PM Jason Wang  wrote:
>
> On Mon, May 8, 2023 at 6:05 PM Zhu Lingshan  wrote:
> >
> > This commit synchronizes the irqs of the virtqueues
> > and the config space in the reset routine.
> > Thus ifcvf_stop_hw() and reset() are refactored as well.
> >
> > Signed-off-by: Zhu Lingshan 
> > ---
> >  drivers/vdpa/ifcvf/ifcvf_base.c | 41 +
> >  drivers/vdpa/ifcvf/ifcvf_base.h |  1 +
> >  drivers/vdpa/ifcvf/ifcvf_main.c | 46 +
> >  3 files changed, 38 insertions(+), 50 deletions(-)
> >
> > diff --git a/drivers/vdpa/ifcvf/ifcvf_base.c 
> > b/drivers/vdpa/ifcvf/ifcvf_base.c
> > index 79e313c5e10e..1f39290baa38 100644
> > --- a/drivers/vdpa/ifcvf/ifcvf_base.c
> > +++ b/drivers/vdpa/ifcvf/ifcvf_base.c
> > @@ -170,12 +170,9 @@ void ifcvf_set_status(struct ifcvf_hw *hw, u8 status)
> >
> >  void ifcvf_reset(struct ifcvf_hw *hw)
> >  {
> > -   hw->config_cb.callback = NULL;
> > -   hw->config_cb.private = NULL;
> > -
> > ifcvf_set_status(hw, 0);
> > -   /* flush set_status, make sure VF is stopped, reset */
> > -   ifcvf_get_status(hw);
> > +   while (ifcvf_get_status(hw))
> > +   msleep(1);
> >  }
> >
> >  u64 ifcvf_get_hw_features(struct ifcvf_hw *hw)
> > @@ -368,20 +365,42 @@ void ifcvf_set_vq_ready(struct ifcvf_hw *hw, u16 qid, 
> > bool ready)
> > vp_iowrite16(ready, &cfg->queue_enable);
> >  }
> >
> > -static void ifcvf_hw_disable(struct ifcvf_hw *hw)
> > +static void ifcvf_reset_vring(struct ifcvf_hw *hw)
> >  {
> > -   u32 i;
> > +   u16 qid;
> > +
> > +   for (qid = 0; qid < hw->nr_vring; qid++) {
> > +   hw->vring[qid].cb.callback = NULL;
> > +   hw->vring[qid].cb.private = NULL;
> > +   ifcvf_set_vq_vector(hw, qid, VIRTIO_MSI_NO_VECTOR);
> > +   }
> > +}
> >
> > +static void ifcvf_reset_config_handler(struct ifcvf_hw *hw)
> > +{
> > +   hw->config_cb.callback = NULL;
> > +   hw->config_cb.private = NULL;
> > ifcvf_set_config_vector(hw, VIRTIO_MSI_NO_VECTOR);
> > -   for (i = 0; i < hw->nr_vring; i++) {
> > -   ifcvf_set_vq_vector(hw, i, VIRTIO_MSI_NO_VECTOR);
> > +}
> > +
> > +static void ifcvf_synchronize_irq(struct ifcvf_hw *hw)
> > +{
> > +   u32 nvectors = hw->num_msix_vectors;
> > +   struct pci_dev *pdev = hw->pdev;
> > +   int i, irq;
> > +
> > +   for (i = 0; i < nvectors; i++) {
> > +   irq = pci_irq_vector(pdev, i);
> > +   if (irq >= 0)
> > +   synchronize_irq(irq);
> > }
> >  }
> >
> >  void ifcvf_stop_hw(struct ifcvf_hw *hw)
> >  {
> > -   ifcvf_hw_disable(hw);
> > -   ifcvf_reset(hw);
> > +   ifcvf_synchronize_irq(hw);
> > +   ifcvf_reset_vring(hw);
> > +   ifcvf_reset_config_handler(hw);
>
> Nit:
>
> So the name of this function is kind of misleading, since irq
> synchronization and the virtqueue/config handlers do not belong to
> the hardware?
>
> Maybe it would be better to call it ifcvf_stop().

I think we can tweak this on top. So

Acked-by: Jason Wang 

Thanks

>
> Thanks
>
> >  }
> >
> >  void ifcvf_notify_queue(struct ifcvf_hw *hw, u16 qid)
> > diff --git a/drivers/vdpa/ifcvf/ifcvf_base.h 
> > b/drivers/vdpa/ifcvf/ifcvf_base.h
> > index d34d3bc0dbf4..7430f80779be 100644
> > --- a/drivers/vdpa/ifcvf/ifcvf_base.h
> > +++ b/drivers/vdpa/ifcvf/ifcvf_base.h
> > @@ -82,6 +82,7 @@ struct ifcvf_hw {
> > int vqs_reused_irq;
> > u16 nr_vring;
> > /* VIRTIO_PCI_CAP_DEVICE_CFG size */
> > +   u32 num_msix_vectors;
> > u32 cap_dev_config_size;
> > struct pci_dev *pdev;
> >  };
> > diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c 
> > b/drivers/vdpa/ifcvf/ifcvf_main.c
> > index 968687159e44..3401b9901dd2 100644
> > --- a/drivers/vdpa/ifcvf/ifcvf_main.c
> > +++ b/drivers/vdpa/ifcvf/ifcvf_main.c
> > @@ -125,6 +125,7 @@ static void ifcvf_free_irq(struct ifcvf_hw *vf)
> > ifcvf_free_vq_irq(vf);
> > ifcvf_free_config_irq(vf);
> > ifcvf_free_irq_vectors(pdev);
> > +   vf->num_msix_vectors = 0;
> >  }
> >
> >  /* ifcvf MSIX vectors allocator, this helper tries to allocate
> > @@ -343,36 +344,11 @@ static 

Re: [PATCH V3 net-next 1/2] virtio-net: convert rx mode setting to use workqueue

2023-05-24 Thread Jason Wang
On Wed, May 24, 2023 at 5:15 PM Michael S. Tsirkin  wrote:
>
> On Wed, May 24, 2023 at 04:18:41PM +0800, Jason Wang wrote:
> > This patch converts rx mode setting to be done in a workqueue; this is
> > a must to allow sleeping while waiting for the cvq command to
> > respond, since the current code is executed under the addr spin lock.
> >
> > Signed-off-by: Jason Wang 
> > ---
> > Changes since V1:
> > - use RTNL to synchronize rx mode worker
> > ---
> >  drivers/net/virtio_net.c | 55 +---
> >  1 file changed, 52 insertions(+), 3 deletions(-)
> >
> > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > index 56ca1d270304..5d2f1da4eaa0 100644
> > --- a/drivers/net/virtio_net.c
> > +++ b/drivers/net/virtio_net.c
> > @@ -265,6 +265,12 @@ struct virtnet_info {
> >   /* Work struct for config space updates */
> >   struct work_struct config_work;
> >
> > + /* Work struct for config rx mode */
>
> With a bit less abbreviation maybe? setting rx mode?

That's fine.

>
> > + struct work_struct rx_mode_work;
> > +
> > + /* Is rx mode work enabled? */
>
> Ugh not a great comment.

Any suggestions for this? E.g. we had:

/* Is delayed refill enabled? */

>
> > + bool rx_mode_work_enabled;
> > +
>
>
>
> >   /* Does the affinity hint is set for virtqueues? */
> >   bool affinity_hint_set;
> >
> > @@ -388,6 +394,20 @@ static void disable_delayed_refill(struct virtnet_info 
> > *vi)
> >   spin_unlock_bh(&vi->refill_lock);
> >  }
> >
> > +static void enable_rx_mode_work(struct virtnet_info *vi)
> > +{
> > + rtnl_lock();
> > + vi->rx_mode_work_enabled = true;
> > + rtnl_unlock();
> > +}
> > +
> > +static void disable_rx_mode_work(struct virtnet_info *vi)
> > +{
> > + rtnl_lock();
> > + vi->rx_mode_work_enabled = false;
> > + rtnl_unlock();
> > +}
> > +
> >  static void virtqueue_napi_schedule(struct napi_struct *napi,
> >   struct virtqueue *vq)
> >  {
> > @@ -2341,9 +2361,11 @@ static int virtnet_close(struct net_device *dev)
> >   return 0;
> >  }
> >
> > -static void virtnet_set_rx_mode(struct net_device *dev)
> > +static void virtnet_rx_mode_work(struct work_struct *work)
> >  {
> > - struct virtnet_info *vi = netdev_priv(dev);
> > + struct virtnet_info *vi =
> > + container_of(work, struct virtnet_info, rx_mode_work);
> > + struct net_device *dev = vi->dev;
> >   struct scatterlist sg[2];
> >   struct virtio_net_ctrl_mac *mac_data;
> >   struct netdev_hw_addr *ha;
> > @@ -2356,6 +2378,8 @@ static void virtnet_set_rx_mode(struct net_device 
> > *dev)
> >   if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_RX))
> >   return;
> >
> > + rtnl_lock();
> > +
> >   vi->ctrl->promisc = ((dev->flags & IFF_PROMISC) != 0);
> >   vi->ctrl->allmulti = ((dev->flags & IFF_ALLMULTI) != 0);
> >
> > @@ -2373,14 +2397,19 @@ static void virtnet_set_rx_mode(struct net_device 
> > *dev)
> >   dev_warn(&dev->dev, "Failed to %sable allmulti mode.\n",
> >vi->ctrl->allmulti ? "en" : "dis");
> >
> > + netif_addr_lock_bh(dev);
> > +
> >   uc_count = netdev_uc_count(dev);
> >   mc_count = netdev_mc_count(dev);
> >   /* MAC filter - use one buffer for both lists */
> >   buf = kzalloc(((uc_count + mc_count) * ETH_ALEN) +
> > (2 * sizeof(mac_data->entries)), GFP_ATOMIC);
> >   mac_data = buf;
> > - if (!buf)
> > + if (!buf) {
> > + netif_addr_unlock_bh(dev);
> > + rtnl_unlock();
> >   return;
> > + }
> >
> >   sg_init_table(sg, 2);
> >
> > @@ -2401,6 +2430,8 @@ static void virtnet_set_rx_mode(struct net_device 
> > *dev)
> >   netdev_for_each_mc_addr(ha, dev)
> >   memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN);
> >
> > + netif_addr_unlock_bh(dev);
> > +
> >   sg_set_buf(&sg[1], mac_data,
> >  sizeof(mac_data->entries) + (mc_count * ETH_ALEN));
> >
> > @@ -2408,9 +2439,19 @@ static void virtnet_set_rx_mode(struct net_device 
> > *dev)
> > V

Re: [PATCH V3 net-next 1/2] virtio-net: convert rx mode setting to use workqueue

2023-05-25 Thread Jason Wang
On Thu, May 25, 2023 at 3:41 PM Michael S. Tsirkin  wrote:
>
> On Thu, May 25, 2023 at 11:43:34AM +0800, Jason Wang wrote:
> > On Wed, May 24, 2023 at 5:15 PM Michael S. Tsirkin  wrote:
> > >
> > > On Wed, May 24, 2023 at 04:18:41PM +0800, Jason Wang wrote:
> > > > This patch converts rx mode setting to be done in a workqueue; this is
> > > > a must to allow sleeping while waiting for the cvq command to
> > > > respond, since the current code is executed under the addr spin lock.
> > > >
> > > > Signed-off-by: Jason Wang 
> > > > ---
> > > > Changes since V1:
> > > > - use RTNL to synchronize rx mode worker
> > > > ---
> > > >  drivers/net/virtio_net.c | 55 +---
> > > >  1 file changed, 52 insertions(+), 3 deletions(-)
> > > >
> > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > > index 56ca1d270304..5d2f1da4eaa0 100644
> > > > --- a/drivers/net/virtio_net.c
> > > > +++ b/drivers/net/virtio_net.c
> > > > @@ -265,6 +265,12 @@ struct virtnet_info {
> > > >   /* Work struct for config space updates */
> > > >   struct work_struct config_work;
> > > >
> > > > + /* Work struct for config rx mode */
> > >
> > > With a bit less abbreviation maybe? setting rx mode?
> >
> > That's fine.
> >
> > >
> > > > + struct work_struct rx_mode_work;
> > > > +
> > > > + /* Is rx mode work enabled? */
> > >
> > > Ugh not a great comment.
> >
> > Any suggestions for this? E.g. we had:
> >
> > /* Is delayed refill enabled? */
>
> /* OK to queue work setting RX mode? */

Ok.

>
>
> > >
> > > > + bool rx_mode_work_enabled;
> > > > +
> > >
> > >
> > >
> > > >   /* Does the affinity hint is set for virtqueues? */
> > > >   bool affinity_hint_set;
> > > >
> > > > @@ -388,6 +394,20 @@ static void disable_delayed_refill(struct 
> > > > virtnet_info *vi)
> > > >   spin_unlock_bh(&vi->refill_lock);
> > > >  }
> > > >
> > > > +static void enable_rx_mode_work(struct virtnet_info *vi)
> > > > +{
> > > > + rtnl_lock();
> > > > + vi->rx_mode_work_enabled = true;
> > > > + rtnl_unlock();
> > > > +}
> > > > +
> > > > +static void disable_rx_mode_work(struct virtnet_info *vi)
> > > > +{
> > > > + rtnl_lock();
> > > > + vi->rx_mode_work_enabled = false;
> > > > + rtnl_unlock();
> > > > +}
> > > > +
> > > >  static void virtqueue_napi_schedule(struct napi_struct *napi,
> > > >   struct virtqueue *vq)
> > > >  {
> > > > @@ -2341,9 +2361,11 @@ static int virtnet_close(struct net_device *dev)
> > > >   return 0;
> > > >  }
> > > >
> > > > -static void virtnet_set_rx_mode(struct net_device *dev)
> > > > +static void virtnet_rx_mode_work(struct work_struct *work)
> > > >  {
> > > > - struct virtnet_info *vi = netdev_priv(dev);
> > > > + struct virtnet_info *vi =
> > > > + container_of(work, struct virtnet_info, rx_mode_work);
> > > > + struct net_device *dev = vi->dev;
> > > >   struct scatterlist sg[2];
> > > >   struct virtio_net_ctrl_mac *mac_data;
> > > >   struct netdev_hw_addr *ha;
> > > > @@ -2356,6 +2378,8 @@ static void virtnet_set_rx_mode(struct net_device 
> > > > *dev)
> > > >   if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_RX))
> > > >   return;
> > > >
> > > > + rtnl_lock();
> > > > +
> > > >   vi->ctrl->promisc = ((dev->flags & IFF_PROMISC) != 0);
> > > >   vi->ctrl->allmulti = ((dev->flags & IFF_ALLMULTI) != 0);
> > > >
> > > > @@ -2373,14 +2397,19 @@ static void virtnet_set_rx_mode(struct 
> > > > net_device *dev)
> > > >   dev_warn(&dev->dev, "Failed to %sable allmulti mode.\n",
> > > >vi->ctrl->allmulti ? "en" : "dis");
> > > >
> > > > + netif_addr_lock_b

Re: [PATCH V2 4/5] vDPA/ifcvf: synchronize irqs in the reset routine

2023-05-25 Thread Jason Wang
On Thu, May 25, 2023 at 5:38 PM Zhu, Lingshan  wrote:
>
>
>
> On 5/24/2023 4:03 PM, Jason Wang wrote:
> > On Mon, May 8, 2023 at 6:05 PM Zhu Lingshan  wrote:
> >> This commit synchronizes the irqs of the virtqueues
> >> and the config space in the reset routine.
> >> Thus ifcvf_stop_hw() and reset() are refactored as well.
> >>
> >> Signed-off-by: Zhu Lingshan 
> >> ---
> >>   drivers/vdpa/ifcvf/ifcvf_base.c | 41 +
> >>   drivers/vdpa/ifcvf/ifcvf_base.h |  1 +
> >>   drivers/vdpa/ifcvf/ifcvf_main.c | 46 +
> >>   3 files changed, 38 insertions(+), 50 deletions(-)
> >>
> >> diff --git a/drivers/vdpa/ifcvf/ifcvf_base.c 
> >> b/drivers/vdpa/ifcvf/ifcvf_base.c
> >> index 79e313c5e10e..1f39290baa38 100644
> >> --- a/drivers/vdpa/ifcvf/ifcvf_base.c
> >> +++ b/drivers/vdpa/ifcvf/ifcvf_base.c
> >> @@ -170,12 +170,9 @@ void ifcvf_set_status(struct ifcvf_hw *hw, u8 status)
> >>
> >>   void ifcvf_reset(struct ifcvf_hw *hw)
> >>   {
> >> -   hw->config_cb.callback = NULL;
> >> -   hw->config_cb.private = NULL;
> >> -
> >>  ifcvf_set_status(hw, 0);
> >> -   /* flush set_status, make sure VF is stopped, reset */
> >> -   ifcvf_get_status(hw);
> >> +   while (ifcvf_get_status(hw))
> >> +   msleep(1);
> >>   }
> >>
> >>   u64 ifcvf_get_hw_features(struct ifcvf_hw *hw)
> >> @@ -368,20 +365,42 @@ void ifcvf_set_vq_ready(struct ifcvf_hw *hw, u16 
> >> qid, bool ready)
> >>  vp_iowrite16(ready, &cfg->queue_enable);
> >>   }
> >>
> >> -static void ifcvf_hw_disable(struct ifcvf_hw *hw)
> >> +static void ifcvf_reset_vring(struct ifcvf_hw *hw)
> >>   {
> >> -   u32 i;
> >> +   u16 qid;
> >> +
> >> +   for (qid = 0; qid < hw->nr_vring; qid++) {
> >> +   hw->vring[qid].cb.callback = NULL;
> >> +   hw->vring[qid].cb.private = NULL;
> >> +   ifcvf_set_vq_vector(hw, qid, VIRTIO_MSI_NO_VECTOR);
> >> +   }
> >> +}
> >>
> >> +static void ifcvf_reset_config_handler(struct ifcvf_hw *hw)
> >> +{
> >> +   hw->config_cb.callback = NULL;
> >> +   hw->config_cb.private = NULL;
> >>  ifcvf_set_config_vector(hw, VIRTIO_MSI_NO_VECTOR);
> >> -   for (i = 0; i < hw->nr_vring; i++) {
> >> -   ifcvf_set_vq_vector(hw, i, VIRTIO_MSI_NO_VECTOR);
> >> +}
> >> +
> >> +static void ifcvf_synchronize_irq(struct ifcvf_hw *hw)
> >> +{
> >> +   u32 nvectors = hw->num_msix_vectors;
> >> +   struct pci_dev *pdev = hw->pdev;
> >> +   int i, irq;
> >> +
> >> +   for (i = 0; i < nvectors; i++) {
> >> +   irq = pci_irq_vector(pdev, i);
> >> +   if (irq >= 0)
> >> +   synchronize_irq(irq);
> >>  }
> >>   }
> >>
> >>   void ifcvf_stop_hw(struct ifcvf_hw *hw)
> >>   {
> >> -   ifcvf_hw_disable(hw);
> >> -   ifcvf_reset(hw);
> >> +   ifcvf_synchronize_irq(hw);
> >> +   ifcvf_reset_vring(hw);
> >> +   ifcvf_reset_config_handler(hw);
> > Nit:
> >
> > So the name of this function is kind of misleading, since irq
> > synchronization and the virtqueue/config handlers do not belong to
> > the hardware?
> >
> > Maybe it would be better to call it ifcvf_stop().
> Sure, I will send a V3 with this renaming,
> do you ack patch 1/5?

Yes, I think I've acked that patch.

Thanks

>
> Thanks
> Zhu Lingshan
> >
> > Thanks
> >
> >>   }
> >>
> >>   void ifcvf_notify_queue(struct ifcvf_hw *hw, u16 qid)
> >> diff --git a/drivers/vdpa/ifcvf/ifcvf_base.h 
> >> b/drivers/vdpa/ifcvf/ifcvf_base.h
> >> index d34d3bc0dbf4..7430f80779be 100644
> >> --- a/drivers/vdpa/ifcvf/ifcvf_base.h
> >> +++ b/drivers/vdpa/ifcvf/ifcvf_base.h
> >> @@ -82,6 +82,7 @@ struct ifcvf_hw {
> >>  int vqs_reused_irq;
> >>  u16 nr_vring;
> >>  /* VIRTIO_PCI_CAP_DEVICE_CFG size */
> >> +   u32 num_msix_vectors;
> >>  u32 cap_dev_config_size;
> >>  struct pci_dev *pdev;
> >>   };
> >> diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c 

Re: [PATCH V2 4/5] vDPA/ifcvf: synchronize irqs in the reset routine

2023-05-25 Thread Jason Wang
On Fri, May 26, 2023 at 1:30 PM Zhu, Lingshan  wrote:
>
>
>
> On 5/26/2023 11:36 AM, Zhu, Lingshan wrote:
> >
> >
> > On 5/26/2023 9:34 AM, Jason Wang wrote:
> >> On Thu, May 25, 2023 at 5:38 PM Zhu, Lingshan
> >>  wrote:
> >>>
> >>>
> >>> On 5/24/2023 4:03 PM, Jason Wang wrote:
> >>>> On Mon, May 8, 2023 at 6:05 PM Zhu Lingshan
> >>>>  wrote:
> >>>>> This commit synchronizes the irqs of the virtqueues
> >>>>> and the config space in the reset routine.
> >>>>> Thus ifcvf_stop_hw() and reset() are refactored as well.
> >>>>>
> >>>>> Signed-off-by: Zhu Lingshan 
> >>>>> ---
> >>>>>drivers/vdpa/ifcvf/ifcvf_base.c | 41 +
> >>>>>drivers/vdpa/ifcvf/ifcvf_base.h |  1 +
> >>>>>drivers/vdpa/ifcvf/ifcvf_main.c | 46
> >>>>> +
> >>>>>3 files changed, 38 insertions(+), 50 deletions(-)
> >>>>>
> >>>>> diff --git a/drivers/vdpa/ifcvf/ifcvf_base.c
> >>>>> b/drivers/vdpa/ifcvf/ifcvf_base.c
> >>>>> index 79e313c5e10e..1f39290baa38 100644
> >>>>> --- a/drivers/vdpa/ifcvf/ifcvf_base.c
> >>>>> +++ b/drivers/vdpa/ifcvf/ifcvf_base.c
> >>>>> @@ -170,12 +170,9 @@ void ifcvf_set_status(struct ifcvf_hw *hw, u8
> >>>>> status)
> >>>>>
> >>>>>void ifcvf_reset(struct ifcvf_hw *hw)
> >>>>>{
> >>>>> -   hw->config_cb.callback = NULL;
> >>>>> -   hw->config_cb.private = NULL;
> >>>>> -
> >>>>>   ifcvf_set_status(hw, 0);
> >>>>> -   /* flush set_status, make sure VF is stopped, reset */
> >>>>> -   ifcvf_get_status(hw);
> >>>>> +   while (ifcvf_get_status(hw))
> >>>>> +   msleep(1);
> >>>>>}
> >>>>>
> >>>>>u64 ifcvf_get_hw_features(struct ifcvf_hw *hw)
> >>>>> @@ -368,20 +365,42 @@ void ifcvf_set_vq_ready(struct ifcvf_hw *hw,
> >>>>> u16 qid, bool ready)
> >>>>>   vp_iowrite16(ready, &cfg->queue_enable);
> >>>>>}
> >>>>>
> >>>>> -static void ifcvf_hw_disable(struct ifcvf_hw *hw)
> >>>>> +static void ifcvf_reset_vring(struct ifcvf_hw *hw)
> >>>>>{
> >>>>> -   u32 i;
> >>>>> +   u16 qid;
> >>>>> +
> >>>>> +   for (qid = 0; qid < hw->nr_vring; qid++) {
> >>>>> +   hw->vring[qid].cb.callback = NULL;
> >>>>> +   hw->vring[qid].cb.private = NULL;
> >>>>> +   ifcvf_set_vq_vector(hw, qid, VIRTIO_MSI_NO_VECTOR);
> >>>>> +   }
> >>>>> +}
> >>>>>
> >>>>> +static void ifcvf_reset_config_handler(struct ifcvf_hw *hw)
> >>>>> +{
> >>>>> +   hw->config_cb.callback = NULL;
> >>>>> +   hw->config_cb.private = NULL;
> >>>>>   ifcvf_set_config_vector(hw, VIRTIO_MSI_NO_VECTOR);
> >>>>> -   for (i = 0; i < hw->nr_vring; i++) {
> >>>>> -   ifcvf_set_vq_vector(hw, i, VIRTIO_MSI_NO_VECTOR);
> >>>>> +}
> >>>>> +
> >>>>> +static void ifcvf_synchronize_irq(struct ifcvf_hw *hw)
> >>>>> +{
> >>>>> +   u32 nvectors = hw->num_msix_vectors;
> >>>>> +   struct pci_dev *pdev = hw->pdev;
> >>>>> +   int i, irq;
> >>>>> +
> >>>>> +   for (i = 0; i < nvectors; i++) {
> >>>>> +   irq = pci_irq_vector(pdev, i);
> >>>>> +   if (irq >= 0)
> >>>>> +   synchronize_irq(irq);
> >>>>>   }
> >>>>>}
> >>>>>
> >>>>>void ifcvf_stop_hw(struct ifcvf_hw *hw)
> >>>>>{
> >>>>> -   ifcvf_hw_disable(hw);
> >>>>> -   ifcvf_reset(hw);
> >>>>> +   ifcvf_synchronize_irq(hw);
> >>>>> +   ifcvf_reset_vring(hw);
> >>>>> +   ifcvf_reset_config_handler(hw);

[PATCH] virtio_ring: validate used buffer length

2023-05-25 Thread Jason Wang
This patch validate the used buffer length provided by the device
before trying to use it. This is done by remembering the in buffer
length in a dedicated array during virtqueue_add(), then we can fail
the virtqueue_get_buf() when we find the device is trying to give us a
used buffer length which is greater than we stored before.

This validation is disable by default via module parameter to unbreak
some existing devices since some legacy devices are known to report
buggy used length.

Signed-off-by: Jason Wang 
---
Changes since V4:
- drop the flag for driver to suppress the check
- validation is disabled by default
- don't do validation for legacy device
- rebase and support virtqueue resize
---
 drivers/virtio/virtio_ring.c | 75 
 1 file changed, 75 insertions(+)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 143f380baa1c..5b151605aaf8 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -15,6 +15,9 @@
 #include 
 #include 
 
+static bool force_used_validation = false;
+module_param(force_used_validation, bool, 0444);
+
 #ifdef DEBUG
 /* For development, we want to crash whenever the ring is screwed. */
 #define BAD_RING(_vq, fmt, args...)\
@@ -105,6 +108,9 @@ struct vring_virtqueue_split {
struct vring_desc_state_split *desc_state;
struct vring_desc_extra *desc_extra;
 
+   /* Maximum in buffer length, NULL means no used validation */
+   u32 *buflen;
+
/* DMA address and size information */
dma_addr_t queue_dma_addr;
size_t queue_size_in_bytes;
@@ -145,6 +151,9 @@ struct vring_virtqueue_packed {
struct vring_desc_state_packed *desc_state;
struct vring_desc_extra *desc_extra;
 
+   /* Maximum in buffer length, NULL means no used validation */
+   u32 *buflen;
+
/* DMA address and size information */
dma_addr_t ring_dma_addr;
dma_addr_t driver_event_dma_addr;
@@ -552,6 +561,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
unsigned int i, n, avail, descs_used, prev, err_idx;
int head;
bool indirect;
+   u32 buflen = 0;
 
START_USE(vq);
 
@@ -635,6 +645,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
 VRING_DESC_F_NEXT |
 VRING_DESC_F_WRITE,
 indirect);
+   buflen += sg->length;
}
}
/* Last one doesn't continue. */
@@ -675,6 +686,10 @@ static inline int virtqueue_add_split(struct virtqueue 
*_vq,
else
vq->split.desc_state[head].indir_desc = ctx;
 
+   /* Store in buffer length if necessary */
+   if (vq->split.buflen)
+   vq->split.buflen[head] = buflen;
+
/* Put entry in available array (but don't update avail->idx until they
 * do sync). */
avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1);
@@ -861,6 +876,11 @@ static void *virtqueue_get_buf_ctx_split(struct virtqueue 
*_vq,
BAD_RING(vq, "id %u is not a head!\n", i);
return NULL;
}
+   if (vq->split.buflen && unlikely(*len > vq->split.buflen[i])) {
+   BAD_RING(vq, "used len %d is larger than max in buffer len %u\n",
+   *len, vq->split.buflen[i]);
+   return NULL;
+   }
 
/* detach_buf_split clears data, so grab it now. */
ret = vq->split.desc_state[i].data;
@@ -1085,10 +1105,25 @@ static void vring_free_split(struct 
vring_virtqueue_split *vring_split,
 vring_split->queue_dma_addr,
 dma_dev);
 
+   kfree(vring_split->buflen);
kfree(vring_split->desc_state);
kfree(vring_split->desc_extra);
 }
 
+static bool vring_needs_used_validation(const struct virtio_device *vdev)
+{
+   /*
+* Several legacy devices are known to produce buggy used
+* length. In order to let driver work, we won't validate used
+* buffer length in this case.
+*/
+   if (!virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
+   return false;
+   if (force_used_validation)
+   return true;
+   return false;
+}
+
 static int vring_alloc_queue_split(struct vring_virtqueue_split *vring_split,
   struct virtio_device *vdev,
   u32 num,
@@ -1137,7 +1172,19 @@ static int vring_alloc_queue_split(struct 
vring_virtqueue_split *vring_split,
vring_split->vring_align = vring_align;
vring_split->may_reduce_num = may_reduce_num;
 
+   if (vring_needs_used_validation(vdev)) {
+ 
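
(The hunk is cut off above. A rough sketch of the allocation it
presumably performs at this point, based on the surrounding code; the
exact GFP flags and error handling here are assumptions, not the
literal patch text:)

        if (vring_needs_used_validation(vdev)) {
                vring_split->buflen =
                        kmalloc_array(num, sizeof(*vring_split->buflen),
                                      GFP_KERNEL);
                if (!vring_split->buflen)
                        return -ENOMEM;
        } else {
                vring_split->buflen = NULL;
        }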

Re: [PATCH net-next 1/5] virtio_net: Fix an unsafe reference to the page chain

2023-05-25 Thread Jason Wang
On Fri, May 26, 2023 at 1:46 PM Liang Chen  wrote:
>
> "private" of buffer page is currently used for big mode to chain pages.
> But in mergeable mode, that offset of page could mean something else,
> e.g. when page_pool page is used instead. So excluding mergeable mode to
> avoid such a problem.

If this issue happens only in the case of page_pool, it would be
better to squash it there.

Thanks

>
> Signed-off-by: Liang Chen 
> ---
>  drivers/net/virtio_net.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 5a7f7a76b920..c5dca0d92e64 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -497,7 +497,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info 
> *vi,
> return NULL;
>
> page = (struct page *)page->private;
> -   if (page)
> +   if (!vi->mergeable_rx_bufs && page)
> give_pages(rq, page);
> goto ok;
> }
> --
> 2.31.1
>


Re: [PATCH net-next 2/5] virtio_net: Add page_pool support to improve performance

2023-05-25 Thread Jason Wang
On Fri, May 26, 2023 at 1:46 PM Liang Chen  wrote:
>
> The implementation at the moment uses one page per packet in both the
> normal and XDP path.

It's better to explain why we need a page pool and how it helps
performance.

> In addition, introduce a module parameter to enable
> or disable the usage of the page pool (disabled by default).

If page pool wins for most of the cases, any reason to disable it by default?

>
> In single-core vm testing environments, it gives a modest performance gain
> in the normal path.
>   Upstream codebase: 47.5 Gbits/sec
>   Upstream codebase + page_pool support: 50.2 Gbits/sec
>
> In multi-core vm testing environments, The most significant performance
> gain is observed in XDP cpumap:
>   Upstream codebase: 1.38 Gbits/sec
>   Upstream codebase + page_pool support: 9.74 Gbits/sec

Please show more details on the test. E.g. which kinds of tests have
you measured?

Btw, it would be better to measure PPS as well.

>
> With this foundation, we can further integrate page pool fragmentation and
> DMA map/unmap support.
>
> Signed-off-by: Liang Chen 
> ---
>  drivers/net/virtio_net.c | 188 ++-

I believe we should make virtio-net select CONFIG_PAGE_POOL or do
the ifdef tricks at least.
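
(A rough sketch of the ifdef variant, assuming the non-page-pool build
simply falls back to the existing plain-page path:)

#ifdef CONFIG_PAGE_POOL
        rq->page_pool = page_pool_create(&pp_params);
#else
        rq->page_pool = NULL;   /* plain alloc_page()/put_page() path */
#endif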

>  1 file changed, 146 insertions(+), 42 deletions(-)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index c5dca0d92e64..99c0ca0c1781 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -31,6 +31,9 @@ module_param(csum, bool, 0444);
>  module_param(gso, bool, 0444);
>  module_param(napi_tx, bool, 0644);
>
> +static bool page_pool_enabled;
> +module_param(page_pool_enabled, bool, 0400);
> +
>  /* FIXME: MTU in config. */
>  #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
>  #define GOOD_COPY_LEN  128
> @@ -159,6 +162,9 @@ struct receive_queue {
> /* Chain pages by the private ptr. */
> struct page *pages;
>
> +   /* Page pool */
> +   struct page_pool *page_pool;
> +
> /* Average packet length for mergeable receive buffers. */
> struct ewma_pkt_len mrg_avg_pkt_len;
>
> @@ -459,6 +465,14 @@ static struct sk_buff *virtnet_build_skb(void *buf, 
> unsigned int buflen,
> return skb;
>  }
>
> +static void virtnet_put_page(struct receive_queue *rq, struct page *page)
> +{
> +   if (rq->page_pool)
> +   page_pool_put_full_page(rq->page_pool, page, true);
> +   else
> +   put_page(page);
> +}
> +
>  /* Called from bottom half context */
>  static struct sk_buff *page_to_skb(struct virtnet_info *vi,
>struct receive_queue *rq,
> @@ -555,7 +569,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info 
> *vi,
> hdr = skb_vnet_hdr(skb);
> memcpy(hdr, hdr_p, hdr_len);
> if (page_to_free)
> -   put_page(page_to_free);
> +   virtnet_put_page(rq, page_to_free);
>
> return skb;
>  }
> @@ -802,7 +816,7 @@ static int virtnet_xdp_xmit(struct net_device *dev,
> return ret;
>  }
>
> -static void put_xdp_frags(struct xdp_buff *xdp)
> +static void put_xdp_frags(struct xdp_buff *xdp, struct receive_queue *rq)
>  {

rq could be fetched from xdp_rxq_info?
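
(A minimal sketch of that suggestion, assuming the rxq pointer in the
xdp_buff is the xdp_rxq_info embedded in struct receive_queue:)

        struct receive_queue *rq = container_of(xdp->rxq,
                                                struct receive_queue,
                                                xdp_rxq);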

> struct skb_shared_info *shinfo;
> struct page *xdp_page;
> @@ -812,7 +826,7 @@ static void put_xdp_frags(struct xdp_buff *xdp)
> shinfo = xdp_get_shared_info_from_buff(xdp);
> for (i = 0; i < shinfo->nr_frags; i++) {
> xdp_page = skb_frag_page(&shinfo->frags[i]);
> -   put_page(xdp_page);
> +   virtnet_put_page(rq, xdp_page);
> }
> }
>  }
> @@ -903,7 +917,11 @@ static struct page *xdp_linearize_page(struct 
> receive_queue *rq,
> if (page_off + *len + tailroom > PAGE_SIZE)
> return NULL;
>
> -   page = alloc_page(GFP_ATOMIC);
> +   if (rq->page_pool)
> +   page = page_pool_dev_alloc_pages(rq->page_pool);
> +   else
> +   page = alloc_page(GFP_ATOMIC);
> +
> if (!page)
> return NULL;
>
> @@ -926,21 +944,24 @@ static struct page *xdp_linearize_page(struct 
> receive_queue *rq,
>  * is sending packet larger than the MTU.
>  */
> if ((page_off + buflen + tailroom) > PAGE_SIZE) {
> -   put_page(p);
> +   virtnet_put_page(rq, p);
> goto err_buf;
> }
>
> memcpy(page_address(page) + page_off,
>page_address(p) + off, buflen);
> page_off += buflen;
> -   put_page(p);
> +   virtnet_put_page(rq, p);
> }
>
> /* Headroom does not contribute to packet length */
> *len = page_off - VIRTIO_XDP_HEADROOM;
> return page;
>  err_buf:
> -   __free_pages

Re: [PATCH net-next 4/5] virtio_ring: Introduce DMA pre-handler

2023-05-25 Thread Jason Wang
On Fri, May 26, 2023 at 1:47 PM Liang Chen  wrote:
>
> Currently, DMA operations of virtio devices' data buffer are encapsulated
> within the underlying virtqueue implementation. DMA map/unmap operations
> are performed for each data buffer attached to/detached from the virtqueue,
> which is transparent and invisible to the higher-level virtio device
> drivers. This encapsulation makes it not viable for device drivers to
> introduce certain mechanisms, such as page pool, that require explicit
> management of DMA map/unmap. Therefore, by inserting a pre-handler before
> the generic DMA map/unmap operations, virtio device drivers have the
> opportunity to participate in DMA operations.
>
> Signed-off-by: Liang Chen 

So Xuan is doing AF_XDP for virtio-net, which allows the DMA to be
mapped at least by virtio-net.

It looks like a way to allow virtio-net to map and unmap the DMA
buffer by itself, but this patch goes another way, which seems to
query the address from the virtio core.

Personally, I think mapping and syncing by the virtio-net driver seems clean.
But we can see.

Thanks


Re: [PATCH net-next 5/5] virtio_net: Implement DMA pre-handler

2023-05-26 Thread Jason Wang
On Fri, May 26, 2023 at 1:47 PM Liang Chen  wrote:
>
> Adding a DMA pre-handler that utilizes page pool for managing DMA mappings.
> When IOMMU is enabled, turning on the page_pool_dma_map module parameter to
> select page pool for DMA mapping management gives a significant reduction
> in the overhead caused by DMA mappings.
>
> In testing environments with a single core vm and qemu emulated IOMMU,
> significant performance improvements can be observed:
>   Upstream codebase: 1.76 Gbits/sec
>   Upstream codebase with page pool fragmentation support: 1.81 Gbits/sec
>   Upstream codebase with page pool fragmentation and DMA support: 19.3
>   Gbits/sec
>
> Signed-off-by: Liang Chen 
> ---
>  drivers/net/virtio_net.c | 55 
>  1 file changed, 55 insertions(+)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index ac40b8c66c59..73cc4f9fe4fa 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -22,6 +22,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>
>  static int napi_weight = NAPI_POLL_WEIGHT;
>  module_param(napi_weight, int, 0444);
> @@ -33,8 +34,10 @@ module_param(napi_tx, bool, 0644);
>
>  static bool page_pool_enabled;
>  static bool page_pool_frag;
> +static bool page_pool_dma_map;
>  module_param(page_pool_enabled, bool, 0400);
>  module_param(page_pool_frag, bool, 0400);
> +module_param(page_pool_dma_map, bool, 0400);
>
>  /* FIXME: MTU in config. */
>  #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
> @@ -3830,6 +3833,49 @@ static void virtnet_del_vqs(struct virtnet_info *vi)
> virtnet_free_queues(vi);
>  }
>
> +static dma_addr_t virtnet_pp_dma_map_page(struct device *dev, struct page 
> *page,
> + unsigned long offset, size_t size,
> + enum dma_data_direction dir, 
> unsigned long attrs)
> +{
> +   struct page *head_page;
> +
> +   if (dir != DMA_FROM_DEVICE)
> +   return 0;
> +
> +   head_page = compound_head(page);
> +   return page_pool_get_dma_addr(head_page)
> +   + (page - head_page) * PAGE_SIZE
> +   + offset;

So it's not a map; it is just a query of the DMA address from the pool.

> +}
> +
> +static bool virtnet_pp_dma_unmap_page(struct device *dev, dma_addr_t 
> dma_handle,
> + size_t size, enum dma_data_direction 
> dir,
> + unsigned long attrs)
> +{
> +   phys_addr_t phys;
> +
> +   /* Handle only the RX direction, and sync the DMA memory only if it's 
> not
> +* a DMA coherent architecture.
> +*/
> +   if (dir != DMA_FROM_DEVICE)
> +   return false;
> +
> +   if (dev_is_dma_coherent(dev))
> +   return true;
> +
> +   phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle);

This would be somewhat slow. If we track the mapping in the driver, it
would be much faster.

More could be seen here:

https://lists.linuxfoundation.org/pipermail/virtualization/2023-May/066778.html

Thanks

> +   if (WARN_ON(!phys))
> +   return false;
> +
> +   arch_sync_dma_for_cpu(phys, size, dir);
> +   return true;
> +}
> +
> +static struct virtqueue_pre_dma_ops virtnet_pp_pre_dma_ops = {
> +   .map_page = virtnet_pp_dma_map_page,
> +   .unmap_page = virtnet_pp_dma_unmap_page,
> +};
> +
>  static void virtnet_alloc_page_pool(struct receive_queue *rq)
>  {
> struct virtio_device *vdev = rq->vq->vdev;
> @@ -3845,6 +3891,15 @@ static void virtnet_alloc_page_pool(struct 
> receive_queue *rq)
> if (page_pool_frag)
> pp_params.flags |= PP_FLAG_PAGE_FRAG;
>
> +   /* Consider using page pool DMA support only when DMA API is used. */
> +   if (virtio_has_feature(vdev, VIRTIO_F_ACCESS_PLATFORM) &&
> +   page_pool_dma_map) {
> +   pp_params.flags |= PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV;
> +   pp_params.dma_dir = DMA_FROM_DEVICE;
> +   pp_params.max_len = PAGE_SIZE << pp_params.order;
> +   virtqueue_register_pre_dma_ops(rq->vq, 
> &virtnet_pp_pre_dma_ops);
> +   }
> +
> rq->page_pool = page_pool_create(&pp_params);
> if (IS_ERR(rq->page_pool)) {
> dev_warn(&vdev->dev, "page pool creation failed: %ld\n",
> --
> 2.31.1
>


Re: [PATCH] virtio_ring: validate used buffer length

2023-05-28 Thread Jason Wang
On Sun, May 28, 2023 at 3:57 PM Michael S. Tsirkin  wrote:
>
> On Fri, May 26, 2023 at 02:30:41PM +0800, Jason Wang wrote:
> > This patch validate
>
> validates
>
> > the used buffer length provided by the device
> > before trying to use it.
>
> before returning it to caller
>
> > This is done by remembering the in buffer
> > length in a dedicated array during virtqueue_add(), then we can fail
> > the virtqueue_get_buf() when we find the device is trying to give us a
> > used buffer length which is greater than we stored before.
>
> than what we stored
>
> >
> > This validation is disable
>
> disabled
>
> > by default via module parameter to unbreak
> > some existing devices since some legacy devices are known to report
> > buggy used length.
> >
> > Signed-off-by: Jason Wang 
>
> First I'm not merging this without more data about
> what is known to be broken and what is known to work well
> in the commit log. And how exactly do things work if used length
> is wrong?

Assuming the device is malicious, it would be very hard to answer.
Auditing and fuzzing won't cover every case. Instead of trying to seek
the answer, we can simply make sure the used in buffer length is
validated then we know we're fine or not.

> Second what's wrong with dma_desc_extra that we already maintain?
> Third motivation - it's part and parcel of the hardening effort yes?

They are different. dma_desc_extra is for a descriptor ring, but this
is for a used ring. Technically we can go back to iterate on the
descriptor ring for a legal used in buffer length. But it will have
worse performance.
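
(A sketch of that slower alternative: a hypothetical helper that walks
desc_extra at get_buf time instead of keeping a dedicated buflen array;
it ignores indirect descriptors for brevity:)

        static u32 max_in_buflen_from_descs(struct vring_virtqueue *vq, u16 head)
        {
                u32 len = 0;
                u16 i = head;

                for (;;) {
                        struct vring_desc_extra *extra = &vq->split.desc_extra[i];

                        /* only device-writable descriptors count as "in" buffers */
                        if (extra->flags & VRING_DESC_F_WRITE)
                                len += extra->len;
                        if (!(extra->flags & VRING_DESC_F_NEXT))
                                break;
                        i = extra->next;
                }
                return len;
        }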

> I'd like to know the fate of VIRTIO_HARDEN_NOTIFICATION before
> we do more hardening. If it's irrevocably broken let's rip it out?

So the plan is

1) finish used ring validation (this had been proposed, merged and
reverted before notification hardening)
2) do notification hardening on top.

So let's leave it as is and I will do a rework after we finalize the
used ring validation.

Thanks

>
>
> > ---
> > Changes since V4:
> > - drop the flag for driver to suppress the check
> > - validation is disabled by default
> > - don't do validation for legacy device
> > - rebase and support virtqueue resize
> > ---
> >  drivers/virtio/virtio_ring.c | 75 
> >  1 file changed, 75 insertions(+)
> >
> > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > index 143f380baa1c..5b151605aaf8 100644
> > --- a/drivers/virtio/virtio_ring.c
> > +++ b/drivers/virtio/virtio_ring.c
> > @@ -15,6 +15,9 @@
> >  #include 
> >  #include 
> >
> > +static bool force_used_validation = false;
> > +module_param(force_used_validation, bool, 0444);
> > +
> >  #ifdef DEBUG
> >  /* For development, we want to crash whenever the ring is screwed. */
> >  #define BAD_RING(_vq, fmt, args...)  \
> > @@ -105,6 +108,9 @@ struct vring_virtqueue_split {
> >   struct vring_desc_state_split *desc_state;
> >   struct vring_desc_extra *desc_extra;
> >
> > + /* Maximum in buffer length, NULL means no used validation */
> > + u32 *buflen;
> > +
> >   /* DMA address and size information */
> >   dma_addr_t queue_dma_addr;
> >   size_t queue_size_in_bytes;
> > @@ -145,6 +151,9 @@ struct vring_virtqueue_packed {
> >   struct vring_desc_state_packed *desc_state;
> >   struct vring_desc_extra *desc_extra;
> >
> > + /* Maximum in buffer length, NULL means no used validation */
> > + u32 *buflen;
> > +
> >   /* DMA address and size information */
> >   dma_addr_t ring_dma_addr;
> >   dma_addr_t driver_event_dma_addr;
> > @@ -552,6 +561,7 @@ static inline int virtqueue_add_split(struct virtqueue 
> > *_vq,
> >   unsigned int i, n, avail, descs_used, prev, err_idx;
> >   int head;
> >   bool indirect;
> > + u32 buflen = 0;
> >
> >   START_USE(vq);
> >
> > @@ -635,6 +645,7 @@ static inline int virtqueue_add_split(struct virtqueue 
> > *_vq,
> >VRING_DESC_F_NEXT |
> >VRING_DESC_F_WRITE,
> >indirect);
> > + buflen += sg->length;
> >   }
> >   }
> >   /* Last one doesn't continue. */
> > @@ -675,6 +686,10 @@ static inline int virtqueue_add_split(struct virtqueue 
> >

Re: [PATCH V3 net-next 1/2] virtio-net: convert rx mode setting to use workqueue

2023-05-28 Thread Jason Wang
On Sun, May 28, 2023 at 7:39 PM Michael S. Tsirkin  wrote:
>
> On Fri, May 26, 2023 at 09:31:34AM +0800, Jason Wang wrote:
> > On Thu, May 25, 2023 at 3:41 PM Michael S. Tsirkin  wrote:
> > >
> > > On Thu, May 25, 2023 at 11:43:34AM +0800, Jason Wang wrote:
> > > > On Wed, May 24, 2023 at 5:15 PM Michael S. Tsirkin  
> > > > wrote:
> > > > >
> > > > > On Wed, May 24, 2023 at 04:18:41PM +0800, Jason Wang wrote:
> > > > > > This patch converts rx mode setting to be done in a workqueue; this
> > > > > > is a must to allow sleeping when waiting for the cvq command
> > > > > > response, since the current code is executed under the addr spin lock.
> > > > > >
> > > > > > Signed-off-by: Jason Wang 
> > > > > > ---
> > > > > > Changes since V1:
> > > > > > - use RTNL to synchronize rx mode worker
> > > > > > ---
> > > > > >  drivers/net/virtio_net.c | 55 
> > > > > > +---
> > > > > >  1 file changed, 52 insertions(+), 3 deletions(-)
> > > > > >
> > > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > > > > index 56ca1d270304..5d2f1da4eaa0 100644
> > > > > > --- a/drivers/net/virtio_net.c
> > > > > > +++ b/drivers/net/virtio_net.c
> > > > > > @@ -265,6 +265,12 @@ struct virtnet_info {
> > > > > >   /* Work struct for config space updates */
> > > > > >   struct work_struct config_work;
> > > > > >
> > > > > > + /* Work struct for config rx mode */
> > > > >
> > > > > With a bit less abbreviation maybe? setting rx mode?
> > > >
> > > > That's fine.
> > > >
> > > > >
> > > > > > + struct work_struct rx_mode_work;
> > > > > > +
> > > > > > + /* Is rx mode work enabled? */
> > > > >
> > > > > Ugh not a great comment.
> > > >
> > > > Any suggestions for this. E.g we had:
> > > >
> > > > /* Is delayed refill enabled? */
> > >
> > > /* OK to queue work setting RX mode? */
> >
> > Ok.
> >
> > >
> > >
> > > > >
> > > > > > + bool rx_mode_work_enabled;
> > > > > > +
> > > > >
> > > > >
> > > > >
> > > > > >   /* Does the affinity hint is set for virtqueues? */
> > > > > >   bool affinity_hint_set;
> > > > > >
> > > > > > @@ -388,6 +394,20 @@ static void disable_delayed_refill(struct 
> > > > > > virtnet_info *vi)
> > > > > >   spin_unlock_bh(&vi->refill_lock);
> > > > > >  }
> > > > > >
> > > > > > +static void enable_rx_mode_work(struct virtnet_info *vi)
> > > > > > +{
> > > > > > + rtnl_lock();
> > > > > > + vi->rx_mode_work_enabled = true;
> > > > > > + rtnl_unlock();
> > > > > > +}
> > > > > > +
> > > > > > +static void disable_rx_mode_work(struct virtnet_info *vi)
> > > > > > +{
> > > > > > + rtnl_lock();
> > > > > > + vi->rx_mode_work_enabled = false;
> > > > > > + rtnl_unlock();
> > > > > > +}
> > > > > > +
> > > > > >  static void virtqueue_napi_schedule(struct napi_struct *napi,
> > > > > >   struct virtqueue *vq)
> > > > > >  {
> > > > > > @@ -2341,9 +2361,11 @@ static int virtnet_close(struct net_device 
> > > > > > *dev)
> > > > > >   return 0;
> > > > > >  }
> > > > > >
> > > > > > -static void virtnet_set_rx_mode(struct net_device *dev)
> > > > > > +static void virtnet_rx_mode_work(struct work_struct *work)
> > > > > >  {
> > > > > > - struct virtnet_info *vi = netdev_priv(dev);
> > > > > > + struct virtnet_info *vi =
> > > > > > > + container_of(work, struct virtnet_info, rx_mode_work);
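
(For context, since the quoted patch is cut off here: a sketch of what
the converted ndo_set_rx_mode presumably becomes under this scheme,
based on the commit description rather than the literal patch text:)

        static void virtnet_set_rx_mode(struct net_device *dev)
        {
                struct virtnet_info *vi = netdev_priv(dev);

                /* defer the (sleeping) cvq work to the workqueue */
                if (vi->rx_mode_work_enabled)
                        schedule_work(&vi->rx_mode_work);
        }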

Re: [PATCH] virtio_ring: validate used buffer length

2023-05-30 Thread Jason Wang
On Mon, May 29, 2023 at 6:03 PM Michael S. Tsirkin  wrote:
>
> On Mon, May 29, 2023 at 09:18:10AM +0800, Jason Wang wrote:
> > On Sun, May 28, 2023 at 3:57 PM Michael S. Tsirkin  wrote:
> > >
> > > On Fri, May 26, 2023 at 02:30:41PM +0800, Jason Wang wrote:
> > > > This patch validate
> > >
> > > validates
> > >
> > > > the used buffer length provided by the device
> > > > before trying to use it.
> > >
> > > before returning it to caller
> > >
> > > > This is done by remembering the in buffer
> > > > length in a dedicated array during virtqueue_add(), then we can fail
> > > > the virtqueue_get_buf() when we find the device is trying to give us a
> > > > used buffer length which is greater than we stored before.
> > >
> > > than what we stored
> > >
> > > >
> > > > This validation is disable
> > >
> > > disabled
> > >
> > > > by default via module parameter to unbreak
> > > > some existing devices since some legacy devices are known to report
> > > > buggy used length.
> > > >
> > > > Signed-off-by: Jason Wang 
> > >
> > > First I'm not merging this without more data about
> > > what is known to be broken and what is known to work well
> > > in the commit log. And how exactly do things work if used length
> > > is wrong?
> >
> > Assuming the device is malicious, it would be very hard to answer.
> > Auditing and fuzzing won't cover every case. Instead of trying to seek
> > the answer, we can simply make sure the used in buffer length is
> > validated then we know we're fine or not.
>
> To restate the question, you said above "some legacy devices are known
> to report buggy used length". If they report buggy length then how
> can things work?

The validation is disabled for legacy device (as stated in the changelog):

static bool vring_needs_used_validation(const struct virtio_device *vdev)
{
/*
 * Several legacy devices are known to produce buggy used
 * length. In order to let driver work, we won't validate used
 * buffer length in this case.
 */
if (!virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
return false;
if (force_used_validation)
return true;
return false;
}

This seems to be what we've agreed in last version:

https://lore.kernel.org/all/canlsykxfhamuu0bb4j7y6n4_g9odkxlcjxxgxex4sj6_kf+...@mail.gmail.com/T/#m31f3b06f9032beec175c312dfa2532cb08b15c56
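
(Note: force_used_validation is declared with mode 0444, i.e. read-only
at runtime, so the check has to be enabled at module load time, or,
with virtio_ring built in, on the kernel command line as
virtio_ring.force_used_validation=1.)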

Thanks

>
> > > Second what's wrong with dma_desc_extra that we already maintain?
> > > Third motivation - it's part and parcel of the hardening effort yes?
> >
> > They are different. dma_desc_extra is for a descriptor ring, but this
> > is for a used ring. Technically we can go back to iterate on the
> > descriptor ring for a legal used in buffer length. But it will have
> > worse performance.
>
> I don't really understand. We already iterate when we unmap -
> all that is necessary is to subtract it from used length, if at
> the end of the process it is >0 then we know used length is too
> large.

Yes, but that is a job done at the driver level, not in the virtio
core. Validation in the virtio core is still necessary since they work
at different levels, and it's hard to force the validation in all
drivers by code. The last version introduced a
suppress_driver_validation to allow the driver to suppress the core
validation, which seems not good; we need a way to force the
virtio_ring code to do validation first. Or such stuff could be added
on top, since the validation is off by default anyway.

Thanks

>
>
> > > I'd like to know the fate of VIRTIO_HARDEN_NOTIFICATION before
> > > we do more hardening. If it's irrevocably broken let's rip it out?
> >
> > So the plan is
> >
> > 1) finish used ring validation (this had been proposed, merged and
> > reverted before notification hardening)
> > 2) do notification hardening on top.
> >
> > So let's leave it as is and I will do a rework after we finalize the
> > used ring validation.
> >
> > Thanks
> >
> > >
> > >
> > > > ---
> > > > Changes since V4:
> > > > - drop the flag for driver to suppress the check
> > > > - validation is disabled by default
> > > > - don't do validation for legacy device
> > > > - rebase and support virtqueue resize
> > > > ---
> > > >  

Re: [PATCH V3 net-next 1/2] virtio-net: convert rx mode setting to use workqueue

2023-05-30 Thread Jason Wang
On Mon, May 29, 2023 at 9:21 AM Jason Wang  wrote:
>
> On Sun, May 28, 2023 at 7:39 PM Michael S. Tsirkin  wrote:
> >
> > On Fri, May 26, 2023 at 09:31:34AM +0800, Jason Wang wrote:
> > > On Thu, May 25, 2023 at 3:41 PM Michael S. Tsirkin  
> > > wrote:
> > > >
> > > > On Thu, May 25, 2023 at 11:43:34AM +0800, Jason Wang wrote:
> > > > > On Wed, May 24, 2023 at 5:15 PM Michael S. Tsirkin  
> > > > > wrote:
> > > > > >
> > > > > > On Wed, May 24, 2023 at 04:18:41PM +0800, Jason Wang wrote:
> > > > > > > This patch converts rx mode setting to be done in a workqueue;
> > > > > > > this is a must to allow sleeping when waiting for the cvq command
> > > > > > > response, since the current code is executed under the addr spin lock.
> > > > > > >
> > > > > > > Signed-off-by: Jason Wang 
> > > > > > > ---
> > > > > > > Changes since V1:
> > > > > > > - use RTNL to synchronize rx mode worker
> > > > > > > ---
> > > > > > >  drivers/net/virtio_net.c | 55 
> > > > > > > +---
> > > > > > >  1 file changed, 52 insertions(+), 3 deletions(-)
> > > > > > >
> > > > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > > > > > index 56ca1d270304..5d2f1da4eaa0 100644
> > > > > > > --- a/drivers/net/virtio_net.c
> > > > > > > +++ b/drivers/net/virtio_net.c
> > > > > > > @@ -265,6 +265,12 @@ struct virtnet_info {
> > > > > > >   /* Work struct for config space updates */
> > > > > > >   struct work_struct config_work;
> > > > > > >
> > > > > > > + /* Work struct for config rx mode */
> > > > > >
> > > > > > With a bit less abbreviation maybe? setting rx mode?
> > > > >
> > > > > That's fine.
> > > > >
> > > > > >
> > > > > > > + struct work_struct rx_mode_work;
> > > > > > > +
> > > > > > > + /* Is rx mode work enabled? */
> > > > > >
> > > > > > Ugh not a great comment.
> > > > >
> > > > > Any suggestions for this. E.g we had:
> > > > >
> > > > > /* Is delayed refill enabled? */
> > > >
> > > > /* OK to queue work setting RX mode? */
> > >
> > > Ok.
> > >
> > > >
> > > >
> > > > > >
> > > > > > > + bool rx_mode_work_enabled;
> > > > > > > +
> > > > > >
> > > > > >
> > > > > >
> > > > > > >   /* Does the affinity hint is set for virtqueues? */
> > > > > > >   bool affinity_hint_set;
> > > > > > >
> > > > > > > @@ -388,6 +394,20 @@ static void disable_delayed_refill(struct 
> > > > > > > virtnet_info *vi)
> > > > > > >   spin_unlock_bh(&vi->refill_lock);
> > > > > > >  }
> > > > > > >
> > > > > > > +static void enable_rx_mode_work(struct virtnet_info *vi)
> > > > > > > +{
> > > > > > > + rtnl_lock();
> > > > > > > + vi->rx_mode_work_enabled = true;
> > > > > > > + rtnl_unlock();
> > > > > > > +}
> > > > > > > +
> > > > > > > +static void disable_rx_mode_work(struct virtnet_info *vi)
> > > > > > > +{
> > > > > > > + rtnl_lock();
> > > > > > > + vi->rx_mode_work_enabled = false;
> > > > > > > + rtnl_unlock();
> > > > > > > +}
> > > > > > > +
> > > > > > >  static void virtqueue_napi_schedule(struct napi_struct *napi,
> > > > > > >   struct virtqueue *vq)
> > > > > > >  {
> > > > > > > @@ -2341,9 +2361,11 @@ static int virtnet_close(struct net_device 
> > > > > > > *dev)
> > > > > > >   return 0;
> > > > > > >  }

Re: [PATCH v7 virtio 09/11] pds_vdpa: add support for vdpa and vdpamgmt interfaces

2023-05-30 Thread Jason Wang
On Sat, May 20, 2023 at 5:57 AM Shannon Nelson  wrote:
>
> This is the vDPA device support, where we advertise that we can
> support the virtio queues and deal with the configuration work
> through the pds_core's adminq.
>
> Signed-off-by: Shannon Nelson 
> ---
>
> Note: this had previously been Acked-by Jason Wang, but changed enough
>   in v6 that I felt it needs a new Ack.

Acked-by: Jason Wang 

Thanks

>
>  drivers/vdpa/pds/aux_drv.c  |  15 +
>  drivers/vdpa/pds/aux_drv.h  |   1 +
>  drivers/vdpa/pds/debugfs.c  | 263 
>  drivers/vdpa/pds/debugfs.h  |   5 +
>  drivers/vdpa/pds/vdpa_dev.c | 606 +++-
>  drivers/vdpa/pds/vdpa_dev.h |   4 +-
>  6 files changed, 892 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/vdpa/pds/aux_drv.c b/drivers/vdpa/pds/aux_drv.c
> index 0c4a135b1484..186e9ee22eb1 100644
> --- a/drivers/vdpa/pds/aux_drv.c
> +++ b/drivers/vdpa/pds/aux_drv.c
> @@ -63,8 +63,21 @@ static int pds_vdpa_probe(struct auxiliary_device *aux_dev,
> goto err_free_mgmt_info;
> }
>
> +   /* Let vdpa know that we can provide devices */
> +   err = vdpa_mgmtdev_register(&vdpa_aux->vdpa_mdev);
> +   if (err) {
> +   dev_err(dev, "%s: Failed to initialize vdpa_mgmt interface: 
> %pe\n",
> +   __func__, ERR_PTR(err));
> +   goto err_free_virtio;
> +   }
> +
> +   pds_vdpa_debugfs_add_pcidev(vdpa_aux);
> +   pds_vdpa_debugfs_add_ident(vdpa_aux);
> +
> return 0;
>
> +err_free_virtio:
> +   vp_modern_remove(&vdpa_aux->vd_mdev);
>  err_free_mgmt_info:
> pci_free_irq_vectors(padev->vf_pdev);
>  err_free_mem:
> @@ -79,9 +92,11 @@ static void pds_vdpa_remove(struct auxiliary_device 
> *aux_dev)
> struct pds_vdpa_aux *vdpa_aux = auxiliary_get_drvdata(aux_dev);
> struct device *dev = &aux_dev->dev;
>
> +   vdpa_mgmtdev_unregister(&vdpa_aux->vdpa_mdev);
> vp_modern_remove(&vdpa_aux->vd_mdev);
> pci_free_irq_vectors(vdpa_aux->padev->vf_pdev);
>
> +   pds_vdpa_debugfs_del_vdpadev(vdpa_aux);
> kfree(vdpa_aux);
> auxiliary_set_drvdata(aux_dev, NULL);
>
> diff --git a/drivers/vdpa/pds/aux_drv.h b/drivers/vdpa/pds/aux_drv.h
> index 99e0ff340bfa..26b75344156e 100644
> --- a/drivers/vdpa/pds/aux_drv.h
> +++ b/drivers/vdpa/pds/aux_drv.h
> @@ -13,6 +13,7 @@ struct pds_vdpa_aux {
> struct pds_auxiliary_dev *padev;
>
> struct vdpa_mgmt_dev vdpa_mdev;
> +   struct pds_vdpa_device *pdsv;
>
> struct pds_vdpa_ident ident;
>
> diff --git a/drivers/vdpa/pds/debugfs.c b/drivers/vdpa/pds/debugfs.c
> index d91dceb07380..21a0dc0cb607 100644
> --- a/drivers/vdpa/pds/debugfs.c
> +++ b/drivers/vdpa/pds/debugfs.c
> @@ -10,6 +10,7 @@
>  #include 
>
>  #include "aux_drv.h"
> +#include "vdpa_dev.h"
>  #include "debugfs.h"
>
>  static struct dentry *dbfs_dir;
> @@ -24,3 +25,265 @@ void pds_vdpa_debugfs_destroy(void)
> debugfs_remove_recursive(dbfs_dir);
> dbfs_dir = NULL;
>  }
> +
> +#define PRINT_SBIT_NAME(__seq, __f, __name) \
> +   do {\
> +   if ((__f) & (__name))   \
> +   seq_printf(__seq, " %s", &#__name[16]); \
> +   } while (0)
> +
> +static void print_status_bits(struct seq_file *seq, u8 status)
> +{
> +   seq_puts(seq, "status:");
> +   PRINT_SBIT_NAME(seq, status, VIRTIO_CONFIG_S_ACKNOWLEDGE);
> +   PRINT_SBIT_NAME(seq, status, VIRTIO_CONFIG_S_DRIVER);
> +   PRINT_SBIT_NAME(seq, status, VIRTIO_CONFIG_S_DRIVER_OK);
> +   PRINT_SBIT_NAME(seq, status, VIRTIO_CONFIG_S_FEATURES_OK);
> +   PRINT_SBIT_NAME(seq, status, VIRTIO_CONFIG_S_NEEDS_RESET);
> +   PRINT_SBIT_NAME(seq, status, VIRTIO_CONFIG_S_FAILED);
> +   seq_puts(seq, "\n");
> +}
> +
> +static void print_feature_bits_all(struct seq_file *seq, u64 features)
> +{
> +   int i;
> +
> +   seq_puts(seq, "features:");
> +
> +   for (i = 0; i < (sizeof(u64) * 8); i++) {
> +   u64 mask = BIT_ULL(i);
> +
> +   switch (features & mask) {
> +   case BIT_ULL(VIRTIO_NET_F_CSUM):
> +   seq_puts(seq, " VIRTIO_NET_F_CSUM");
> +   break;
> +   case BIT_ULL(VIRTIO_NET_F_GUEST_CSUM):
> +   seq_puts(seq, " VIRTIO_NET_F_GUEST_CSUM");
>

Re: [PATCH] vduse: avoid empty string for dev name

2023-05-30 Thread Jason Wang
On Tue, May 30, 2023 at 11:37 AM Sheng Zhao  wrote:
>
> Syzkaller hits a kernel WARN when the first character of the dev name
> provided is NULL. The solution is to add a NULL check before calling
> cdev_device_add() in vduse_create_dev().
>
> kobject: (72042169): attempted to be registered with empty name!
> WARNING: CPU: 0 PID: 112695 at lib/kobject.c:236
> Call Trace:
>  kobject_add_varg linux/src/lib/kobject.c:390 [inline]
>  kobject_add+0xf6/0x150 linux/src/lib/kobject.c:442
>  device_add+0x28f/0xc20 linux/src/drivers/base/core.c:2167
>  cdev_device_add+0x83/0xc0 linux/src/fs/char_dev.c:546
>  vduse_create_dev linux/src/drivers/vdpa/vdpa_user/vduse_dev.c:2254 [inline]
>  vduse_ioctl+0x7b5/0xf30 linux/src/drivers/vdpa/vdpa_user/vduse_dev.c:2316
>  vfs_ioctl linux/src/fs/ioctl.c:47 [inline]
>  file_ioctl linux/src/fs/ioctl.c:510 [inline]
>  do_vfs_ioctl+0x14b/0xa80 linux/src/fs/ioctl.c:697
>  ksys_ioctl+0x7c/0xa0 linux/src/fs/ioctl.c:714
>  __do_sys_ioctl linux/src/fs/ioctl.c:721 [inline]
>  __se_sys_ioctl linux/src/fs/ioctl.c:719 [inline]
>  __x64_sys_ioctl+0x42/0x50 linux/src/fs/ioctl.c:719
>  do_syscall_64+0x94/0x330 linux/src/arch/x86/entry/common.c:291
>  entry_SYSCALL_64_after_hwframe+0x44/0xa9
>
> Reported-by: Xianjun Zeng 
> Signed-off-by: Sheng Zhao 

Acked-by: Jason Wang 

Thanks

> ---
>  drivers/vdpa/vdpa_user/vduse_dev.c | 3 +++
>  1 file changed, 3 insertions(+)
>
> diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c 
> b/drivers/vdpa/vdpa_user/vduse_dev.c
> index de97e38c3b82..5f5c21674fdc 100644
> --- a/drivers/vdpa/vdpa_user/vduse_dev.c
> +++ b/drivers/vdpa/vdpa_user/vduse_dev.c
> @@ -1685,6 +1685,9 @@ static bool vduse_validate_config(struct 
> vduse_dev_config *config)
> if (config->vq_num > 0x)
> return false;
>
> +   if (!config->name[0])
> +   return false;
> +
> if (!device_is_allowed(config->device_id))
> return false;
>
> --
> 2.20.1
>


Re: [PATCH 3/3] fork, vhost: Use CLONE_THREAD to fix freezer/ps regression

2023-05-30 Thread Jason Wang


On 2023/5/23 23:57, Eric W. Biederman wrote:

Oleg Nesterov  writes:


On 05/22, Oleg Nesterov wrote:

Right now I think that "int dead" should die,

No, probably we shouldn't call get_signal() if we have already
dequeued SIGKILL.

Very much agreed.  It is one thing to add a patch to move do_exit
out of get_signal.  It is another to keep calling get_signal after
that.  Nothing tests that case, and so we get some weird behaviors.



but let me think tomorrow.

Maybe something like this... I don't like it but I can't suggest anything
better right now.

bool killed = false;

for (;;) {
...

node = llist_del_all(&worker->work_list);
if (!node) {
schedule();
/*
 * When we get a SIGKILL our release function will
 * be called. That will stop new IOs from being queued
 * and check for outstanding cmd responses. It will then
 * call vhost_task_stop to tell us to return and exit.
 */
if (signal_pending(current)) {
struct ksignal ksig;

if (!killed)
killed = get_signal(&ksig);

clear_thread_flag(TIF_SIGPENDING);
}

continue;
}

I want to point out that we need to consider not just SIGKILL, but
SIGABRT that causes a coredump, as well as the process performing
an ordinary exit(2).  All of which will cause get_signal to return
SIGKILL in this context.


---
But let me ask a couple of questions.

I share most of these questions.


Let's forget this patch, let's look at the
current code:

node = llist_del_all(&worker->work_list);
if (!node)
schedule();

node = llist_reverse_order(node);
... process works ...

To me this looks a bit confusing. Shouldn't we do

if (!node) {
schedule();
continue;
}

just to make the code a bit more clear? If node == NULL then
llist_reverse_order() and llist_for_each_entry_safe() will do nothing.
But this is minor.



/* make sure flag is seen after deletion */
smp_wmb();
llist_for_each_entry_safe(work, work_next, node, node) {
clear_bit(VHOST_WORK_QUEUED, &work->flags);

I am not sure about smp_wmb + clear_bit. Once we clear VHOST_WORK_QUEUED,
vhost_work_queue() can add this work again and change work->node->next.

That is why we use _safe, but we need to ensure that llist_for_each_safe()
completes LOAD(work->node->next) before VHOST_WORK_QUEUED is cleared.

So it seems that smp_wmb() can't help and should be removed, instead we need

llist_for_each_entry_safe(...) {
smp_mb__before_atomic();
clear_bit(VHOST_WORK_QUEUED, &work->flags);

Also, if the work->fn pointer is not stable, we should read it before
smp_mb__before_atomic() as well.

No?


__set_current_state(TASK_RUNNING);

Why do we set TASK_RUNNING inside the loop? Does this mean that work->fn()
can return with current->state != RUNNING ?


work->fn(work);

Now the main question. Whatever we do, SIGKILL/SIGSTOP/etc can come right
before we call work->fn(). Is it "safe" to run this callback with
signal_pending() or fatal_signal_pending() ?


Finally. I never looked into drivers/vhost/ before so I don't understand
this code at all, but let me ask anyway... Can we change vhost_dev_flush()
to run the pending callbacks rather than wait for vhost_worker() ?
I guess we can't, ->mm won't be correct, but can you confirm?

In a conversation long ago I remember hearing that vhost does not
support file descriptor passing.  Which means all of the file
descriptors should be in the same process.



It's not. Actually, passing a vhost fd is pretty common since Qemu is
usually running without privilege. So it's the job of the management
layer to open the vhost fd and pass it to Qemu.





Looking at the vhost code what I am seeing happening is that the
vhost_worker persists until vhost_dev_cleanup is called from
one of the vhost_???_release() functions.  The release functions
are only called after the last flush function completes.  See __fput
if you want to trace the details.


On one hand this all seems reasonable.  On the other hand I am not
seeing the code that prevents file descriptor passing.



Yes.





It is probably not the worst thing in the world, but what this means
is now if you pass a copy of the vhost file descriptor to another
process the vhost_worker will persist.

Re: [PATCH 3/3] fork, vhost: Use CLONE_THREAD to fix freezer/ps regression

2023-05-30 Thread Jason Wang


On 2023/5/23 20:15, Oleg Nesterov wrote:

On 05/22, Oleg Nesterov wrote:

Right now I think that "int dead" should die,

No, probably we shouldn't call get_signal() if we have already dequeued SIGKILL.


but let me think tomorrow.

Maybe something like this... I don't like it but I can't suggest anything
better right now.

bool killed = false;

for (;;) {
...

node = llist_del_all(&worker->work_list);
if (!node) {
schedule();
/*
 * When we get a SIGKILL our release function will
 * be called. That will stop new IOs from being queued
 * and check for outstanding cmd responses. It will then
 * call vhost_task_stop to tell us to return and exit.
 */
if (signal_pending(current)) {
struct ksignal ksig;

if (!killed)
killed = get_signal(&ksig);

clear_thread_flag(TIF_SIGPENDING);
}

continue;
}

---
But let me ask a couple of questions. Let's forget this patch, let's look at the
current code:

node = llist_del_all(&worker->work_list);
if (!node)
schedule();

node = llist_reverse_order(node);
... process works ...

To me this looks a bit confusing. Shouldn't we do

if (!node) {
schedule();
continue;
}

just to make the code a bit more clear? If node == NULL then
llist_reverse_order() and llist_for_each_entry_safe() will do nothing.
But this is minor.



Yes.






/* make sure flag is seen after deletion */
smp_wmb();
llist_for_each_entry_safe(work, work_next, node, node) {
clear_bit(VHOST_WORK_QUEUED, &work->flags);

I am not sure about smp_wmb + clear_bit. Once we clear VHOST_WORK_QUEUED,
vhost_work_queue() can add this work again and change work->node->next.

That is why we use _safe, but we need to ensure that llist_for_each_safe()
completes LOAD(work->node->next) before VHOST_WORK_QUEUED is cleared.



This should be fine since stores are not speculated, so work->node->next
needs to be loaded before VHOST_WORK_QUEUED is cleared to meet the loop
condition.




So it seems that smp_wmb() can't help and should be removed, instead we need

llist_for_each_entry_safe(...) {
smp_mb__before_atomic();
clear_bit(VHOST_WORK_QUEUED, &work->flags);

Also, if the work->fn pointer is not stable, we should read it before
smp_mb__before_atomic() as well.



The fn won't be changed after it is initialized.




No?


__set_current_state(TASK_RUNNING);

Why do we set TASK_RUNNING inside the loop? Does this mean that work->fn()
can return with current->state != RUNNING ?



It is because the state was set to TASK_INTERRUPTIBLE at the beginning
of the loop; otherwise there might be side effects while executing work->fn().






work->fn(work);

Now the main question. Whatever we do, SIGKILL/SIGSTOP/etc can come right
before we call work->fn(). Is it "safe" to run this callback with
signal_pending() or fatal_signal_pending() ?



It looks safe since:

1) vhost holds a refcnt of the mm
2) release will sync with the worker





Finally. I never looked into drivers/vhost/ before so I don't understand
this code at all, but let me ask anyway... Can we change vhost_dev_flush()
to run the pending callbacks rather than wait for vhost_worker() ?
I guess we can't, ->mm won't be correct, but can you confirm?



Yes.

Thanks




Oleg.




Re: [PATCH] virtio_ring: validate used buffer length

2023-05-31 Thread Jason Wang
On Wed, May 31, 2023 at 1:50 PM Michael S. Tsirkin  wrote:
>
> On Wed, May 31, 2023 at 09:05:00AM +0800, Jason Wang wrote:
> > On Mon, May 29, 2023 at 6:03 PM Michael S. Tsirkin  wrote:
> > >
> > > On Mon, May 29, 2023 at 09:18:10AM +0800, Jason Wang wrote:
> > > > On Sun, May 28, 2023 at 3:57 PM Michael S. Tsirkin  
> > > > wrote:
> > > > >
> > > > > On Fri, May 26, 2023 at 02:30:41PM +0800, Jason Wang wrote:
> > > > > > This patch validate
> > > > >
> > > > > validates
> > > > >
> > > > > > the used buffer length provided by the device
> > > > > > before trying to use it.
> > > > >
> > > > > before returning it to caller
> > > > >
> > > > > > This is done by remembering the in buffer
> > > > > > length in a dedicated array during virtqueue_add(), then we can fail
> > > > > > the virtqueue_get_buf() when we find the device is trying to give 
> > > > > > us a
> > > > > > used buffer length which is greater than we stored before.
> > > > >
> > > > > than what we stored
> > > > >
> > > > > >
> > > > > > This validation is disable
> > > > >
> > > > > disabled
> > > > >
> > > > > > by default via module parameter to unbreak
> > > > > > some existing devices since some legacy devices are known to report
> > > > > > buggy used length.
> > > > > >
> > > > > > Signed-off-by: Jason Wang 
> > > > >
> > > > > First I'm not merging this without more data about
> > > > > what is known to be broken and what is known to work well
> > > > > in the commit log. And how exactly do things work if used length
> > > > > is wrong?
> > > >
> > > > Assuming the device is malicious, it would be very hard to answer.
> > > > Auditing and fuzzing won't cover every case. Instead of trying to seek
> > > > the answer, we can simply make sure the used in buffer length is
> > > > validated then we know we're fine or not.
> > >
> > > To restate the question, you said above "some legacy devices are known
> > > to report buggy used length". If they report buggy length then how
> > > can things work?
> >
> > The validation is disabled for legacy device (as stated in the changelog):
> >
> > static bool vring_needs_used_validation(const struct virtio_device *vdev)
> > {
> > /*
> >  * Several legacy devices are known to produce buggy used
> >  * length. In order to let driver work, we won't validate used
> >  * buffer length in this case.
> >  */
> > if (!virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
> > return false;
> > if (force_used_validation)
> > return true;
> > return false;
> > }
> >
> > This seems to be what we've agreed in last version:
> >
> > https://lore.kernel.org/all/canlsykxfhamuu0bb4j7y6n4_g9odkxlcjxxgxex4sj6_kf+...@mail.gmail.com/T/#m31f3b06f9032beec175c312dfa2532cb08b15c56
> >
> > Thanks
> >
>
> I don't get it. You wrote:
>
> This validation is disable
> by default via module parameter to unbreak
> some existing devices since some legacy devices are known to report
> buggy used length.
>
> which devices?

legacy rpmsg and vsock device (before 49d8c5ffad07) at least.

> why do you need a module parameter?

If we enable it unconditionally for modern devices, it may break some
buggy modern device (vsock without a fix as an example).

>
>
> > >
> > > > > Second what's wrong with dma_desc_extra that we already maintain?
> > > > > Third motivation - it's part and parcel of the hardening effort yes?
> > > >
> > > > They are different. dma_desc_extra is for a descriptor ring, but this
> > > > is for a used ring. Technically we can go back to iterate on the
> > > > descriptor ring for a legal used in buffer length. But it will have
> > > > worse performance.
> > >
> > > I don't really understand. We already iterate when we unmap -
> > > all that is necessary is to subtract it from used length, if at
> > > the end of the process it is >0 then we know used length is too large.

Re: [PATCH 3/3] fork, vhost: Use CLONE_THREAD to fix freezer/ps regression

2023-05-31 Thread Jason Wang
On Wed, May 31, 2023 at 3:25 PM Oleg Nesterov  wrote:
>
> On 05/31, Jason Wang wrote:
> >
> > On 2023/5/23 20:15, Oleg Nesterov wrote:
> > >
> > > /* make sure flag is seen after deletion */
> > > smp_wmb();
> > > llist_for_each_entry_safe(work, work_next, node, node) {
> > > clear_bit(VHOST_WORK_QUEUED, &work->flags);
> > >
> > >I am not sure about smp_wmb + clear_bit. Once we clear VHOST_WORK_QUEUED,
> > >vhost_work_queue() can add this work again and change work->node->next.
> > >
> > >That is why we use _safe, but we need to ensure that llist_for_each_safe()
> > >completes LOAD(work->node->next) before VHOST_WORK_QUEUED is cleared.
> >
> > This should be fine since stores are not speculated, so work->node->next needs
> > to be loaded before VHOST_WORK_QUEUED is cleared to meet the loop condition.
>
> I don't understand you. OK, to simplify, suppose we have 2 global vars
>
> void *PTR = something_non_null;
> unsigned long FLAGS = -1ul;
>
> Now I think this code
>
> CPU_0   CPU_1
>
> void *ptr = PTR;if (!test_and_set_bit(0, FLAGS))
> clear_bit(0, FLAGS);PTR = NULL;
> BUG_ON(!ptr);
>
> is racy and can hit the BUG_ON(!ptr).

This seems different to the above case? And you can hit BUG_ON with
the following execution sequence:

[cpu 0] clear_bit(0, FLAGS);
[cpu 1] if (!test_and_set_bit(0, FLAGS))
[cpu 1] PTR = NULL;
[cpu 0] BUG_ON(!ptr)

In vhost code, there's a condition before the clear_bit() which sits
inside llist_for_each_entry_safe():

#define llist_for_each_entry_safe(pos, n, node, member)\
for (pos = llist_entry((node), typeof(*pos), member);  \
 member_address_is_nonnull(pos, member) && \
(n = llist_entry(pos->member.next, typeof(*n), member), true); \
 pos = n)

The clear_bit() is a store which is not speculated, so there's a
control dependency, the store can't be executed until the condition
expression is evaluated which requires pos->member.next
(work->node.next) to be loaded.

>
> I guess it is fine on x86, but in general you need smp_mb__before_atomic()
> before clear_bit(), or clear_bit_unlock().
>
> > > __set_current_state(TASK_RUNNING);
> > >
> > >Why do we set TASK_RUNNING inside the loop? Does this mean that work->fn()
> > >can return with current->state != RUNNING ?
> >
> > It is because the state was set to TASK_INTERRUPTIBLE at the beginning of
> > the loop; otherwise there might be side effects while executing work->fn().
>
> Again, I don't understand you. So let me repeat: can work->fn() return with
> current->_state != TASK_RUNNING ? If not (and I'd say it should not), you can
> do __set_current_state(TASK_RUNNING) once, before llist_for_each_entry_safe().
>

Ok, that should be fine.

Thanks


> > >Now the main question. Whatever we do, SIGKILL/SIGSTOP/etc can come right
> > >before we call work->fn(). Is it "safe" to run this callback with
> > >signal_pending() or fatal_signal_pending() ?
> >
> > It looks safe since:
> >
> > 1) vhost holds a refcnt of the mm
> > 2) release will sync with the worker
>
> Well, that's not what I asked... nevermind, please forget.
>
> Thanks.
>
> Oleg.
>


Re: [PATCH] virtio_ring: validate used buffer length

2023-05-31 Thread Jason Wang
On Wed, May 31, 2023 at 3:36 PM Jason Wang  wrote:
>
> On Wed, May 31, 2023 at 1:50 PM Michael S. Tsirkin  wrote:
> >
> > On Wed, May 31, 2023 at 09:05:00AM +0800, Jason Wang wrote:
> > > On Mon, May 29, 2023 at 6:03 PM Michael S. Tsirkin  
> > > wrote:
> > > >
> > > > On Mon, May 29, 2023 at 09:18:10AM +0800, Jason Wang wrote:
> > > > > On Sun, May 28, 2023 at 3:57 PM Michael S. Tsirkin  
> > > > > wrote:
> > > > > >
> > > > > > On Fri, May 26, 2023 at 02:30:41PM +0800, Jason Wang wrote:
> > > > > > > This patch validate
> > > > > >
> > > > > > validates
> > > > > >
> > > > > > > the used buffer length provided by the device
> > > > > > > before trying to use it.
> > > > > >
> > > > > > before returning it to caller
> > > > > >
> > > > > > > This is done by remembering the in buffer
> > > > > > > length in a dedicated array during virtqueue_add(), then we can 
> > > > > > > fail
> > > > > > > the virtqueue_get_buf() when we find the device is trying to give 
> > > > > > > us a
> > > > > > > used buffer length which is greater than we stored before.
> > > > > >
> > > > > > than what we stored
> > > > > >
> > > > > > >
> > > > > > > This validation is disable
> > > > > >
> > > > > > disabled
> > > > > >
> > > > > > > by default via module parameter to unbreak
> > > > > > > some existing devices since some legacy devices are known to 
> > > > > > > report
> > > > > > > buggy used length.
> > > > > > >
> > > > > > > Signed-off-by: Jason Wang 
> > > > > >
> > > > > > First I'm not merging this without more data about
> > > > > > what is known to be broken and what is known to work well
> > > > > > in the commit log. And how exactly do things work if used length
> > > > > > is wrong?
> > > > >
> > > > > Assuming the device is malicious, it would be very hard to answer.
> > > > > Auditing and fuzzing won't cover every case. Instead of trying to seek
> > > > > the answer, we can simply make sure the used in buffer length is
> > > > > validated then we know we're fine or not.
> > > >
> > > > To restate the question, you said above "some legacy devices are known
> > > > to report buggy used length". If they report buggy length then how
> > > > can things work?
> > >
> > > The validation is disabled for legacy device (as stated in the changelog):
> > >
> > > static bool vring_needs_used_validation(const struct virtio_device *vdev)
> > > {
> > > /*
> > >  * Several legacy devices are known to produce buggy used
> > >  * length. In order to let driver work, we won't validate used
> > >  * buffer length in this case.
> > >  */
> > > if (!virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
> > > return false;
> > > if (force_used_validation)
> > > return true;
> > > return false;
> > > }
> > >
> > > This seems to be what we've agreed in last version:
> > >
> > > https://lore.kernel.org/all/canlsykxfhamuu0bb4j7y6n4_g9odkxlcjxxgxex4sj6_kf+...@mail.gmail.com/T/#m31f3b06f9032beec175c312dfa2532cb08b15c56
> > >
> > > Thanks
> > >
> >
> > I don't get it. You wrote:
> >
> > This validation is disable
> > by default via module parameter to unbreak
> > some existing devices since some legacy devices are known to report
> > buggy used length.
> >
> > which devices?
>
> legacy rpmsg and vsock device (before 49d8c5ffad07) at least.
>
> > why do you need a module parameter?
>
> If we enable it unconditionally for modern devices, it may break some
> buggy modern devices (vsock without a fix, as an example).
>
> >
> >
> > > >
> > > > > > Second what's wrong with dma_desc_extra that we already maintain?
> > > > > > Third motivation - 
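
For reference, the validation scheme described in this thread's commit
message amounts to something like the sketch below. This is an
illustration only; the buflen array and helper names are assumptions,
not the actual patch:

/* Remember the device-writable ("in") length for each descriptor head
 * at virtqueue_add() time, then reject a used length that exceeds it
 * in virtqueue_get_buf(). Names here are hypothetical. */
static void vring_save_in_len(struct vring_virtqueue *vq,
                              unsigned int head, u32 in_len)
{
        if (vq->buflen)                 /* allocated only when validating */
                vq->buflen[head] = in_len;
}

static bool vring_used_len_ok(const struct vring_virtqueue *vq,
                              unsigned int head, u32 used_len)
{
        /* A device reporting more bytes than the driver exposed is
         * buggy or malicious, so virtqueue_get_buf() should fail. */
        return !vq->buflen || used_len <= vq->buflen[head];
}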

Re: [PATCH] virtio_ring: validate used buffer length

2023-05-31 Thread Jason Wang
On Wed, May 31, 2023 at 5:55 PM Michael S. Tsirkin  wrote:
>
> On Wed, May 31, 2023 at 03:36:51PM +0800, Jason Wang wrote:
> > On Wed, May 31, 2023 at 1:50 PM Michael S. Tsirkin  wrote:
> > >
> > > On Wed, May 31, 2023 at 09:05:00AM +0800, Jason Wang wrote:
> > > > On Mon, May 29, 2023 at 6:03 PM Michael S. Tsirkin  
> > > > wrote:
> > > > >
> > > > > On Mon, May 29, 2023 at 09:18:10AM +0800, Jason Wang wrote:
> > > > > > On Sun, May 28, 2023 at 3:57 PM Michael S. Tsirkin 
> > > > > >  wrote:
> > > > > > >
> > > > > > > On Fri, May 26, 2023 at 02:30:41PM +0800, Jason Wang wrote:
> > > > > > > > This patch validate
> > > > > > >
> > > > > > > validates
> > > > > > >
> > > > > > > > the used buffer length provided by the device
> > > > > > > > before trying to use it.
> > > > > > >
> > > > > > > before returning it to caller
> > > > > > >
> > > > > > > > This is done by remembering the in buffer
> > > > > > > > length in a dedicated array during virtqueue_add(), then we can 
> > > > > > > > fail
> > > > > > > > the virtqueue_get_buf() when we find the device is trying to 
> > > > > > > > give us a
> > > > > > > > used buffer length which is greater than we stored before.
> > > > > > >
> > > > > > > than what we stored
> > > > > > >
> > > > > > > >
> > > > > > > > This validation is disable
> > > > > > >
> > > > > > > disabled
> > > > > > >
> > > > > > > > by default via module parameter to unbreak
> > > > > > > > some existing devices since some legacy devices are known to 
> > > > > > > > report
> > > > > > > > buggy used length.
> > > > > > > >
> > > > > > > > Signed-off-by: Jason Wang 
> > > > > > >
> > > > > > > First I'm not merging this without more data about
> > > > > > > what is known to be broken and what is known to work well
> > > > > > > in the commit log. And how exactly do things work if used length
> > > > > > > is wrong?
> > > > > >
> > > > > > Assuming the device is malicious, it would be very hard to answer.
> > > > > > Auditing and fuzzing won't cover every case. Instead of trying to 
> > > > > > seek
> > > > > > the answer, we can simply make sure the used in buffer length is
> > > > > > validated then we know we're fine or not.
> > > > >
> > > > > To restate the question, you said above "some legacy devices are known
> > > > > to report buggy used length". If they report buggy length then how
> > > > > can things work?
> > > >
> > > > The validation is disabled for legacy device (as stated in the 
> > > > changelog):
> > > >
> > > > static bool vring_needs_used_validation(const struct virtio_device 
> > > > *vdev)
> > > > {
> > > > /*
> > > >  * Several legacy devices are known to produce buggy used
> > > >  * length. In order to let driver work, we won't validate used
> > > >  * buffer length in this case.
> > > >  */
> > > > if (!virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
> > > > return false;
> > > > if (force_used_validation)
> > > > return true;
> > > > return false;
> > > > }
> > > >
> > > > This seems to be what we've agreed in last version:
> > > >
> > > > https://lore.kernel.org/all/canlsykxfhamuu0bb4j7y6n4_g9odkxlcjxxgxex4sj6_kf+...@mail.gmail.com/T/#m31f3b06f9032beec175c312dfa2532cb08b15c56
> > > >
> > > > Thanks
> > > >
> > >
> > > I don't get it. You wrote:
> > >
> > > This validation is disable
> > > by default via module parameter to unbreak
> > > some existing devices since so

Re: [PATCH] virtio_ring: validate used buffer length

2023-05-31 Thread Jason Wang
On Wed, May 31, 2023 at 6:25 PM Michael S. Tsirkin  wrote:
>
> On Wed, May 31, 2023 at 04:26:38PM +0800, Jason Wang wrote:
> > On Wed, May 31, 2023 at 3:36 PM Jason Wang  wrote:
> > >
> > > On Wed, May 31, 2023 at 1:50 PM Michael S. Tsirkin  
> > > wrote:
> > > >
> > > > On Wed, May 31, 2023 at 09:05:00AM +0800, Jason Wang wrote:
> > > > > On Mon, May 29, 2023 at 6:03 PM Michael S. Tsirkin  
> > > > > wrote:
> > > > > >
> > > > > > On Mon, May 29, 2023 at 09:18:10AM +0800, Jason Wang wrote:
> > > > > > > On Sun, May 28, 2023 at 3:57 PM Michael S. Tsirkin 
> > > > > > >  wrote:
> > > > > > > >
> > > > > > > > On Fri, May 26, 2023 at 02:30:41PM +0800, Jason Wang wrote:
> > > > > > > > > This patch validate
> > > > > > > >
> > > > > > > > validates
> > > > > > > >
> > > > > > > > > the used buffer length provided by the device
> > > > > > > > > before trying to use it.
> > > > > > > >
> > > > > > > > before returning it to caller
> > > > > > > >
> > > > > > > > > This is done by remembering the in buffer
> > > > > > > > > length in a dedicated array during virtqueue_add(), then we 
> > > > > > > > > can fail
> > > > > > > > > the virtqueue_get_buf() when we find the device is trying to 
> > > > > > > > > give us a
> > > > > > > > > used buffer length which is greater than we stored before.
> > > > > > > >
> > > > > > > > than what we stored
> > > > > > > >
> > > > > > > > >
> > > > > > > > > This validation is disable
> > > > > > > >
> > > > > > > > disabled
> > > > > > > >
> > > > > > > > > by default via module parameter to unbreak
> > > > > > > > > some existing devices since some legacy devices are known to 
> > > > > > > > > report
> > > > > > > > > buggy used length.
> > > > > > > > >
> > > > > > > > > Signed-off-by: Jason Wang 
> > > > > > > >
> > > > > > > > First I'm not merging this without more data about
> > > > > > > > what is known to be broken and what is known to work well
> > > > > > > > in the commit log. And how exactly do things work if used length
> > > > > > > > is wrong?
> > > > > > >
> > > > > > > Assuming the device is malicious, it would be very hard to answer.
> > > > > > > Auditing and fuzzing won't cover every case. Instead of trying to 
> > > > > > > seek
> > > > > > > the answer, we can simply make sure the used in buffer length is
> > > > > > > validated then we know we're fine or not.
> > > > > >
> > > > > > To restate the question, you said above "some legacy devices are 
> > > > > > known
> > > > > > to report buggy used length". If they report buggy length then how
> > > > > > can things work?
> > > > >
> > > > > The validation is disabled for legacy device (as stated in the 
> > > > > changelog):
> > > > >
> > > > > static bool vring_needs_used_validation(const struct virtio_device 
> > > > > *vdev)
> > > > > {
> > > > > /*
> > > > >  * Several legacy devices are known to produce buggy used
> > > > >  * length. In order to let driver work, we won't validate used
> > > > >  * buffer length in this case.
> > > > >  */
> > > > > if (!virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
> > > > > return false;
> > > > > if (force_used_validation)
> > > > > return true;
> > > > > return false;
> > > > > }
> > > > >
> > > > > This seems to be what we've agreed in last version:
>

Re: [PATCH 3/3] fork, vhost: Use CLONE_THREAD to fix freezer/ps regression

2023-05-31 Thread Jason Wang
On Wed, May 31, 2023 at 5:14 PM Oleg Nesterov  wrote:
>
> On 05/31, Jason Wang wrote:
> >
> > On Wed, May 31, 2023 at 3:25 PM Oleg Nesterov  wrote:
> > >
> > > On 05/31, Jason Wang wrote:
> > > >
> > > > 在 2023/5/23 20:15, Oleg Nesterov 写道:
> > > > >
> > > > > /* make sure flag is seen after deletion */
> > > > > smp_wmb();
> > > > > llist_for_each_entry_safe(work, work_next, node, node) {
> > > > > clear_bit(VHOST_WORK_QUEUED, &work->flags);
> > > > >
> > > > >I am not sure about smp_wmb + clear_bit. Once we clear 
> > > > >VHOST_WORK_QUEUED,
> > > > >vhost_work_queue() can add this work again and change work->node->next.
> > > > >
> > > > >That is why we use _safe, but we need to ensure that 
> > > > >llist_for_each_safe()
> > > > >completes LOAD(work->node->next) before VHOST_WORK_QUEUED is cleared.
> > > >
> > > > This should be fine since store is not speculated, so work->node->next 
> > > > needs
> > > > to be loaded before VHOST_WORK_QUEUED is cleared to meet the loop 
> > > > condition.
> > >
> > > I don't understand you. OK, to simplify, suppose we have 2 global vars
> > >
> > > void *PTR = something_non_null;
> > > unsigned long FLAGS = -1ul;
> > >
> > > Now I think this code
> > >
> > > CPU_0   CPU_1
> > >
> > > void *ptr = PTR;if (!test_and_set_bit(0, FLAGS))
> > > clear_bit(0, FLAGS);PTR = NULL;
> > > BUG_ON(!ptr);
> > >
> > > is racy and can hit the BUG_ON(!ptr).
> >
> > This seems different to the above case?
>
> not sure,
>
> > And you can hit BUG_ON with
> > the following execution sequence:
> >
> > [cpu 0] clear_bit(0, FLAGS);
> > [cpu 1] if (!test_and_set_bit(0, FLAGS))
> > [cpu 1] PTR = NULL;
> > [cpu 0] BUG_ON(!ptr)
>
> I don't understand this part... yes, we can hit this BUG_ON() without mb in
> between, this is what I tried to say.

I may be missing something, but the above is the sequence executed by
the processor (for each CPU, it's just the program order). So where do
you expect an mb() could be placed to help?

>
> > In vhost code, there's a condition before the clear_bit() which sits
> > inside llist_for_each_entry_safe():
> >
> > #define llist_for_each_entry_safe(pos, n, node, member) 
> >\
> > for (pos = llist_entry((node), typeof(*pos), member);   
> >\
> >  member_address_is_nonnull(pos, member) &&  
> >\
> > (n = llist_entry(pos->member.next, typeof(*n), member), 
> > true); \
> >  pos = n)
> >
> > The clear_bit() is a store which is not speculated, so there's a
> > control dependency: the store can't be executed until the condition
> > expression is evaluated, which requires pos->member.next
> > (work->node.next) to be loaded.
>
> But llist_for_each_entry_safe() doesn't check "n", I mean, it is not that we 
> have
> something like
>
> n = llist_entry(...);
> if (n)
> clear_bit(...);
>
> so I do not see how can we rely on the load-store control dependency.

Just to make sure we are on the same page, the condition expression is

member_address_is_nonnull(pos, member) && (n =
llist_entry(pos->member.next, typeof(*n), member), true)

So it's something like:

if (work->node && (work_next = work->node->next, true))
        clear_bit(VHOST_WORK_QUEUED, &work->flags);

So there are two loads (from work->node and work->node->next) and a
store (the clear_bit), so isn't that a load-store control dependency?
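
To illustrate, a sketch of how the loop in vhost_worker() expands, with
the load and the store of the claimed dependency annotated (simplified
from the kernel code; the annotations follow the reading above and are
not an authoritative statement of the memory model):

node = llist_del_all(&worker->work_list);
llist_for_each_entry_safe(work, work_next, node, node) {
        /* Each iteration, the loop condition roughly does:
         *   - check that work's embedded llist node is non-NULL
         *   - work_next = work->node.next            (the LOAD)
         * The body below only executes if the condition holds.
         */
        clear_bit(VHOST_WORK_QUEUED, &work->flags);   /* the STORE */
        work->fn(work);
}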

Thanks

>
> Oleg.
>


Re: [PATCH 3/3] fork, vhost: Use CLONE_THREAD to fix freezer/ps regression

2023-06-01 Thread Jason Wang
On Thu, Jun 1, 2023 at 3:43 PM Oleg Nesterov  wrote:
>
> On 06/01, Jason Wang wrote:
> >
> > On Wed, May 31, 2023 at 5:14 PM Oleg Nesterov  wrote:
> > >
> > > > > I don't understand you. OK, to simplify, suppose we have 2 global vars
> > > > >
> > > > > void *PTR = something_non_null;
> > > > > unsigned long FLAGS = -1ul;
> > > > >
> > > > > Now I think this code
> > > > >
> > > > > CPU_0   CPU_1
> > > > >
> > > > > void *ptr = PTR;if (!test_and_set_bit(0, 
> > > > > FLAGS))
> > > > > clear_bit(0, FLAGS);PTR = NULL;
> > > > > BUG_ON(!ptr);
> > > > >
> > > > > is racy and can hit the BUG_ON(!ptr).
> > > >
> > > > This seems different to the above case?
> > >
> > > not sure,
> > >
> > > > And you can hit BUG_ON with
> > > > the following execution sequence:
> > > >
> > > > [cpu 0] clear_bit(0, FLAGS);
> > > > [cpu 1] if (!test_and_set_bit(0, FLAGS))
> > > > [cpu 1] PTR = NULL;
> > > > [cpu 0] BUG_ON(!ptr)
> > >
> > > I don't understand this part... yes, we can hit this BUG_ON() without mb 
> > > in
> > > between, this is what I tried to say.
> >
> > I may be missing something,
>
> Or me... note that CPU_0 loads the global "PTR" into the local "ptr" before 
> clear_bit.
> Since you have mentioned the program order: yes this lacks READ_ONCE() or 
> barrier(),
> but the same is true for the code in vhost_worker(). So I still don't 
> understand.
>
> > but the above is the sequence executed by the processor (for each
> > CPU, it's just the program order). So where do you expect an mb()
> > could be placed to help?
>
> before clear_bit:
>
> CPU_0
>
> void *ptr = PTR;
> mb();   // implies compiler barrier as well
> clear_bit(0, FLAGS);
> BUG_ON(!ptr);
>
> just in case... mb() in the code above is only for illustration, we can use
> smp_mb__before_atomic() + clear_bit(). Or just clear_bit_unlock(), iiuc the
> one-way barrier is fine in this case.
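
For completeness, the two alternatives mentioned above would look
roughly like this in the toy PTR/FLAGS example (illustration only):

void *ptr = PTR;
smp_mb__before_atomic();   /* full barrier before a non-value-returning atomic */
clear_bit(0, &FLAGS);
BUG_ON(!ptr);

/* or, relying on release semantics: */
void *ptr2 = PTR;
clear_bit_unlock(0, &FLAGS);    /* one-way (release) barrier */
BUG_ON(!ptr2);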

Ok, but it seems different: in the case of vhost we have a condition
above the clear_bit().

>
>
> > > > In vhost code, there's a condition before the clear_bit() which sits
> > > > inside llist_for_each_entry_safe():
> > > >
> > > > #define llist_for_each_entry_safe(pos, n, node, member) 
> > > >\
> > > > for (pos = llist_entry((node), typeof(*pos), member);   
> > > >\
> > > >  member_address_is_nonnull(pos, member) &&  
> > > >\
> > > > (n = llist_entry(pos->member.next, typeof(*n), member), 
> > > > true); \
> > > >  pos = n)
> > > >
> > > > The clear_bit() is a store which is not speculated, so there's a
> > > > control dependency: the store can't be executed until the condition
> > > > expression is evaluated, which requires pos->member.next
> > > > (work->node.next) to be loaded.
> > >
> > > But llist_for_each_entry_safe() doesn't check "n", I mean, it is not that 
> > > we have
> > > something like
> > >
> > > n = llist_entry(...);
> > > if (n)
> > > clear_bit(...);
> > >
> > > so I do not see how can we rely on the load-store control dependency.
> >
> > Just to make sure we are on the same page, the condition expression is
> >
> > member_address_is_nonnull(pos, member) && (n =
> > llist_entry(pos->member.next, typeof(*n), member), true)
> >
> > So it's something like:
> >
> > if (work->node && (work_next = work->node->next, true))
> >         clear_bit(VHOST_WORK_QUEUED, &work->flags);
> >
> > So there are two loads (from work->node and work->node->next) and a
> > store (the clear_bit), so isn't that a load-store control dependency?
>
> I guess you missed the comma expression...

Probably not, see below:

> Let me rewrite your pseudo-code
> above, it is equivalent to
>
> if (work->node) {
> if ((work_next = work->nod

Re: [PATCH] vp_vdpa: Check queue number of vdpa device from add_config

2023-06-04 Thread Jason Wang
On Fri, Jun 2, 2023 at 3:35 PM Angus Chen  wrote:
>
> When adding a virtio_pci vdpa device, check the number of vqs from the
> device cap and max_vq_pairs from add_config.
>
> Signed-off-by: Angus Chen 
> ---
>  drivers/vdpa/virtio_pci/vp_vdpa.c | 11 +--
>  1 file changed, 9 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/vdpa/virtio_pci/vp_vdpa.c 
> b/drivers/vdpa/virtio_pci/vp_vdpa.c
> index 281287fae89f..4bf1ab637d32 100644
> --- a/drivers/vdpa/virtio_pci/vp_vdpa.c
> +++ b/drivers/vdpa/virtio_pci/vp_vdpa.c
> @@ -478,7 +478,7 @@ static int vp_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, 
> const char *name,
> struct device *dev = &pdev->dev;
> struct vp_vdpa *vp_vdpa = NULL;
> u64 device_features;
> -   int ret, i;
> +   int ret, i, queues;
>
> vp_vdpa = vdpa_alloc_device(struct vp_vdpa, vdpa,
> dev, &vp_vdpa_ops, 1, 1, name, false);
> @@ -491,7 +491,14 @@ static int vp_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, 
> const char *name,
> vp_vdpa_mgtdev->vp_vdpa = vp_vdpa;
>
> vp_vdpa->vdpa.dma_dev = &pdev->dev;
> -   vp_vdpa->queues = vp_modern_get_num_queues(mdev);
> +   queues = vp_modern_get_num_queues(mdev);
> +   if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MAX_VQP)) {
> +   if (add_config->net.max_vq_pairs > queues / 2)
> +   return -EINVAL;
> +   queues = min_t(u32, queues, 2 * add_config->net.max_vq_pairs);

Looks like you want to mediate the max_vqp here, but what happens:

1) the hardware has 4 queue pairs
2) vp_vdpa caps it to 2 queue pairs
3) the guest may still try to enable 4 queue pairs

For 3), the kernel would need to mediate the control virtqueue, which
seems not easy.

How about simply failing if the provisioned #qp is not equal to the
one the hardware has?
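
A minimal sketch of that suggestion (illustrative; the v2 patch later
in this digest implements a variant of it):

if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MAX_VQP)) {
        /* Refuse to provision rather than cap: vp_vdpa cannot mediate
         * a guest that later enables more queue pairs than provisioned. */
        if (add_config->net.max_vq_pairs != vp_modern_get_num_queues(mdev) / 2)
                return -EINVAL;
}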

Thanks

> +   }
> +
> +   vp_vdpa->queues = queues;
> vp_vdpa->mdev = mdev;
>
> device_features = vp_modern_get_features(mdev);
> --
> 2.25.1
>


Re: [PATCH] vhost-vdpa: filter VIRTIO_F_RING_PACKED feature

2023-06-05 Thread Jason Wang
On Mon, Jun 5, 2023 at 10:58 PM Stefano Garzarella  wrote:
>
> On Mon, Jun 05, 2023 at 09:54:57AM -0400, Michael S. Tsirkin wrote:
> >On Mon, Jun 05, 2023 at 03:30:35PM +0200, Stefano Garzarella wrote:
> >> On Mon, Jun 05, 2023 at 09:00:25AM -0400, Michael S. Tsirkin wrote:
> >> > On Mon, Jun 05, 2023 at 02:54:20PM +0200, Stefano Garzarella wrote:
> >> > > On Mon, Jun 05, 2023 at 08:41:54AM -0400, Michael S. Tsirkin wrote:
> >> > > > On Mon, Jun 05, 2023 at 01:06:44PM +0200, Stefano Garzarella wrote:
> >> > > > > vhost-vdpa IOCTLs (eg. VHOST_GET_VRING_BASE, VHOST_SET_VRING_BASE)
> >> > > > > don't support packed virtqueue well yet, so let's filter the
> >> > > > > VIRTIO_F_RING_PACKED feature for now in vhost_vdpa_get_features().
> >> > > > >
> >> > > > > This way, even if the device supports it, we don't risk it being
> >> > > > > negotiated, then the VMM is unable to set the vring state properly.
> >> > > > >
> >> > > > > Fixes: 4c8cf31885f6 ("vhost: introduce vDPA-based backend")
> >> > > > > Cc: sta...@vger.kernel.org
> >> > > > > Signed-off-by: Stefano Garzarella 
> >> > > > > ---
> >> > > > >
> >> > > > > Notes:
> >> > > > > This patch should be applied before the "[PATCH v2 0/3] 
> >> > > > > vhost_vdpa:
> >> > > > > better PACKED support" series [1] and backported in stable 
> >> > > > > branches.
> >> > > > >
> >> > > > > We can revert it when we are sure that everything is working 
> >> > > > > with
> >> > > > > packed virtqueues.
> >> > > > >
> >> > > > > Thanks,
> >> > > > > Stefano
> >> > > > >
> >> > > > > [1] 
> >> > > > > https://lore.kernel.org/virtualization/20230424225031.18947-1-shannon.nel...@amd.com/
> >> > > >
> >> > > > I'm a bit lost here. So why am I merging "better PACKED support" 
> >> > > > then?
> >> > >
> >> > > To really support packed virtqueue with vhost-vdpa, at that point we 
> >> > > would
> >> > > also have to revert this patch.
> >> > >
> >> > > I wasn't sure if you wanted to queue the series for this merge window.
> >> > > In that case do you think it is better to send this patch only for 
> >> > > stable
> >> > > branches?
> >> > > > Does this patch make them a NOP?
> >> > >
> >> > > Yep, after applying the "better PACKED support" series and being
> >> > > sure that
> >> > > the IOCTLs of vhost-vdpa support packed virtqueue, we should revert 
> >> > > this
> >> > > patch.
> >> > >
> >> > > Let me know if you prefer a different approach.
> >> > >
> >> > > I'm concerned that QEMU uses vhost-vdpa IOCTLs thinking that the kernel
> >> > > interprets them the right way, when it does not.
> >> > >
> >> > > Thanks,
> >> > > Stefano
> >> > >
> >> >
> >> > If this fixes a bug can you add Fixes tags to each of them? Then it's ok
> >> > to merge in this window. Probably easier than the elaborate
> >> > mask/unmask dance.
> >>
> >> CCing Shannon (the original author of the "better PACKED support"
> >> series).
> >>
> >> IIUC Shannon is going to send a v3 of that series to fix the
> >> documentation, so Shannon can you also add the Fixes tags?
> >>
> >> Thanks,
> >> Stefano
> >
> >Well this is in my tree already. Just reply with
> >Fixes: <>
> >to each and I will add these tags.
>
> I tried, but it is not easy since we added the support for packed
> virtqueue in vdpa and vhost incrementally.
>
> Initially I was thinking of adding the same tag used here:
>
> Fixes: 4c8cf31885f6 ("vhost: introduce vDPA-based backend")
>
> Then I discovered that vq_state wasn't there, so I was thinking of
>
> Fixes: 530a5678bc00 ("vdpa: support packed virtqueue for set/get_vq_state()")
>
> So we would have to backport quite a few patches into the stable branches.
> I don't know if it's worth it...
>
> I still think it is better to disable packed in the stable branches,
> otherwise I have to make a list of all the patches we need.
>
> Any other ideas?

AFAIK, except for vp_vdpa, pds seems to be the first parent that
supports packed virtqueue. Users should not notice anything wrong if
they don't use packed virtqueue. And the problem of vp_vdpa + packed
virtqueue has existed since day 0 of vp_vdpa. It seems fine to do
nothing, I guess.

Thanks

>
> Thanks,
> Stefano
>
>


Re: [PATCH net] virtio_net: use control_buf for coalesce params

2023-06-05 Thread Jason Wang
On Tue, Jun 6, 2023 at 3:59 AM Brett Creeley  wrote:
>
> Commit 699b045a8e43 ("net: virtio_net: notifications coalescing
> support") added coalescing command support for virtio_net. However,
> the coalesce commands are using buffers on the stack, which is causing
> the device to see DMA errors. There should also be a complaint from
> check_for_stack() in debug_dma_map_xyz(). Fix this by adding and using
> coalesce params from the control_buf struct, which aligns with other
> commands.
>
> Fixes: 699b045a8e43 ("net: virtio_net: notifications coalescing support")
> Reviewed-by: Shannon Nelson 
> Signed-off-by: Allen Hubbe 
> Signed-off-by: Brett Creeley 
> ---
>  drivers/net/virtio_net.c | 16 ++++----

The patch is needed for -stable I think.

Acked-by: Jason Wang 

Thanks

>  1 file changed, 8 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 56ca1d270304..486b5849033d 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -205,6 +205,8 @@ struct control_buf {
> __virtio16 vid;
> __virtio64 offloads;
> struct virtio_net_ctrl_rss rss;
> +   struct virtio_net_ctrl_coal_tx coal_tx;
> +   struct virtio_net_ctrl_coal_rx coal_rx;
>  };
>
>  struct virtnet_info {
> @@ -2934,12 +2936,10 @@ static int virtnet_send_notf_coal_cmds(struct 
> virtnet_info *vi,
>struct ethtool_coalesce *ec)
>  {
> struct scatterlist sgs_tx, sgs_rx;
> -   struct virtio_net_ctrl_coal_tx coal_tx;
> -   struct virtio_net_ctrl_coal_rx coal_rx;
>
> -   coal_tx.tx_usecs = cpu_to_le32(ec->tx_coalesce_usecs);
> -   coal_tx.tx_max_packets = cpu_to_le32(ec->tx_max_coalesced_frames);
> -   sg_init_one(&sgs_tx, &coal_tx, sizeof(coal_tx));
> +   vi->ctrl->coal_tx.tx_usecs = cpu_to_le32(ec->tx_coalesce_usecs);
> +   vi->ctrl->coal_tx.tx_max_packets = 
> cpu_to_le32(ec->tx_max_coalesced_frames);
> +   sg_init_one(&sgs_tx, &vi->ctrl->coal_tx, sizeof(vi->ctrl->coal_tx));
>
> if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_NOTF_COAL,
>   VIRTIO_NET_CTRL_NOTF_COAL_TX_SET,
> @@ -2950,9 +2950,9 @@ static int virtnet_send_notf_coal_cmds(struct 
> virtnet_info *vi,
> vi->tx_usecs = ec->tx_coalesce_usecs;
> vi->tx_max_packets = ec->tx_max_coalesced_frames;
>
> -   coal_rx.rx_usecs = cpu_to_le32(ec->rx_coalesce_usecs);
> -   coal_rx.rx_max_packets = cpu_to_le32(ec->rx_max_coalesced_frames);
> -   sg_init_one(&sgs_rx, &coal_rx, sizeof(coal_rx));
> +   vi->ctrl->coal_rx.rx_usecs = cpu_to_le32(ec->rx_coalesce_usecs);
> +   vi->ctrl->coal_rx.rx_max_packets = 
> cpu_to_le32(ec->rx_max_coalesced_frames);
> +   sg_init_one(&sgs_rx, &vi->ctrl->coal_rx, sizeof(vi->ctrl->coal_rx));
>
> if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_NOTF_COAL,
>   VIRTIO_NET_CTRL_NOTF_COAL_RX_SET,
> --
> 2.17.1
>


Re: [RFC PATCH net] virtio_net: Prevent napi_weight changes with VIRTIO_NET_F_NOTF_COAL support

2023-06-05 Thread Jason Wang
On Tue, Jun 6, 2023 at 5:03 AM Brett Creeley  wrote:
>
> Commit 699b045a8e43 ("net: virtio_net: notifications coalescing
> support") added support for VIRTIO_NET_F_NOTF_COAL. The get_coalesce
> call made changes to report "1" in tx_max_coalesced_frames if
> VIRTIO_NET_F_NOTF_COAL is not supported and napi.weight is non-zero.
> However, the napi_weight value could still be changed by the
> set_coalesce call regardless of whether or not the device supports
> VIRTIO_NET_F_NOTF_COAL.
>
> It seems like the tx_max_coalesced_frames value should not control more
> than 1 thing (i.e. napi_weight and the device's tx_max_packets). So, fix
> this by only allowing the napi_weight change if VIRTIO_NET_F_NOTF_COAL
> is not supported by the virtio device.
>
> It wasn't clear to me if this was the intended behavior, so that's why
> I'm sending this as an RFC patch initially. Based on the feedback, I
> will resubmit as an official patch.

It seems the current code is fine since:

Before tx coalescing, we have two modes for tx interrupt:

1) TX NAPI mode, using NAPI to recycle xmit packets
2) TX no-NAPI mode, which depends on start_xmit() to recycle xmit packets

Each has its own use cases. E.g. 1) seems to have better buffer
interaction with TCP, but 2) seems to behave better if the user cares
about PPS; it can give us 2x PPS when using a vhost-user backend.

So we leave an option to switch between the two via sq.napi_weight:

ethtool -C tx-frames-irq 0 // To disable tx interrupts
ethtool -C tx-frames-irq 1 // To enable tx interrupts

After tx interrupt coalescing, we want to stick to this API:

ethtool -C tx-frames-irq 0 // To disable tx interrupts
ethtool -C tx-frames-irq N (N>=1) // To enable tx interrupts

Thanks

>
> Fixes: 699b045a8e43 ("net: virtio_net: notifications coalescing support")
> Signed-off-by: Brett Creeley 
> ---
>  drivers/net/virtio_net.c | 24 +---
>  1 file changed, 13 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 486b5849033d..e28387866909 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -2990,19 +2990,21 @@ static int virtnet_set_coalesce(struct net_device 
> *dev,
> int ret, i, napi_weight;
> bool update_napi = false;
>
> -   /* Can't change NAPI weight if the link is up */
> -   napi_weight = ec->tx_max_coalesced_frames ? NAPI_POLL_WEIGHT : 0;
> -   if (napi_weight ^ vi->sq[0].napi.weight) {
> -   if (dev->flags & IFF_UP)
> -   return -EBUSY;
> -   else
> -   update_napi = true;
> -   }
> -
> -   if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_NOTF_COAL))
> +   if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_NOTF_COAL)) {
> ret = virtnet_send_notf_coal_cmds(vi, ec);
> -   else
> +   } else {
> +   /* Can't change NAPI weight if the link is up */
> +   napi_weight = ec->tx_max_coalesced_frames ?
> +   NAPI_POLL_WEIGHT : 0;
> +   if (napi_weight ^ vi->sq[0].napi.weight) {
> +   if (dev->flags & IFF_UP)
> +   return -EBUSY;
> +   else
> +   update_napi = true;
> +   }
> +
> ret = virtnet_coal_params_supported(ec);
> +   }
>
> if (ret)
> return ret;
> --
> 2.17.1
>


Re: [RFC PATCH net] virtio_net: Prevent napi_weight changes with VIRTIO_NET_F_NOTF_COAL support

2023-06-05 Thread Jason Wang
On Tue, Jun 6, 2023 at 9:57 AM Xuan Zhuo  wrote:
>
> On Mon, 5 Jun 2023 14:02:36 -0700, Brett Creeley  
> wrote:
> > Commit 699b045a8e43 ("net: virtio_net: notifications coalescing
> > support") added support for VIRTIO_NET_F_NOTF_COAL. The get_coalesce
> > call made changes to report "1" in tx_max_coalesced_frames if
> > VIRTIO_NET_F_NOTF_COAL is not supported and napi.weight is non-zero.
> > However, the napi_weight value could still be changed by the
> > set_coalesce call regardless of whether or not the device supports
> > VIRTIO_NET_F_NOTF_COAL.
> >
> > It seems like the tx_max_coalesced_frames value should not control more
> > than 1 thing (i.e. napi_weight and the device's tx_max_packets). So, fix
> > this by only allowing the napi_weight change if VIRTIO_NET_F_NOTF_COAL
> > is not supported by the virtio device.
>
>
> @Jason I wonder whether we should keep this function of changing the
> napi weight via the coalesce command.

I think so, explained in another thread.

Thanks

>
> Thanks.
>
> >
> > It wasn't clear to me if this was the intended behavior, so that's why
> > I'm sending this as an RFC patch initially. Based on the feedback, I
> > will resubmit as an official patch.
> >
> > Fixes: 699b045a8e43 ("net: virtio_net: notifications coalescing support")
> > Signed-off-by: Brett Creeley 
> > ---
> >  drivers/net/virtio_net.c | 24 +---
> >  1 file changed, 13 insertions(+), 11 deletions(-)
> >
> > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > index 486b5849033d..e28387866909 100644
> > --- a/drivers/net/virtio_net.c
> > +++ b/drivers/net/virtio_net.c
> > @@ -2990,19 +2990,21 @@ static int virtnet_set_coalesce(struct net_device 
> > *dev,
> >   int ret, i, napi_weight;
> >   bool update_napi = false;
> >
> > - /* Can't change NAPI weight if the link is up */
> > - napi_weight = ec->tx_max_coalesced_frames ? NAPI_POLL_WEIGHT : 0;
> > - if (napi_weight ^ vi->sq[0].napi.weight) {
> > - if (dev->flags & IFF_UP)
> > - return -EBUSY;
> > - else
> > - update_napi = true;
> > - }
> > -
> > - if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_NOTF_COAL))
> > + if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_NOTF_COAL)) {
> >   ret = virtnet_send_notf_coal_cmds(vi, ec);
> > - else
> > + } else {
> > + /* Can't change NAPI weight if the link is up */
> > + napi_weight = ec->tx_max_coalesced_frames ?
> > + NAPI_POLL_WEIGHT : 0;
> > + if (napi_weight ^ vi->sq[0].napi.weight) {
> > + if (dev->flags & IFF_UP)
> > + return -EBUSY;
> > + else
> > + update_napi = true;
> > + }
> > +
> >   ret = virtnet_coal_params_supported(ec);
> > + }
> >
> >   if (ret)
> >   return ret;
> > --
> > 2.17.1
> >
>


Re: [PATCH] vp_vdpa: Check queue number of vdpa device from add_config

2023-06-06 Thread Jason Wang
On Tue, Jun 6, 2023 at 2:19 PM Angus Chen  wrote:
>
> Hi,Jason.
>
> > -Original Message-
> > From: Jason Wang 
> > Sent: Monday, June 5, 2023 2:54 PM
> > To: Angus Chen 
> > Cc: m...@redhat.com; virtualization@lists.linux-foundation.org;
> > linux-ker...@vger.kernel.org
> > Subject: Re: [PATCH] vp_vdpa: Check queue number of vdpa device from
> > add_config
> >
> > On Fri, Jun 2, 2023 at 3:35 PM Angus Chen 
> > wrote:
> > >
> > > When adding a virtio_pci vdpa device, check the number of vqs from
> > > the device cap and max_vq_pairs from add_config.
> > >
> > > Signed-off-by: Angus Chen 
> > > ---
> > >  drivers/vdpa/virtio_pci/vp_vdpa.c | 11 +--
> > >  1 file changed, 9 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/drivers/vdpa/virtio_pci/vp_vdpa.c
> > b/drivers/vdpa/virtio_pci/vp_vdpa.c
> > > index 281287fae89f..4bf1ab637d32 100644
> > > --- a/drivers/vdpa/virtio_pci/vp_vdpa.c
> > > +++ b/drivers/vdpa/virtio_pci/vp_vdpa.c
> > > @@ -478,7 +478,7 @@ static int vp_vdpa_dev_add(struct vdpa_mgmt_dev
> > *v_mdev, const char *name,
> > > struct device *dev = &pdev->dev;
> > > struct vp_vdpa *vp_vdpa = NULL;
> > > u64 device_features;
> > > -   int ret, i;
> > > +   int ret, i, queues;
> > >
> > > vp_vdpa = vdpa_alloc_device(struct vp_vdpa, vdpa,
> > > dev, &vp_vdpa_ops, 1, 1, name,
> > false);
> > > @@ -491,7 +491,14 @@ static int vp_vdpa_dev_add(struct vdpa_mgmt_dev
> > *v_mdev, const char *name,
> > > vp_vdpa_mgtdev->vp_vdpa = vp_vdpa;
> > >
> > > vp_vdpa->vdpa.dma_dev = &pdev->dev;
> > > -   vp_vdpa->queues = vp_modern_get_num_queues(mdev);
> > > +   queues = vp_modern_get_num_queues(mdev);
> > > +   if (add_config->mask &
> > BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MAX_VQP)) {
> > > +   if (add_config->net.max_vq_pairs > queues / 2)
> > > +   return -EINVAL;
> > > +   queues = min_t(u32, queues, 2 *
> > add_config->net.max_vq_pairs);
> >
> > Looks like you want to mediate the max_vqp here, but what happens:
> >
> > 1) the hardware has 4 queue pairs
> > 2) vp_vdpa caps it to 2 queue pairs
> > 3) the guest may still try to enable 4 queue pairs
> >
> Yes, you are right, this situation can occur.
> > For 3), the kernel would need to mediate the control virtqueue, which
> > seems not easy.
> >
> > How about simply failing if the provisioned #qp is not equal to the
> > one the hardware has?
> Ok, you mean we just check it in vp_vdpa, or check it in all other
> vdpa net drivers?

vp_vdpa only, since in some other kinds of parents the #qps could be
provisioned.
Thanks

> >
> > Thanks
> >
> > > +   }
> > > +
> > > +   vp_vdpa->queues = queues;
> > > vp_vdpa->mdev = mdev;
> > >
> > > device_features = vp_modern_get_features(mdev);
> > > --
> > > 2.25.1
> > >
>


Re: [PATCH net-next 2/5] virtio_net: Add page_pool support to improve performance

2023-06-07 Thread Jason Wang
On Thu, Jun 8, 2023 at 4:17 AM Michael S. Tsirkin  wrote:
>
> On Wed, Jun 07, 2023 at 05:08:59PM +0800, Liang Chen wrote:
> > On Tue, May 30, 2023 at 9:19 AM Liang Chen  
> > wrote:
> > >
> > > On Mon, May 29, 2023 at 5:55 PM Michael S. Tsirkin  
> > > wrote:
> > > >
> > > > On Mon, May 29, 2023 at 03:27:56PM +0800, Liang Chen wrote:
> > > > > On Sun, May 28, 2023 at 2:20 PM Michael S. Tsirkin  
> > > > > wrote:
> > > > > >
> > > > > > On Fri, May 26, 2023 at 01:46:18PM +0800, Liang Chen wrote:
> > > > > > > The implementation at the moment uses one page per packet in both 
> > > > > > > the
> > > > > > > normal and XDP path. In addition, introducing a module parameter 
> > > > > > > to enable
> > > > > > > or disable the usage of page pool (disabled by default).
> > > > > > >
> > > > > > > In single-core vm testing environments, it gives a modest 
> > > > > > > performance gain
> > > > > > > in the normal path.
> > > > > > >   Upstream codebase: 47.5 Gbits/sec
> > > > > > >   Upstream codebase + page_pool support: 50.2 Gbits/sec
> > > > > > >
> > > > > > > In multi-core vm testing environments, The most significant 
> > > > > > > performance
> > > > > > > gain is observed in XDP cpumap:
> > > > > > >   Upstream codebase: 1.38 Gbits/sec
> > > > > > >   Upstream codebase + page_pool support: 9.74 Gbits/sec
> > > > > > >
> > > > > > > With this foundation, we can further integrate page pool 
> > > > > > > fragmentation and
> > > > > > > DMA map/unmap support.
> > > > > > >
> > > > > > > Signed-off-by: Liang Chen 
> > > > > >
> > > > > > Why off by default?
> > > > > > I am guessing it sometimes has performance costs too?
> > > > > >
> > > > > >
> > > > > > What happens if we use page pool for big mode too?
> > > > > > The less modes we have the better...
> > > > > >
> > > > > >
> > > > >
> > > > > Sure, now I believe it makes sense to enable it by default. When the
> > > > > packet size is very small, it reduces the likelihood of skb
> > > > > coalescing. But such cases are rare.
> > > >
> > > > small packets are rare? These workloads are easy to create actually.
> > > > Pls try and include benchmark with small packet size.
> > > >
> > >
> > > Sure, Thanks!
> >
> > Before going ahead and posting v2 patch, I would like to hear more
> > advice for the cases of small packets. I have done more performance
> > benchmark with small packets since then. Here is a list of iperf
> > output,
> >
> > With PP and PP fragmenting:
> > 256K:   [  5] 505.00-510.00 sec  1.34 GBytes  2.31 Gbits/sec0144 
> > KBytes
> > 1K:   [  5]  30.00-35.00  sec  4.63 GBytes  7.95 Gbits/sec0
> > 223 KBytes
> > 2K:   [  5]  65.00-70.00  sec  8.33 GBytes  14.3 Gbits/sec0
> > 324 KBytes
> > 4K:   [  5]  30.00-35.00  sec  13.3 GBytes  22.8 Gbits/sec0
> > 1.08 MBytes
> > 8K:   [  5]  50.00-55.00  sec  18.9 GBytes  32.4 Gbits/sec0
> > 744 KBytes
> > 16K: [  5]  25.00-30.00  sec  24.6 GBytes  42.3 Gbits/sec0963 
> > KBytes
> > 32K: [  5]  45.00-50.00  sec  29.8 GBytes  51.2 Gbits/sec0   1.25 
> > MBytes
> > 64K: [  5]  35.00-40.00  sec  34.0 GBytes  58.4 Gbits/sec0   1.70 
> > MBytes
> > 128K:   [  5]  45.00-50.00  sec  36.7 GBytes  63.1 Gbits/sec0   4.26 
> > MBytes
> > 256K:   [  5]  30.00-35.00  sec  40.0 GBytes  68.8 Gbits/sec0   3.20 
> > MBytes

Note that the virtio-net driver lacks things like BQL, so TCP
performance might suffer from bufferbloat. Would you mind measuring
with e.g. testpmd on the vhost side to see the rx PPS?

> >
> > Without PP:
> > 256: [  5] 680.00-685.00 sec  1.57 GBytes  2.69 Gbits/sec0359 
> > KBytes
> > 1K:  [  5]  75.00-80.00  sec  5.47 GBytes  9.40 Gbits/sec0730 
> > KBytes
> > 2K:  [  5]  65.00-70.00  sec  9.46 GBytes  16.2 Gbits/sec0   1.99 
> > MBytes
> > 4K:  [  5]  30.00-35.00  sec  14.5 GBytes  25.0 Gbits/sec0   1.20 
> > MBytes
> > 8K:  [  5]  45.00-50.00  sec  19.9 GBytes  34.1 Gbits/sec0   1.72 
> > MBytes
> > 16K:[  5]   5.00-10.00  sec  23.8 GBytes  40.9 Gbits/sec0   2.90 
> > MBytes
> > 32K:[  5]  15.00-20.00  sec  28.0 GBytes  48.1 Gbits/sec0   3.03 
> > MBytes
> > 64K:[  5]  60.00-65.00  sec  31.8 GBytes  54.6 Gbits/sec0   3.05 
> > MBytes
> > 128K:  [  5]  45.00-50.00  sec  33.0 GBytes  56.6 Gbits/sec1   3.03 
> > MBytes
> > 256K:  [  5]  25.00-30.00  sec  34.7 GBytes  59.6 Gbits/sec0   3.11 
> > MBytes
> >
> >
> > The major factor contributing to the performance drop is the reduction
> > of skb coalescing. Additionally, without the page pool, small packets
> > can still benefit from the allocation of 8 continuous pages by
> > breaking them down into smaller pieces. This effectively reduces the
> > frequency of page allocation from the buddy system. For instance, the
> > arrival of 32 1K packets only triggers one alloc_page call. Therefore,
> > the benefits of using a page pool are limited in such cases.

I wonder if we can imp

Re: [PATCH] vhost-vdpa: filter VIRTIO_F_RING_PACKED feature

2023-06-07 Thread Jason Wang
On Wed, Jun 7, 2023 at 5:43 PM Michael S. Tsirkin  wrote:
>
> On Wed, Jun 07, 2023 at 10:39:15AM +0200, Stefano Garzarella wrote:
> > On Tue, Jun 6, 2023 at 2:58 PM Michael S. Tsirkin  wrote:
> > >
> > > On Tue, Jun 06, 2023 at 09:29:22AM +0800, Jason Wang wrote:
> > > > On Mon, Jun 5, 2023 at 10:58 PM Stefano Garzarella 
> > > >  wrote:
> > > > >
> > > > > On Mon, Jun 05, 2023 at 09:54:57AM -0400, Michael S. Tsirkin wrote:
> > > > > >On Mon, Jun 05, 2023 at 03:30:35PM +0200, Stefano Garzarella wrote:
> > > > > >> On Mon, Jun 05, 2023 at 09:00:25AM -0400, Michael S. Tsirkin wrote:
> > > > > >> > On Mon, Jun 05, 2023 at 02:54:20PM +0200, Stefano Garzarella 
> > > > > >> > wrote:
> > > > > >> > > On Mon, Jun 05, 2023 at 08:41:54AM -0400, Michael S. Tsirkin 
> > > > > >> > > wrote:
> > > > > >> > > > On Mon, Jun 05, 2023 at 01:06:44PM +0200, Stefano Garzarella 
> > > > > >> > > > wrote:
> > > > > >> > > > > vhost-vdpa IOCTLs (eg. VHOST_GET_VRING_BASE, 
> > > > > >> > > > > VHOST_SET_VRING_BASE)
> > > > > >> > > > > don't support packed virtqueue well yet, so let's filter 
> > > > > >> > > > > the
> > > > > >> > > > > VIRTIO_F_RING_PACKED feature for now in 
> > > > > >> > > > > vhost_vdpa_get_features().
> > > > > >> > > > >
> > > > > >> > > > > This way, even if the device supports it, we don't risk it 
> > > > > >> > > > > being
> > > > > >> > > > > negotiated, then the VMM is unable to set the vring state 
> > > > > >> > > > > properly.
> > > > > >> > > > >
> > > > > >> > > > > Fixes: 4c8cf31885f6 ("vhost: introduce vDPA-based backend")
> > > > > >> > > > > Cc: sta...@vger.kernel.org
> > > > > >> > > > > Signed-off-by: Stefano Garzarella 
> > > > > >> > > > > ---
> > > > > >> > > > >
> > > > > >> > > > > Notes:
> > > > > >> > > > > This patch should be applied before the "[PATCH v2 
> > > > > >> > > > > 0/3] vhost_vdpa:
> > > > > >> > > > > better PACKED support" series [1] and backported in 
> > > > > >> > > > > stable branches.
> > > > > >> > > > >
> > > > > >> > > > > We can revert it when we are sure that everything is 
> > > > > >> > > > > working with
> > > > > >> > > > > packed virtqueues.
> > > > > >> > > > >
> > > > > >> > > > > Thanks,
> > > > > >> > > > > Stefano
> > > > > >> > > > >
> > > > > >> > > > > [1] 
> > > > > >> > > > > https://lore.kernel.org/virtualization/20230424225031.18947-1-shannon.nel...@amd.com/
> > > > > >> > > >
> > > > > >> > > > I'm a bit lost here. So why am I merging "better PACKED 
> > > > > >> > > > support" then?
> > > > > >> > >
> > > > > >> > > To really support packed virtqueue with vhost-vdpa, at that 
> > > > > >> > > point we would
> > > > > >> > > also have to revert this patch.
> > > > > >> > >
> > > > > >> > > I wasn't sure if you wanted to queue the series for this merge 
> > > > > >> > > window.
> > > > > >> > > In that case do you think it is better to send this patch only 
> > > > > >> > > for stable
> > > > > >> > > branches?
> > > > > >> > > > Does this patch make them a NOP?
> > > > > >> > >
> > > > > >> > > Yep, after applying the "better PACKED support" series and 
> > > > >

Re: [PATCH] vhost-vdpa: filter VIRTIO_F_RING_PACKED feature

2023-06-08 Thread Jason Wang
On Thu, Jun 8, 2023 at 2:03 PM Michael S. Tsirkin  wrote:
>
> On Thu, Jun 08, 2023 at 08:42:15AM +0800, Jason Wang wrote:
> > On Wed, Jun 7, 2023 at 5:43 PM Michael S. Tsirkin  wrote:
> > >
> > > On Wed, Jun 07, 2023 at 10:39:15AM +0200, Stefano Garzarella wrote:
> > > > On Tue, Jun 6, 2023 at 2:58 PM Michael S. Tsirkin  
> > > > wrote:
> > > > >
> > > > > On Tue, Jun 06, 2023 at 09:29:22AM +0800, Jason Wang wrote:
> > > > > > On Mon, Jun 5, 2023 at 10:58 PM Stefano Garzarella 
> > > > > >  wrote:
> > > > > > >
> > > > > > > On Mon, Jun 05, 2023 at 09:54:57AM -0400, Michael S. Tsirkin 
> > > > > > > wrote:
> > > > > > > >On Mon, Jun 05, 2023 at 03:30:35PM +0200, Stefano Garzarella 
> > > > > > > >wrote:
> > > > > > > >> On Mon, Jun 05, 2023 at 09:00:25AM -0400, Michael S. Tsirkin 
> > > > > > > >> wrote:
> > > > > > > >> > On Mon, Jun 05, 2023 at 02:54:20PM +0200, Stefano Garzarella 
> > > > > > > >> > wrote:
> > > > > > > >> > > On Mon, Jun 05, 2023 at 08:41:54AM -0400, Michael S. 
> > > > > > > >> > > Tsirkin wrote:
> > > > > > > >> > > > On Mon, Jun 05, 2023 at 01:06:44PM +0200, Stefano 
> > > > > > > >> > > > Garzarella wrote:
> > > > > > > >> > > > > vhost-vdpa IOCTLs (eg. VHOST_GET_VRING_BASE, 
> > > > > > > >> > > > > VHOST_SET_VRING_BASE)
> > > > > > > >> > > > > don't support packed virtqueue well yet, so let's 
> > > > > > > >> > > > > filter the
> > > > > > > >> > > > > VIRTIO_F_RING_PACKED feature for now in 
> > > > > > > >> > > > > vhost_vdpa_get_features().
> > > > > > > >> > > > >
> > > > > > > >> > > > > This way, even if the device supports it, we don't 
> > > > > > > >> > > > > risk it being
> > > > > > > >> > > > > negotiated, then the VMM is unable to set the vring 
> > > > > > > >> > > > > state properly.
> > > > > > > >> > > > >
> > > > > > > >> > > > > Fixes: 4c8cf31885f6 ("vhost: introduce vDPA-based 
> > > > > > > >> > > > > backend")
> > > > > > > >> > > > > Cc: sta...@vger.kernel.org
> > > > > > > >> > > > > Signed-off-by: Stefano Garzarella 
> > > > > > > >> > > > > ---
> > > > > > > >> > > > >
> > > > > > > >> > > > > Notes:
> > > > > > > >> > > > > This patch should be applied before the "[PATCH v2 
> > > > > > > >> > > > > 0/3] vhost_vdpa:
> > > > > > > >> > > > > better PACKED support" series [1] and backported 
> > > > > > > >> > > > > in stable branches.
> > > > > > > >> > > > >
> > > > > > > >> > > > > We can revert it when we are sure that everything 
> > > > > > > >> > > > > is working with
> > > > > > > >> > > > > packed virtqueues.
> > > > > > > >> > > > >
> > > > > > > >> > > > > Thanks,
> > > > > > > >> > > > > Stefano
> > > > > > > >> > > > >
> > > > > > > >> > > > > [1] 
> > > > > > > >> > > > > https://lore.kernel.org/virtualization/20230424225031.18947-1-shannon.nel...@amd.com/
> > > > > > > >> > > >
> > > > > > > >> > > > I'm a bit lost here. So why am I merging "better PACKED 
> > > > > > > >> > > > support" then?
> > > > > > > >> > >
> > > > > > > >> > > To really support pa

Re: [PATCH] vhost-vdpa: filter VIRTIO_F_RING_PACKED feature

2023-06-08 Thread Jason Wang
On Thu, Jun 8, 2023 at 4:00 PM Stefano Garzarella  wrote:
>
> On Thu, Jun 08, 2023 at 03:46:00PM +0800, Jason Wang wrote:
>
> [...]
>
> >> > > > > I have a question though, what if down the road there
> >> > > > > is a new feature that needs more changes? It will be
> >> > > > > broken too just like PACKED no?
> >> > > > > Shouldn't vdpa have an allowlist of features it knows how
> >> > > > > to support?
> >> > > >
> >> > > > It looks like we had it, but we took it out (by the way, we were
> >> > > > enabling packed even though we didn't support it):
> >> > > > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6234f80574d7569444d8718355fa2838e92b158b
> >> > > >
> >> > > > The only problem I see is that for each new feature we have to modify
> >> > > > the kernel.
> >> > > > Could we have new features that don't require handling by vhost-vdpa?
> >> > > >
> >> > > > Thanks,
> >> > > > Stefano
> >> > >
> >> > > Jason what do you say to reverting this?
> >> >
> >> > I may miss something but I don't see any problem with vDPA core.
> >> >
> >> > It's the duty of the parents to advertise the features it has. For 
> >> > example,
> >> >
> >> > 1) If some kernel version that is packed is not supported via
> >> > set_vq_state, parents should not advertise PACKED features in this
> >> > case.
> >> > 2) If the kernel has support packed set_vq_state(), but it's emulated
> >> > cvq doesn't support, parents should not advertise PACKED as well
> >> >
> >> > If a parent violates the above 2, it looks like a bug of the parents.
> >> >
> >> > Thanks
> >>
> >> Yes but what about vhost_vdpa? Talking about that not the core.
> >
> >Not sure it's a good idea to workaround parent bugs via vhost-vDPA.
>
> Sorry, I'm getting lost...
> We were talking about the fact that vhost-vdpa doesn't handle
> SET_VRING_BASE/GET_VRING_BASE ioctls well for packed virtqueue before
> that series [1], no?
>
> The parents seem okay, but maybe I missed a few things.
>
> [1] 
> https://lore.kernel.org/virtualization/20230424225031.18947-1-shannon.nel...@amd.com/

Yes, more below.

>
> >
> >> Should that not have a whitelist of features
> >> since it interprets ioctls differently depending on this?
> >
> >If there's a bug, it might only matter the following setup:
> >
> >SET_VRING_BASE/GET_VRING_BASE + VDUSE.
> >
> >This seems to be broken since VDUSE was introduced. If we really want
> >to backport something, it could be a fix to filter out PACKED in
> >VDUSE?
>
> mmm it doesn't seem to be a problem in VDUSE, but in vhost-vdpa.
> I think VDUSE works fine with packed virtqueue using virtio-vdpa
> (I haven't tried), so why should we filter PACKED in VDUSE?

I don't think we need any filtering since:

The PACKED feature has been advertised to userspace via the uAPI since
commit 6234f80574d7569444d8718355fa2838e92b158b. Once we relax the
uAPI, it is very hard to restrict it again. For userspace that tries
to negotiate PACKED:

1) if it doesn't use SET_VRING_BASE/GET_VRING_BASE, everything works well
2) if it uses SET_VRING_BASE/GET_VRING_BASE, it might fail or break silently

If we backport the fixes to -stable, we may break applications at
least in case 1).
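
To make case 2) concrete, a sketch of the userspace side follows. The
ioctl and struct come from <linux/vhost.h>; vhost_vdpa_fd (an already
open /dev/vhost-vdpa-N descriptor) is assumed:

#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/vhost.h>

struct vhost_vring_state state = { .index = 0 };

/* On a kernel without packed-ring support in these ioctls, this
 * either fails or returns a value that is meaningless for a packed
 * virtqueue. */
if (ioctl(vhost_vdpa_fd, VHOST_GET_VRING_BASE, &state) < 0)
        perror("VHOST_GET_VRING_BASE");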

Thanks

>
> Thanks,
> Stefano
>


Re: [PATCH] vhost-vdpa: filter VIRTIO_F_RING_PACKED feature

2023-06-08 Thread Jason Wang
On Thu, Jun 8, 2023 at 5:21 PM Stefano Garzarella  wrote:
>
> On Thu, Jun 08, 2023 at 05:00:00PM +0800, Jason Wang wrote:
> >On Thu, Jun 8, 2023 at 4:00 PM Stefano Garzarella  
> >wrote:
> >>
> >> On Thu, Jun 08, 2023 at 03:46:00PM +0800, Jason Wang wrote:
> >>
> >> [...]
> >>
> >> >> > > > > I have a question though, what if down the road there
> >> >> > > > > is a new feature that needs more changes? It will be
> >> >> > > > > broken too just like PACKED no?
> >> >> > > > > Shouldn't vdpa have an allowlist of features it knows how
> >> >> > > > > to support?
> >> >> > > >
> >> >> > > > It looks like we had it, but we took it out (by the way, we were
> >> >> > > > enabling packed even though we didn't support it):
> >> >> > > > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6234f80574d7569444d8718355fa2838e92b158b
> >> >> > > >
> >> >> > > > The only problem I see is that for each new feature we have to 
> >> >> > > > modify
> >> >> > > > the kernel.
> >> >> > > > Could we have new features that don't require handling by 
> >> >> > > > vhost-vdpa?
> >> >> > > >
> >> >> > > > Thanks,
> >> >> > > > Stefano
> >> >> > >
> >> >> > > Jason what do you say to reverting this?
> >> >> >
> >> >> > I may miss something but I don't see any problem with vDPA core.
> >> >> >
> >> >> > It's the duty of the parents to advertise the features it has. For 
> >> >> > example,
> >> >> >
> >> >> > 1) If some kernel version that is packed is not supported via
> >> >> > set_vq_state, parents should not advertise PACKED features in this
> >> >> > case.
> >> >> > 2) If the kernel has support packed set_vq_state(), but it's emulated
> >> >> > cvq doesn't support, parents should not advertise PACKED as well
> >> >> >
> >> >> > If a parent violates the above 2, it looks like a bug of the parents.
> >> >> >
> >> >> > Thanks
> >> >>
> >> >> Yes but what about vhost_vdpa? Talking about that not the core.
> >> >
> >> >Not sure it's a good idea to workaround parent bugs via vhost-vDPA.
> >>
> >> Sorry, I'm getting lost...
> >> We were talking about the fact that vhost-vdpa doesn't handle
> >> SET_VRING_BASE/GET_VRING_BASE ioctls well for packed virtqueue before
> >> that series [1], no?
> >>
> >> The parents seem okay, but maybe I missed a few things.
> >>
> >> [1] 
> >> https://lore.kernel.org/virtualization/20230424225031.18947-1-shannon.nel...@amd.com/
> >
> >Yes, more below.
> >
> >>
> >> >
> >> >> Should that not have a whitelist of features
> >> >> since it interprets ioctls differently depending on this?
> >> >
> >> >If there's a bug, it might only matter the following setup:
> >> >
> >> >SET_VRING_BASE/GET_VRING_BASE + VDUSE.
> >> >
> >> >This seems to be broken since VDUSE was introduced. If we really want
> >> >to backport something, it could be a fix to filter out PACKED in
> >> >VDUSE?
> >>
> >> mmm it doesn't seem to be a problem in VDUSE, but in vhost-vdpa.
> >> I think VDUSE works fine with packed virtqueue using virtio-vdpa
> >> (I haven't tried), so why should we filter PACKED in VDUSE?
> >
> >I don't think we need any filtering since:
> >
> >The PACKED feature has been advertised to userspace via the uAPI since
> >commit 6234f80574d7569444d8718355fa2838e92b158b. Once we relax the
> >uAPI, it is very hard to restrict it again. For userspace that tries
> >to negotiate PACKED:
> >
> >1) if it doesn't use SET_VRING_BASE/GET_VRING_BASE, everything works well
> >2) if it uses SET_VRING_BASE/GET_VRING_BASE, it might fail or break silently
> >
> >If we backport the fixes to -stable, we may break applications at
> >least in case 1).
>
> Okay, I see now, thanks for the details!
>
> Maybe instead of "break silently", we can return an explicit error for
> SET_VRING_BASE/GET_VRING_BASE in stable branches.
> But if there are not many cases, we can leave it like that.

On second thought, if we need to do something for stable, is it better
to just backport Shannon's series to stable?

>
> I was just concerned about how does the user space understand that it
> can use SET_VRING_BASE/GET_VRING_BASE for PACKED virtqueues in a given
> kernel or not.

My understanding is that if packed is advertised, the application
should assume SET/GET_VRING_BASE work.
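
As a sketch of that contract from the application side (again assuming
an open vhost_vdpa_fd; the feature bit is from <linux/virtio_config.h>):

#include <sys/ioctl.h>
#include <linux/vhost.h>
#include <linux/virtio_config.h>

__u64 features = 0;

if (ioctl(vhost_vdpa_fd, VHOST_GET_FEATURES, &features) == 0 &&
    (features & (1ULL << VIRTIO_F_RING_PACKED))) {
        /* Packed is advertised, so per the reading above the
         * application may assume SET/GET_VRING_BASE work. */
}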

Thanks

>
> Thanks,
> Stefano
>


Re: [PATCH] vhost-vdpa: filter VIRTIO_F_RING_PACKED feature

2023-06-08 Thread Jason Wang
On Thu, Jun 8, 2023 at 10:23 PM Michael S. Tsirkin  wrote:
>
> On Thu, Jun 08, 2023 at 05:29:58PM +0800, Jason Wang wrote:
> > On Thu, Jun 8, 2023 at 5:21 PM Stefano Garzarella  
> > wrote:
> > >
> > > On Thu, Jun 08, 2023 at 05:00:00PM +0800, Jason Wang wrote:
> > > >On Thu, Jun 8, 2023 at 4:00 PM Stefano Garzarella  
> > > >wrote:
> > > >>
> > > >> On Thu, Jun 08, 2023 at 03:46:00PM +0800, Jason Wang wrote:
> > > >>
> > > >> [...]
> > > >>
> > > >> >> > > > > I have a question though, what if down the road there
> > > >> >> > > > > is a new feature that needs more changes? It will be
> > > >> >> > > > > broken too just like PACKED no?
> > > >> >> > > > > Shouldn't vdpa have an allowlist of features it knows how
> > > >> >> > > > > to support?
> > > >> >> > > >
> > > >> >> > > > It looks like we had it, but we took it out (by the way, we 
> > > >> >> > > > were
> > > >> >> > > > enabling packed even though we didn't support it):
> > > >> >> > > > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6234f80574d7569444d8718355fa2838e92b158b
> > > >> >> > > >
> > > >> >> > > > The only problem I see is that for each new feature we have 
> > > >> >> > > > to modify
> > > >> >> > > > the kernel.
> > > >> >> > > > Could we have new features that don't require handling by 
> > > >> >> > > > vhost-vdpa?
> > > >> >> > > >
> > > >> >> > > > Thanks,
> > > >> >> > > > Stefano
> > > >> >> > >
> > > >> >> > > Jason what do you say to reverting this?
> > > >> >> >
> > > >> >> > I may miss something but I don't see any problem with vDPA core.
> > > >> >> >
> > > >> >> > It's the duty of the parents to advertise the features it has. 
> > > >> >> > For example,
> > > >> >> >
> > > >> >> > 1) If some kernel version that is packed is not supported via
> > > >> >> > set_vq_state, parents should not advertise PACKED features in this
> > > >> >> > case.
> > > >> >> > 2) If the kernel has support packed set_vq_state(), but it's 
> > > >> >> > emulated
> > > >> >> > cvq doesn't support, parents should not advertise PACKED as well
> > > >> >> >
> > > >> >> > If a parent violates the above 2, it looks like a bug of the 
> > > >> >> > parents.
> > > >> >> >
> > > >> >> > Thanks
> > > >> >>
> > > >> >> Yes but what about vhost_vdpa? Talking about that not the core.
> > > >> >
> > > >> >Not sure it's a good idea to workaround parent bugs via vhost-vDPA.
> > > >>
> > > >> Sorry, I'm getting lost...
> > > >> We were talking about the fact that vhost-vdpa doesn't handle
> > > >> SET_VRING_BASE/GET_VRING_BASE ioctls well for packed virtqueue before
> > > >> that series [1], no?
> > > >>
> > > >> The parents seem okay, but maybe I missed a few things.
> > > >>
> > > >> [1] 
> > > >> https://lore.kernel.org/virtualization/20230424225031.18947-1-shannon.nel...@amd.com/
> > > >
> > > >Yes, more below.
> > > >
> > > >>
> > > >> >
> > > >> >> Should that not have a whitelist of features
> > > >> >> since it interprets ioctls differently depending on this?
> > > >> >
> > > >> >If there's a bug, it might only matter the following setup:
> > > >> >
> > > >> >SET_VRING_BASE/GET_VRING_BASE + VDUSE.
> > > >> >
> > > >> >This seems to be broken since VDUSE was introduced. If we really want
> > > >> >to backport something, it could be a fix to filter out P

Re: [PATCH v2] vdpa/vp_vdpa: Check queue number of vdpa device from add_config

2023-06-08 Thread Jason Wang
On Fri, Jun 9, 2023 at 3:45 AM Michael S. Tsirkin  wrote:
>
> On Thu, Jun 08, 2023 at 05:01:24PM +0800, Angus Chen wrote:
> > When adding a virtio_pci vdpa device, check the number of vqs from
> > the device cap and max_vq_pairs from add_config.
> > Simply fail if the provisioned #qp is not equal to the one the
> > hardware has.

I think I kind of agree with Michael; I don't see any obvious
advantage to allowing userspace to configure max_vqp if it can't be
provisioned dynamically. What's wrong with just sticking to the
current approach that doesn't accept max_vqp?

A better approach is to tweak the vdpa tool to display the legal
attributes that can be provisioned.

> >
> > Signed-off-by: Angus Chen 
>
> I am not sure about this one. How does userspace know
> which values are legal?

vdpa mgmtdev show can give hints like:

max_supported_vqs 3

>
> If there's no way then maybe we should just cap the value
> to what device can support but otherwise keep the device
> working.

This seems to conflict with what other drivers (like mlx5) do:

if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MAX_VQP)) {
	if (add_config->net.max_vq_pairs > max_vqs / 2)
		return -EINVAL;
	max_vqs = min_t(u32, max_vqs, 2 * add_config->net.max_vq_pairs);
} else {
	max_vqs = 2;
}

Thanks

>
> > ---
> > v1: Use max_vqs from add_config
> > v2: Just return fail if max_vqs from add_config is not same as device
> >   cap. Suggested by jason.
> >
> >  drivers/vdpa/virtio_pci/vp_vdpa.c | 35 ++-
> >  1 file changed, 21 insertions(+), 14 deletions(-)
> >
> > diff --git a/drivers/vdpa/virtio_pci/vp_vdpa.c 
> > b/drivers/vdpa/virtio_pci/vp_vdpa.c
> > index 281287fae89f..c1fb6963da12 100644
> > --- a/drivers/vdpa/virtio_pci/vp_vdpa.c
> > +++ b/drivers/vdpa/virtio_pci/vp_vdpa.c
> > @@ -480,32 +480,39 @@ static int vp_vdpa_dev_add(struct vdpa_mgmt_dev 
> > *v_mdev, const char *name,
> >   u64 device_features;
> >   int ret, i;
> >
> > - vp_vdpa = vdpa_alloc_device(struct vp_vdpa, vdpa,
> > - dev, &vp_vdpa_ops, 1, 1, name, false);
> > -
> > - if (IS_ERR(vp_vdpa)) {
> > - dev_err(dev, "vp_vdpa: Failed to allocate vDPA structure\n");
> > - return PTR_ERR(vp_vdpa);
> > + if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MAX_VQP)) {
> > + if (add_config->net.max_vq_pairs != 
> > (v_mdev->max_supported_vqs / 2)) {
> > + dev_err(&pdev->dev, "max vqs 0x%x should be equal to 
> > 0x%x which device has\n",
> > + add_config->net.max_vq_pairs*2, 
> > v_mdev->max_supported_vqs);
> > + return -EINVAL;
> > + }
> >   }
> >
> > - vp_vdpa_mgtdev->vp_vdpa = vp_vdpa;
> > -
> > - vp_vdpa->vdpa.dma_dev = &pdev->dev;
> > - vp_vdpa->queues = vp_modern_get_num_queues(mdev);
> > - vp_vdpa->mdev = mdev;
> > -
> >   device_features = vp_modern_get_features(mdev);
> >   if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES)) {
> >   if (add_config->device_features & ~device_features) {
> > - ret = -EINVAL;
> >   dev_err(&pdev->dev, "Try to provision features "
> >   "that are not supported by the device: "
> >   "device_features 0x%llx provisioned 0x%llx\n",
> >   device_features, add_config->device_features);
> > - goto err;
> > + return -EINVAL;
> >   }
> >   device_features = add_config->device_features;
> >   }
> > +
> > + vp_vdpa = vdpa_alloc_device(struct vp_vdpa, vdpa,
> > + dev, &vp_vdpa_ops, 1, 1, name, false);
> > +
> > + if (IS_ERR(vp_vdpa)) {
> > + dev_err(dev, "vp_vdpa: Failed to allocate vDPA structure\n");
> > + return PTR_ERR(vp_vdpa);
> > + }
> > +
> > + vp_vdpa_mgtdev->vp_vdpa = vp_vdpa;
> > +
> > + vp_vdpa->vdpa.dma_dev = &pdev->dev;
> > + vp_vdpa->queues = v_mdev->max_supported_vqs;
> > + vp_vdpa->mdev = mdev;
> >   vp_vdpa->device_features = device_features;
> >
> >   ret = devm_add_action_or_reset(dev, vp_vdpa_free_irq_vectors, pdev);
> > --
> > 2.25.1
>


Re: [PATCH] vdpa/mlx5: Support interrupt bypassing

2023-06-08 Thread Jason Wang
On Thu, Jun 8, 2023 at 3:01 AM Dragos Tatulea  wrote:
>
> From: Eli Cohen 
>
> Add support for generation of interrupts from the device directly to
> the VM's vCPU, thus avoiding the overhead on the host CPU.
>
> When supported, the driver will attempt to allocate vectors for each
> data virtqueue. If a vector for a virtqueue cannot be provided it will
> use the QP mode where notifications go through the driver.
>
> In addition, we add a shutdown callback to make sure allocated
> interrupts are released in case of shutdown to allow clean shutdown.
>
> Signed-off-by: Eli Cohen 
> Signed-off-by: Saeed Mahameed 

Acked-by: Jason Wang 

Thanks

> ---
>  drivers/vdpa/mlx5/net/mlx5_vnet.c | 165 --
>  drivers/vdpa/mlx5/net/mlx5_vnet.h |  15 +++
>  2 files changed, 171 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
> b/drivers/vdpa/mlx5/net/mlx5_vnet.c
> index 279ac6a558d2..9138ef2fb2c8 100644
> --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
> +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
> @@ -83,6 +83,7 @@ struct mlx5_vq_restore_info {
> u64 driver_addr;
> u16 avail_index;
> u16 used_index;
> +   struct msi_map map;
> bool ready;
> bool restore;
>  };
> @@ -118,6 +119,7 @@ struct mlx5_vdpa_virtqueue {
> u16 avail_idx;
> u16 used_idx;
> int fw_state;
> +   struct msi_map map;
>
> /* keep last in the struct */
> struct mlx5_vq_restore_info ri;
> @@ -808,6 +810,13 @@ static bool counters_supported(const struct 
> mlx5_vdpa_dev *mvdev)
>BIT_ULL(MLX5_OBJ_TYPE_VIRTIO_Q_COUNTERS);
>  }
>
> +static bool msix_mode_supported(struct mlx5_vdpa_dev *mvdev)
> +{
> +   return MLX5_CAP_DEV_VDPA_EMULATION(mvdev->mdev, event_mode) &
> +   (1 << MLX5_VIRTIO_Q_EVENT_MODE_MSIX_MODE) &&
> +   pci_msix_can_alloc_dyn(mvdev->mdev->pdev);
> +}
> +
>  static int create_virtqueue(struct mlx5_vdpa_net *ndev, struct 
> mlx5_vdpa_virtqueue *mvq)
>  {
> int inlen = MLX5_ST_SZ_BYTES(create_virtio_net_q_in);
> @@ -849,9 +858,15 @@ static int create_virtqueue(struct mlx5_vdpa_net *ndev, 
> struct mlx5_vdpa_virtque
> if (vq_is_tx(mvq->index))
> MLX5_SET(virtio_net_q_object, obj_context, tisn_or_qpn, 
> ndev->res.tisn);
>
> -   MLX5_SET(virtio_q, vq_ctx, event_mode, 
> MLX5_VIRTIO_Q_EVENT_MODE_QP_MODE);
> +   if (mvq->map.virq) {
> +   MLX5_SET(virtio_q, vq_ctx, event_mode, 
> MLX5_VIRTIO_Q_EVENT_MODE_MSIX_MODE);
> +   MLX5_SET(virtio_q, vq_ctx, event_qpn_or_msix, mvq->map.index);
> +   } else {
> +   MLX5_SET(virtio_q, vq_ctx, event_mode, 
> MLX5_VIRTIO_Q_EVENT_MODE_QP_MODE);
> +   MLX5_SET(virtio_q, vq_ctx, event_qpn_or_msix, 
> mvq->fwqp.mqp.qpn);
> +   }
> +
> MLX5_SET(virtio_q, vq_ctx, queue_index, mvq->index);
> -   MLX5_SET(virtio_q, vq_ctx, event_qpn_or_msix, mvq->fwqp.mqp.qpn);
> MLX5_SET(virtio_q, vq_ctx, queue_size, mvq->num_ent);
> MLX5_SET(virtio_q, vq_ctx, virtio_version_1_0,
>  !!(ndev->mvdev.actual_features & 
> BIT_ULL(VIRTIO_F_VERSION_1)));
> @@ -1194,6 +1209,56 @@ static void counter_set_dealloc(struct mlx5_vdpa_net 
> *ndev, struct mlx5_vdpa_vir
> mlx5_vdpa_warn(&ndev->mvdev, "dealloc counter set 0x%x\n", 
> mvq->counter_set_id);
>  }
>
> +static irqreturn_t mlx5_vdpa_int_handler(int irq, void *priv)
> +{
> +   struct vdpa_callback *cb = priv;
> +
> +   if (cb->callback)
> +   return cb->callback(cb->private);
> +
> +   return IRQ_HANDLED;
> +}
> +
> +static void alloc_vector(struct mlx5_vdpa_net *ndev,
> +struct mlx5_vdpa_virtqueue *mvq)
> +{
> +   struct mlx5_vdpa_irq_pool *irqp = &ndev->irqp;
> +   struct mlx5_vdpa_irq_pool_entry *ent;
> +   int err;
> +   int i;
> +
> +   for (i = 0; i < irqp->num_ent; i++) {
> +   ent = &irqp->entries[i];
> +   if (!ent->used) {
> +   snprintf(ent->name, MLX5_VDPA_IRQ_NAME_LEN, 
> "%s-vq-%d",
> +dev_name(&ndev->mvdev.vdev.dev), mvq->index);
> +   ent->dev_id = &ndev->event_cbs[mvq->index];
> +   err = request_irq(ent->map.virq, 
> mlx5_vdpa_int_handler, 0,
> + ent->name, ent->dev_id);
> +   if (err)
> + 

Re: [PATCH v2] vduse: fix NULL pointer dereference

2023-06-25 Thread Jason Wang
On Fri, Jun 23, 2023 at 4:49 AM Maxime Coquelin
 wrote:
>
> vduse_vdpa_set_vq_affinity callback can be called
> with NULL value as cpu_mask when deleting the vduse
> device.
>
> This patch resets virtqueue's IRQ affinity mask value
> to set all CPUs instead of dereferencing NULL cpu_mask.
>
> [ 4760.952149] BUG: kernel NULL pointer dereference, address: 
> [ 4760.959110] #PF: supervisor read access in kernel mode
> [ 4760.964247] #PF: error_code(0x) - not-present page
> [ 4760.969385] PGD 0 P4D 0
> [ 4760.971927] Oops:  [#1] PREEMPT SMP PTI
> [ 4760.976112] CPU: 13 PID: 2346 Comm: vdpa Not tainted 6.4.0-rc6+ #4
> [ 4760.982291] Hardware name: Dell Inc. PowerEdge R640/0W23H8, BIOS 2.8.1 
> 06/26/2020
> [ 4760.989769] RIP: 0010:memcpy_orig+0xc5/0x130
> [ 4760.994049] Code: 16 f8 4c 89 07 4c 89 4f 08 4c 89 54 17 f0 4c 89 5c 17 f8 
> c3 cc cc cc cc 66 66 2e 0f 1f 84 00 00 00 00 00 66 90 83 fa 08 72 1b <4c> 8b 
> 06 4c 8b 4c 16 f8 4c 89 07 4c 89 4c 17 f8 c3 cc cc cc cc 66
> [ 4761.012793] RSP: 0018:b1d565abb830 EFLAGS: 00010246
> [ 4761.018020] RAX: 9f4bf6b27898 RBX: 9f4be23969c0 RCX: 
> 9f4bcadf6400
> [ 4761.025152] RDX: 0008 RSI:  RDI: 
> 9f4bf6b27898
> [ 4761.032286] RBP:  R08: 0008 R09: 
> 
> [ 4761.039416] R10:  R11: 0600 R12: 
> 
> [ 4761.046549] R13:  R14: 0080 R15: 
> b1d565abbb10
> [ 4761.053680] FS:  7f64c2ec2740() GS:9f635f98() 
> knlGS:
> [ 4761.061765] CS:  0010 DS:  ES:  CR0: 80050033
> [ 4761.067513] CR2:  CR3: 001875270006 CR4: 
> 007706e0
> [ 4761.074645] DR0:  DR1:  DR2: 
> 
> [ 4761.081775] DR3:  DR6: fffe0ff0 DR7: 
> 0400
> [ 4761.088909] PKRU: 5554
> [ 4761.091620] Call Trace:
> [ 4761.094074]  
> [ 4761.096180]  ? __die+0x1f/0x70
> [ 4761.099238]  ? page_fault_oops+0x171/0x4f0
> [ 4761.103340]  ? exc_page_fault+0x7b/0x180
> [ 4761.107265]  ? asm_exc_page_fault+0x22/0x30
> [ 4761.111460]  ? memcpy_orig+0xc5/0x130
> [ 4761.115126]  vduse_vdpa_set_vq_affinity+0x3e/0x50 [vduse]
> [ 4761.120533]  virtnet_clean_affinity.part.0+0x3d/0x90 [virtio_net]
> [ 4761.126635]  remove_vq_common+0x1a4/0x250 [virtio_net]
> [ 4761.131781]  virtnet_remove+0x5d/0x70 [virtio_net]
> [ 4761.136580]  virtio_dev_remove+0x3a/0x90
> [ 4761.140509]  device_release_driver_internal+0x19b/0x200
> [ 4761.145742]  bus_remove_device+0xc2/0x130
> [ 4761.149755]  device_del+0x158/0x3e0
> [ 4761.153245]  ? kernfs_find_ns+0x35/0xc0
> [ 4761.157086]  device_unregister+0x13/0x60
> [ 4761.161010]  unregister_virtio_device+0x11/0x20
> [ 4761.165543]  device_release_driver_internal+0x19b/0x200
> [ 4761.170770]  bus_remove_device+0xc2/0x130
> [ 4761.174782]  device_del+0x158/0x3e0
> [ 4761.178276]  ? __pfx_vdpa_name_match+0x10/0x10 [vdpa]
> [ 4761.183336]  device_unregister+0x13/0x60
> [ 4761.187260]  vdpa_nl_cmd_dev_del_set_doit+0x63/0xe0 [vdpa]
>
> Fixes: 28f6288eb63d ("vduse: Support set_vq_affinity callback")
> Cc: xieyon...@bytedance.com
>
> Signed-off-by: Maxime Coquelin 

Acked-by: Jason Wang 

Thanks

> ---
>  drivers/vdpa/vdpa_user/vduse_dev.c | 6 +-
>  1 file changed, 5 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c 
> b/drivers/vdpa/vdpa_user/vduse_dev.c
> index 5f5c21674fdc..0d84e6a9c3cc 100644
> --- a/drivers/vdpa/vdpa_user/vduse_dev.c
> +++ b/drivers/vdpa/vdpa_user/vduse_dev.c
> @@ -726,7 +726,11 @@ static int vduse_vdpa_set_vq_affinity(struct vdpa_device 
> *vdpa, u16 idx,
>  {
> struct vduse_dev *dev = vdpa_to_vduse(vdpa);
>
> -   cpumask_copy(&dev->vqs[idx]->irq_affinity, cpu_mask);
> +   if (cpu_mask)
> +   cpumask_copy(&dev->vqs[idx]->irq_affinity, cpu_mask);
> +   else
> +   cpumask_setall(&dev->vqs[idx]->irq_affinity);
> +
> return 0;
>  }
>
> --
> 2.41.0
>


Re: [PATCH vhost v10 00/10] virtio core prepares for AF_XDP

2023-06-25 Thread Jason Wang
On Wed, Jun 21, 2023 at 2:43 PM Xuan Zhuo  wrote:
>
> Hi Jason,
>
> Do you have plan to review this?

Just came back from vacation, will do this next week.

Thanks

>
> Thanks.
>


Re: [PATCH v2] vdpa/vp_vdpa: Check queue number of vdpa device from add_config

2023-06-25 Thread Jason Wang
On Thu, Jun 8, 2023 at 5:02 PM Angus Chen  wrote:
>
> When add virtio_pci vdpa device,check the vqs number of device cap
> and max_vq_pairs from add_config.
> Simply starting from failing if the provisioned #qp is not
> equal to the one that hardware has.
>
> Signed-off-by: Angus Chen 
> ---
> v1: Use max_vqs from add_config
> v2: Just return fail if max_vqs from add_config is not same as device
> cap. Suggested by jason.
>
>  drivers/vdpa/virtio_pci/vp_vdpa.c | 35 ++-
>  1 file changed, 21 insertions(+), 14 deletions(-)
>
> diff --git a/drivers/vdpa/virtio_pci/vp_vdpa.c 
> b/drivers/vdpa/virtio_pci/vp_vdpa.c
> index 281287fae89f..c1fb6963da12 100644
> --- a/drivers/vdpa/virtio_pci/vp_vdpa.c
> +++ b/drivers/vdpa/virtio_pci/vp_vdpa.c
> @@ -480,32 +480,39 @@ static int vp_vdpa_dev_add(struct vdpa_mgmt_dev 
> *v_mdev, const char *name,
> u64 device_features;
> int ret, i;
>
> -   vp_vdpa = vdpa_alloc_device(struct vp_vdpa, vdpa,
> -   dev, &vp_vdpa_ops, 1, 1, name, false);
> -
> -   if (IS_ERR(vp_vdpa)) {
> -   dev_err(dev, "vp_vdpa: Failed to allocate vDPA structure\n");
> -   return PTR_ERR(vp_vdpa);
> +   if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MAX_VQP)) {
> +   if (add_config->net.max_vq_pairs != 
> (v_mdev->max_supported_vqs / 2)) {
> +   dev_err(&pdev->dev, "max vqs 0x%x should be equal to 
> 0x%x which device has\n",
> +   add_config->net.max_vq_pairs*2, 
> v_mdev->max_supported_vqs);
> +   return -EINVAL;
> +   }
> }
>
> -   vp_vdpa_mgtdev->vp_vdpa = vp_vdpa;
> -
> -   vp_vdpa->vdpa.dma_dev = &pdev->dev;
> -   vp_vdpa->queues = vp_modern_get_num_queues(mdev);
> -   vp_vdpa->mdev = mdev;
> -
> device_features = vp_modern_get_features(mdev);
> if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES)) {
> if (add_config->device_features & ~device_features) {
> -   ret = -EINVAL;
> dev_err(&pdev->dev, "Try to provision features "
> "that are not supported by the device: "
> "device_features 0x%llx provisioned 0x%llx\n",
> device_features, add_config->device_features);
> -   goto err;
> +   return -EINVAL;
> }
> device_features = add_config->device_features;
> }
> +
> +   vp_vdpa = vdpa_alloc_device(struct vp_vdpa, vdpa,
> +   dev, &vp_vdpa_ops, 1, 1, name, false);
> +
> +   if (IS_ERR(vp_vdpa)) {
> +   dev_err(dev, "vp_vdpa: Failed to allocate vDPA structure\n");
> +   return PTR_ERR(vp_vdpa);
> +   }
> +
> +   vp_vdpa_mgtdev->vp_vdpa = vp_vdpa;
> +
> +   vp_vdpa->vdpa.dma_dev = &pdev->dev;
> +   vp_vdpa->queues = v_mdev->max_supported_vqs;

Why bother with those changes?

mgtdev->max_supported_vqs = vp_modern_get_num_queues(mdev);

Thanks


> +   vp_vdpa->mdev = mdev;
> vp_vdpa->device_features = device_features;
>
> ret = devm_add_action_or_reset(dev, vp_vdpa_free_irq_vectors, pdev);
> --
> 2.25.1
>


Re: [PATCH V2 1/3] vDPA/ifcvf: dynamic allocate vq data stores

2023-06-25 Thread Jason Wang
On Mon, Jun 12, 2023 at 3:14 PM Zhu Lingshan  wrote:
>
> This commit dynamically allocates the data
> stores for the virtqueues based on
> virtio_pci_common_cfg.num_queues.

While at it, it's better to allocate vring_lm_cfg as well and drop
IFCVF_MAX_QUEUES.
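
E.g. for the vring array, something like this untested sketch
(kcalloc() zeroes the memory and also guards the nr_vring
multiplication against overflow):

	hw->nr_vring = vp_ioread16(&hw->common_cfg->num_queues);
	hw->vring = kcalloc(hw->nr_vring, sizeof(*hw->vring), GFP_KERNEL);
	if (!hw->vring)
		return -ENOMEM;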

Thanks

>
> Signed-off-by: Zhu Lingshan 
> ---
>  drivers/vdpa/ifcvf/ifcvf_base.c | 3 +++
>  drivers/vdpa/ifcvf/ifcvf_base.h | 2 +-
>  drivers/vdpa/ifcvf/ifcvf_main.c | 2 ++
>  3 files changed, 6 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/vdpa/ifcvf/ifcvf_base.c b/drivers/vdpa/ifcvf/ifcvf_base.c
> index 1b5da11f5403..f86495ace825 100644
> --- a/drivers/vdpa/ifcvf/ifcvf_base.c
> +++ b/drivers/vdpa/ifcvf/ifcvf_base.c
> @@ -134,6 +134,9 @@ int ifcvf_init_hw(struct ifcvf_hw *hw, struct pci_dev 
> *pdev)
> }
>
> hw->nr_vring = vp_ioread16(&hw->common_cfg->num_queues);
> +   hw->vring = kzalloc(sizeof(struct vring_info) * hw->nr_vring, 
> GFP_KERNEL);
> +   if (!hw->vring)
> +   return -ENOMEM;
>
> for (i = 0; i < hw->nr_vring; i++) {
> vp_iowrite16(i, &hw->common_cfg->queue_select);
> diff --git a/drivers/vdpa/ifcvf/ifcvf_base.h b/drivers/vdpa/ifcvf/ifcvf_base.h
> index 3110ffc50caf..fa797184056b 100644
> --- a/drivers/vdpa/ifcvf/ifcvf_base.h
> +++ b/drivers/vdpa/ifcvf/ifcvf_base.h
> @@ -74,7 +74,7 @@ struct ifcvf_hw {
> u64 dev_features;
> struct virtio_pci_common_cfg __iomem *common_cfg;
> void __iomem *dev_cfg;
> -   struct vring_info vring[IFCVF_MAX_QUEUES];
> +   struct vring_info *vring;
> void __iomem * const *base;
> char config_msix_name[256];
> struct vdpa_callback config_cb;
> diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c
> index 6e47ac2c669a..2af0de771b49 100644
> --- a/drivers/vdpa/ifcvf/ifcvf_main.c
> +++ b/drivers/vdpa/ifcvf/ifcvf_main.c
> @@ -830,6 +830,7 @@ static int ifcvf_probe(struct pci_dev *pdev, const struct 
> pci_device_id *id)
> return 0;
>
>  err:
> +   kfree(ifcvf_mgmt_dev->vf.vring);
> kfree(ifcvf_mgmt_dev);
> return ret;
>  }
> @@ -840,6 +841,7 @@ static void ifcvf_remove(struct pci_dev *pdev)
>
> ifcvf_mgmt_dev = pci_get_drvdata(pdev);
> vdpa_mgmtdev_unregister(&ifcvf_mgmt_dev->mdev);
> +   kfree(ifcvf_mgmt_dev->vf.vring);
> kfree(ifcvf_mgmt_dev);
>  }
>
> --
> 2.39.1
>


Re: [PATCH V2 2/3] vDPA/ifcvf: detect and report max allowed vq size

2023-06-25 Thread Jason Wang
On Mon, Jun 12, 2023 at 3:14 PM Zhu Lingshan  wrote:
>
> Rather than a hardcode, this commit detects
> and reports the max value of allowed size
> of the virtqueues
>
> Signed-off-by: Zhu Lingshan 

Acked-by: Jason Wang 

Thanks

> ---
>  drivers/vdpa/ifcvf/ifcvf_base.c | 31 +++
>  drivers/vdpa/ifcvf/ifcvf_base.h |  2 +-
>  drivers/vdpa/ifcvf/ifcvf_main.c |  4 +++-
>  3 files changed, 35 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/vdpa/ifcvf/ifcvf_base.c b/drivers/vdpa/ifcvf/ifcvf_base.c
> index f86495ace825..f4d7d96c4c86 100644
> --- a/drivers/vdpa/ifcvf/ifcvf_base.c
> +++ b/drivers/vdpa/ifcvf/ifcvf_base.c
> @@ -69,6 +69,37 @@ static int ifcvf_read_config_range(struct pci_dev *dev,
> return 0;
>  }
>
> +static u16 ifcvf_get_vq_size(struct ifcvf_hw *hw, u16 qid)
> +{
> +   u16 queue_size;
> +
> +   vp_iowrite16(qid, &hw->common_cfg->queue_select);
> +   queue_size = vp_ioread16(&hw->common_cfg->queue_size);
> +
> +   return queue_size;
> +}
> +
> +/* This function returns the max allowed safe size for
> + * all virtqueues. It is the minimal size that can be
> + * supported by all virtqueues.
> + */
> +u16 ifcvf_get_max_vq_size(struct ifcvf_hw *hw)
> +{
> +   u16 queue_size, max_size, qid;
> +
> +   max_size = ifcvf_get_vq_size(hw, 0);
> +   for (qid = 1; qid < hw->nr_vring; qid++) {
> +   queue_size = ifcvf_get_vq_size(hw, qid);
> +   /* 0 means the queue is unavailable */
> +   if (!queue_size)
> +   continue;
> +
> +   max_size = min(queue_size, max_size);
> +   }
> +
> +   return max_size;
> +}
> +
>  int ifcvf_init_hw(struct ifcvf_hw *hw, struct pci_dev *pdev)
>  {
> struct virtio_pci_cap cap;
> diff --git a/drivers/vdpa/ifcvf/ifcvf_base.h b/drivers/vdpa/ifcvf/ifcvf_base.h
> index fa797184056b..30935a95b672 100644
> --- a/drivers/vdpa/ifcvf/ifcvf_base.h
> +++ b/drivers/vdpa/ifcvf/ifcvf_base.h
> @@ -28,7 +28,6 @@
>  #define IFCVF_MAX_QUEUES   17
>
>  #define IFCVF_QUEUE_ALIGNMENT  PAGE_SIZE
> -#define IFCVF_QUEUE_MAX32768
>  #define IFCVF_PCI_MAX_RESOURCE 6
>
>  #define IFCVF_LM_CFG_SIZE  0x40
> @@ -138,4 +137,5 @@ bool ifcvf_get_vq_ready(struct ifcvf_hw *hw, u16 qid);
>  void ifcvf_set_vq_ready(struct ifcvf_hw *hw, u16 qid, bool ready);
>  void ifcvf_set_driver_features(struct ifcvf_hw *hw, u64 features);
>  u64 ifcvf_get_driver_features(struct ifcvf_hw *hw);
> +u16 ifcvf_get_max_vq_size(struct ifcvf_hw *hw);
>  #endif /* _IFCVF_H_ */
> diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c
> index 2af0de771b49..c3ece395caf7 100644
> --- a/drivers/vdpa/ifcvf/ifcvf_main.c
> +++ b/drivers/vdpa/ifcvf/ifcvf_main.c
> @@ -451,7 +451,9 @@ static int ifcvf_vdpa_reset(struct vdpa_device *vdpa_dev)
>
>  static u16 ifcvf_vdpa_get_vq_num_max(struct vdpa_device *vdpa_dev)
>  {
> -   return IFCVF_QUEUE_MAX;
> +   struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev);
> +
> +   return ifcvf_get_max_vq_size(vf);
>  }
>
>  static int ifcvf_vdpa_get_vq_state(struct vdpa_device *vdpa_dev, u16 qid,
> --
> 2.39.1
>


Re: [PATCH V2 3/3] vDPA/ifcvf: implement new accessors for vq_state

2023-06-25 Thread Jason Wang
On Mon, Jun 12, 2023 at 3:14 PM Zhu Lingshan  wrote:
>
> This commit implements a better layout of the
> live migration bar, therefore the accessors for virtqueue
> state have been refactored.

I guess the reason for F2000X is that it can report a #vq which is
greater than IFCVF_MAX_QUEUES. If yes, let's explain it in the
changelog.

Thanks


>
> This commit also add a comment to the probing-ids list,
> indicating this driver drives F2000X-PL virtio-net
>
> Signed-off-by: Zhu Lingshan 
> ---
>  drivers/vdpa/ifcvf/ifcvf_base.c | 21 +
>  drivers/vdpa/ifcvf/ifcvf_base.h | 25 +
>  drivers/vdpa/ifcvf/ifcvf_main.c |  4 +++-
>  3 files changed, 17 insertions(+), 33 deletions(-)
>
> diff --git a/drivers/vdpa/ifcvf/ifcvf_base.c b/drivers/vdpa/ifcvf/ifcvf_base.c
> index f4d7d96c4c86..060f837a4f9f 100644
> --- a/drivers/vdpa/ifcvf/ifcvf_base.c
> +++ b/drivers/vdpa/ifcvf/ifcvf_base.c
> @@ -328,30 +328,19 @@ void ifcvf_set_driver_features(struct ifcvf_hw *hw, u64 
> features)
>
>  u16 ifcvf_get_vq_state(struct ifcvf_hw *hw, u16 qid)
>  {
> -   struct ifcvf_lm_cfg __iomem *ifcvf_lm;
> -   void __iomem *avail_idx_addr;
> +   struct ifcvf_lm_cfg  __iomem *lm_cfg = hw->lm_cfg;
> u16 last_avail_idx;
> -   u32 q_pair_id;
>
> -   ifcvf_lm = (struct ifcvf_lm_cfg __iomem *)hw->lm_cfg;
> -   q_pair_id = qid / 2;
> -   avail_idx_addr = &ifcvf_lm->vring_lm_cfg[q_pair_id].idx_addr[qid % 2];
> -   last_avail_idx = vp_ioread16(avail_idx_addr);
> +   last_avail_idx = vp_ioread16(&lm_cfg->vq_state_region + qid * 2);
>
> return last_avail_idx;
>  }
>
>  int ifcvf_set_vq_state(struct ifcvf_hw *hw, u16 qid, u16 num)
>  {
> -   struct ifcvf_lm_cfg __iomem *ifcvf_lm;
> -   void __iomem *avail_idx_addr;
> -   u32 q_pair_id;
> -
> -   ifcvf_lm = (struct ifcvf_lm_cfg __iomem *)hw->lm_cfg;
> -   q_pair_id = qid / 2;
> -   avail_idx_addr = &ifcvf_lm->vring_lm_cfg[q_pair_id].idx_addr[qid % 2];
> -   hw->vring[qid].last_avail_idx = num;
> -   vp_iowrite16(num, avail_idx_addr);
> +   struct ifcvf_lm_cfg  __iomem *lm_cfg = hw->lm_cfg;
> +
> +   vp_iowrite16(num, &lm_cfg->vq_state_region + qid * 2);
>
> return 0;
>  }
> diff --git a/drivers/vdpa/ifcvf/ifcvf_base.h b/drivers/vdpa/ifcvf/ifcvf_base.h
> index 30935a95b672..b57849c643f6 100644
> --- a/drivers/vdpa/ifcvf/ifcvf_base.h
> +++ b/drivers/vdpa/ifcvf/ifcvf_base.h
> @@ -24,14 +24,9 @@
>  #define N3000_DEVICE_ID0x1041
>  #define N3000_SUBSYS_DEVICE_ID 0x001A
>
> -/* Max 8 data queue pairs(16 queues) and one control vq for now. */
> -#define IFCVF_MAX_QUEUES   17
> -
>  #define IFCVF_QUEUE_ALIGNMENT  PAGE_SIZE
>  #define IFCVF_PCI_MAX_RESOURCE 6
>
> -#define IFCVF_LM_CFG_SIZE  0x40
> -#define IFCVF_LM_RING_STATE_OFFSET 0x20
>  #define IFCVF_LM_BAR   4
>
>  #define IFCVF_ERR(pdev, fmt, ...)  dev_err(&pdev->dev, fmt, 
> ##__VA_ARGS__)
> @@ -54,10 +49,18 @@ struct vring_info {
> char msix_name[256];
>  };
>
> +struct ifcvf_lm_cfg {
> +   __le64 control;
> +   __le64 status;
> +   __le64 lm_mem_log_start_addr;
> +   __le64 lm_mem_log_end_addr;
> +   __le16 vq_state_region;
> +};
> +
>  struct ifcvf_hw {
> u8 __iomem *isr;
> /* Live migration */
> -   u8 __iomem *lm_cfg;
> +   struct ifcvf_lm_cfg  __iomem *lm_cfg;
> /* Notification bar number */
> u8 notify_bar;
> u8 msix_vector_status;
> @@ -92,16 +95,6 @@ struct ifcvf_adapter {
> struct ifcvf_hw *vf;
>  };
>
> -struct ifcvf_vring_lm_cfg {
> -   u32 idx_addr[2];
> -   u8 reserved[IFCVF_LM_CFG_SIZE - 8];
> -};
> -
> -struct ifcvf_lm_cfg {
> -   u8 reserved[IFCVF_LM_RING_STATE_OFFSET];
> -   struct ifcvf_vring_lm_cfg vring_lm_cfg[IFCVF_MAX_QUEUES];
> -};
> -
>  struct ifcvf_vdpa_mgmt_dev {
> struct vdpa_mgmt_dev mdev;
> struct ifcvf_hw vf;
> diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c
> index c3ece395caf7..e98fa8100f3c 100644
> --- a/drivers/vdpa/ifcvf/ifcvf_main.c
> +++ b/drivers/vdpa/ifcvf/ifcvf_main.c
> @@ -853,7 +853,9 @@ static struct pci_device_id ifcvf_pci_ids[] = {
>  N3000_DEVICE_ID,
>  PCI_VENDOR_ID_INTEL,
>  N3000_SUBSYS_DEVICE_ID) },
> -   /* C5000X-PL network device */
> +   /* C5000X-PL network device
> +* F2000X-PL network device
> +*/
> { PCI_DEVICE_SUB(PCI_VENDOR_ID_REDHAT_QUMRANET,
>  VIRTIO_TRANS_ID_NET,
>  PCI_VENDOR_ID_INTEL,
> --
> 2.39.1
>


Re: [PATCH V2 1/3] vDPA/ifcvf: dynamic allocate vq data stores

2023-06-25 Thread Jason Wang
On Mon, Jun 26, 2023 at 10:38 AM Zhu, Lingshan  wrote:
>
>
>
> On 6/26/2023 10:32 AM, Jason Wang wrote:
> > On Mon, Jun 12, 2023 at 3:14 PM Zhu Lingshan  wrote:
> >> This commit dynamically allocates the data
> >> stores for the virtqueues based on
> >> virtio_pci_common_cfg.num_queues.
> > While at it, it's better to allocate vring_lm_cfg as well and drop
> > IFCVF_MAX_QUEUES.
> Yes, this has been done in 3/3 patch in this series.

Ok, yes, but it seems patch 3 implements a lot of logic so I suggest
moving it to patch 1.

Not sure if it's too late, since I see the patch has been merged by Michael.

Thanks

>
> Thanks
> Zhu Lingshan
> >
> > Thanks
> >
> >> Signed-off-by: Zhu Lingshan 
> >> ---
> >>   drivers/vdpa/ifcvf/ifcvf_base.c | 3 +++
> >>   drivers/vdpa/ifcvf/ifcvf_base.h | 2 +-
> >>   drivers/vdpa/ifcvf/ifcvf_main.c | 2 ++
> >>   3 files changed, 6 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/drivers/vdpa/ifcvf/ifcvf_base.c 
> >> b/drivers/vdpa/ifcvf/ifcvf_base.c
> >> index 1b5da11f5403..f86495ace825 100644
> >> --- a/drivers/vdpa/ifcvf/ifcvf_base.c
> >> +++ b/drivers/vdpa/ifcvf/ifcvf_base.c
> >> @@ -134,6 +134,9 @@ int ifcvf_init_hw(struct ifcvf_hw *hw, struct pci_dev 
> >> *pdev)
> >>  }
> >>
> >>  hw->nr_vring = vp_ioread16(&hw->common_cfg->num_queues);
> >> +   hw->vring = kzalloc(sizeof(struct vring_info) * hw->nr_vring, 
> >> GFP_KERNEL);
> >> +   if (!hw->vring)
> >> +   return -ENOMEM;
> >>
> >>  for (i = 0; i < hw->nr_vring; i++) {
> >>  vp_iowrite16(i, &hw->common_cfg->queue_select);
> >> diff --git a/drivers/vdpa/ifcvf/ifcvf_base.h 
> >> b/drivers/vdpa/ifcvf/ifcvf_base.h
> >> index 3110ffc50caf..fa797184056b 100644
> >> --- a/drivers/vdpa/ifcvf/ifcvf_base.h
> >> +++ b/drivers/vdpa/ifcvf/ifcvf_base.h
> >> @@ -74,7 +74,7 @@ struct ifcvf_hw {
> >>  u64 dev_features;
> >>  struct virtio_pci_common_cfg __iomem *common_cfg;
> >>  void __iomem *dev_cfg;
> >> -   struct vring_info vring[IFCVF_MAX_QUEUES];
> >> +   struct vring_info *vring;
> >>  void __iomem * const *base;
> >>  char config_msix_name[256];
> >>  struct vdpa_callback config_cb;
> >> diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c 
> >> b/drivers/vdpa/ifcvf/ifcvf_main.c
> >> index 6e47ac2c669a..2af0de771b49 100644
> >> --- a/drivers/vdpa/ifcvf/ifcvf_main.c
> >> +++ b/drivers/vdpa/ifcvf/ifcvf_main.c
> >> @@ -830,6 +830,7 @@ static int ifcvf_probe(struct pci_dev *pdev, const 
> >> struct pci_device_id *id)
> >>  return 0;
> >>
> >>   err:
> >> +   kfree(ifcvf_mgmt_dev->vf.vring);
> >>  kfree(ifcvf_mgmt_dev);
> >>  return ret;
> >>   }
> >> @@ -840,6 +841,7 @@ static void ifcvf_remove(struct pci_dev *pdev)
> >>
> >>  ifcvf_mgmt_dev = pci_get_drvdata(pdev);
> >>  vdpa_mgmtdev_unregister(&ifcvf_mgmt_dev->mdev);
> >> +   kfree(ifcvf_mgmt_dev->vf.vring);
> >>  kfree(ifcvf_mgmt_dev);
> >>   }
> >>
> >> --
> >> 2.39.1
> >>
>


Re: [PATCH v2] vdpa/vp_vdpa: Check queue number of vdpa device from add_config

2023-06-25 Thread Jason Wang
On Mon, Jun 26, 2023 at 10:42 AM Angus Chen  wrote:
>
>
> Hi,jason.
> > -Original Message-
> > From: Jason Wang 
> > Sent: Monday, June 26, 2023 10:30 AM
> > To: Angus Chen 
> > Cc: m...@redhat.com; virtualization@lists.linux-foundation.org;
> > linux-ker...@vger.kernel.org
> > Subject: Re: [PATCH v2] vdpa/vp_vdpa: Check queue number of vdpa device from
> > add_config
> >
> > On Thu, Jun 8, 2023 at 5:02 PM Angus Chen 
> > wrote:
> > >
> > > When add virtio_pci vdpa device,check the vqs number of device cap
> > > and max_vq_pairs from add_config.
> > > Simply starting from failing if the provisioned #qp is not
> > > equal to the one that hardware has.
> > >
> > > Signed-off-by: Angus Chen 
> > > ---
> > > v1: Use max_vqs from add_config
> > > v2: Just return fail if max_vqs from add_config is not same as device
> > > cap. Suggested by jason.
> > >
> > >  drivers/vdpa/virtio_pci/vp_vdpa.c | 35 ++-
> > >  1 file changed, 21 insertions(+), 14 deletions(-)
> > >
> > > diff --git a/drivers/vdpa/virtio_pci/vp_vdpa.c
> > b/drivers/vdpa/virtio_pci/vp_vdpa.c
> > > index 281287fae89f..c1fb6963da12 100644
> > > --- a/drivers/vdpa/virtio_pci/vp_vdpa.c
> > > +++ b/drivers/vdpa/virtio_pci/vp_vdpa.c
> > > @@ -480,32 +480,39 @@ static int vp_vdpa_dev_add(struct
> > vdpa_mgmt_dev *v_mdev, const char *name,
> > > u64 device_features;
> > > int ret, i;
> > >
> > > -   vp_vdpa = vdpa_alloc_device(struct vp_vdpa, vdpa,
> > > -   dev, &vp_vdpa_ops, 1, 1, name,
> > false);
> > > -
> > > -   if (IS_ERR(vp_vdpa)) {
> > > -   dev_err(dev, "vp_vdpa: Failed to allocate vDPA
> > structure\n");
> > > -   return PTR_ERR(vp_vdpa);
> > > +   if (add_config->mask &
> > BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MAX_VQP)) {
> > > +   if (add_config->net.max_vq_pairs !=
> > (v_mdev->max_supported_vqs / 2)) {
> > > +   dev_err(&pdev->dev, "max vqs 0x%x should be
> > equal to 0x%x which device has\n",
> > > +   add_config->net.max_vq_pairs*2,
> > v_mdev->max_supported_vqs);
> > > +   return -EINVAL;
> > > +   }
> > > }
> > >
> > > -   vp_vdpa_mgtdev->vp_vdpa = vp_vdpa;
> > > -
> > > -   vp_vdpa->vdpa.dma_dev = &pdev->dev;
> > > -   vp_vdpa->queues = vp_modern_get_num_queues(mdev);
> > > -   vp_vdpa->mdev = mdev;
> > > -
> > > device_features = vp_modern_get_features(mdev);
> > > if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES)) {
> > > if (add_config->device_features & ~device_features) {
> > > -   ret = -EINVAL;
> > > dev_err(&pdev->dev, "Try to provision features
> > "
> > > "that are not supported by the device:
> > "
> > > "device_features 0x%llx provisioned
> > 0x%llx\n",
> > > device_features,
> > add_config->device_features);
> > > -   goto err;
> > > +   return -EINVAL;
> > > }
> > > device_features = add_config->device_features;
> > > }
> > > +
> > > +   vp_vdpa = vdpa_alloc_device(struct vp_vdpa, vdpa,
> > > +   dev, &vp_vdpa_ops, 1, 1, name,
> > false);
> > > +
> > > +   if (IS_ERR(vp_vdpa)) {
> > > +   dev_err(dev, "vp_vdpa: Failed to allocate vDPA
> > structure\n");
> > > +   return PTR_ERR(vp_vdpa);
> > > +   }
> > > +
> > > +   vp_vdpa_mgtdev->vp_vdpa = vp_vdpa;
> > > +
> > > +   vp_vdpa->vdpa.dma_dev = &pdev->dev;
> > > +   vp_vdpa->queues = v_mdev->max_supported_vqs;
> >
> > Why bother with those changes?
> >
> > mgtdev->max_supported_vqs = vp_modern_get_num_queues(mdev);
> max_supported_vqs will not change, so we can get it from
> mgtdev->max_supported_vqs.
> If we use vp_modern_get_num_queues(mdev), it will use TLPs to communicate
> with the device.
> It just saves some TLPs.

Ok, but

1) I don't think we care about performance here
2) If we did, let's use a separate patch to do that as an optimization

Thanks

> >
> > Thanks
> >
> >
> > > +   vp_vdpa->mdev = mdev;
> > > vp_vdpa->device_features = device_features;
> > >
> > > ret = devm_add_action_or_reset(dev, vp_vdpa_free_irq_vectors,
> > pdev);
> > > --
> > > 2.25.1
> > >
>


Re: [PATCH v2] vdpa/vp_vdpa: Check queue number of vdpa device from add_config

2023-06-25 Thread Jason Wang
On Mon, Jun 26, 2023 at 11:02 AM Angus Chen  wrote:
>
>
>
> > -Original Message-
> > From: Jason Wang 
> > Sent: Monday, June 26, 2023 10:51 AM
> > To: Angus Chen 
> > Cc: m...@redhat.com; virtualization@lists.linux-foundation.org;
> > linux-ker...@vger.kernel.org
> > Subject: Re: [PATCH v2] vdpa/vp_vdpa: Check queue number of vdpa device from
> > add_config
> >
> > On Mon, Jun 26, 2023 at 10:42 AM Angus Chen 
> > wrote:
> > >
> > >
> > > Hi,jason.
> > > > -Original Message-
> > > > From: Jason Wang 
> > > > Sent: Monday, June 26, 2023 10:30 AM
> > > > To: Angus Chen 
> > > > Cc: m...@redhat.com; virtualization@lists.linux-foundation.org;
> > > > linux-ker...@vger.kernel.org
> > > > Subject: Re: [PATCH v2] vdpa/vp_vdpa: Check queue number of vdpa device
> > from
> > > > add_config
> > > >
> > > > On Thu, Jun 8, 2023 at 5:02 PM Angus Chen
> > 
> > > > wrote:
> > > > >
> > > > > When add virtio_pci vdpa device,check the vqs number of device cap
> > > > > and max_vq_pairs from add_config.
> > > > > Simply starting from failing if the provisioned #qp is not
> > > > > equal to the one that hardware has.
> > > > >
> > > > > Signed-off-by: Angus Chen 
> > > > > ---
> > > > > v1: Use max_vqs from add_config
> > > > > v2: Just return fail if max_vqs from add_config is not same as device
> > > > > cap. Suggested by jason.
> > > > >
> > > > >  drivers/vdpa/virtio_pci/vp_vdpa.c | 35 
> > > > > ++-
> > > > >  1 file changed, 21 insertions(+), 14 deletions(-)
> > > > >
> > > > > diff --git a/drivers/vdpa/virtio_pci/vp_vdpa.c
> > > > b/drivers/vdpa/virtio_pci/vp_vdpa.c
> > > > > index 281287fae89f..c1fb6963da12 100644
> > > > > --- a/drivers/vdpa/virtio_pci/vp_vdpa.c
> > > > > +++ b/drivers/vdpa/virtio_pci/vp_vdpa.c
> > > > > @@ -480,32 +480,39 @@ static int vp_vdpa_dev_add(struct
> > > > vdpa_mgmt_dev *v_mdev, const char *name,
> > > > > u64 device_features;
> > > > > int ret, i;
> > > > >
> > > > > -   vp_vdpa = vdpa_alloc_device(struct vp_vdpa, vdpa,
> > > > > -   dev, &vp_vdpa_ops, 1, 1,
> > name,
> > > > false);
> > > > > -
> > > > > -   if (IS_ERR(vp_vdpa)) {
> > > > > -   dev_err(dev, "vp_vdpa: Failed to allocate vDPA
> > > > structure\n");
> > > > > -   return PTR_ERR(vp_vdpa);
> > > > > +   if (add_config->mask &
> > > > BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MAX_VQP)) {
> > > > > +   if (add_config->net.max_vq_pairs !=
> > > > (v_mdev->max_supported_vqs / 2)) {
> > > > > +   dev_err(&pdev->dev, "max vqs 0x%x should
> > be
> > > > equal to 0x%x which device has\n",
> > > > > +   add_config->net.max_vq_pairs*2,
> > > > v_mdev->max_supported_vqs);
> > > > > +   return -EINVAL;
> > > > > +   }
> > > > > }
> > > > >
> > > > > -   vp_vdpa_mgtdev->vp_vdpa = vp_vdpa;
> > > > > -
> > > > > -   vp_vdpa->vdpa.dma_dev = &pdev->dev;
> > > > > -   vp_vdpa->queues = vp_modern_get_num_queues(mdev);
> > > > > -   vp_vdpa->mdev = mdev;
> > > > > -
> > > > > device_features = vp_modern_get_features(mdev);
> > > > > if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES))
> > {
> > > > > if (add_config->device_features & ~device_features) {
> > > > > -   ret = -EINVAL;
> > > > > dev_err(&pdev->dev, "Try to provision
> > features
> > > > "
> > > > > "that are not supported by the
> > device:
> > > > "
> > > > > "device_features 0x%llx
> > provisioned
> > > > 0x%llx\n",
> > >

Re: [PATCH vhost v10 01/10] virtio_ring: put mapping error check in vring_map_one_sg

2023-06-27 Thread Jason Wang
On Fri, Jun 2, 2023 at 5:22 PM Xuan Zhuo  wrote:
>
> This patch put the dma addr error check in vring_map_one_sg().
>
> The benefits of doing this:
>
> 1. reduce one judgment of vq->use_dma_api.
> 2. make vring_map_one_sg more simple, without calling
>vring_mapping_error to check the return value. simplifies subsequent
>code
>
> Signed-off-by: Xuan Zhuo 

Acked-by: Jason Wang 

Thanks


> ---
>  drivers/virtio/virtio_ring.c | 37 +---
>  1 file changed, 22 insertions(+), 15 deletions(-)
>
> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> index c5310eaf8b46..72ed07a604d4 100644
> --- a/drivers/virtio/virtio_ring.c
> +++ b/drivers/virtio/virtio_ring.c
> @@ -355,9 +355,8 @@ static struct device *vring_dma_dev(const struct 
> vring_virtqueue *vq)
>  }
>
>  /* Map one sg entry. */
> -static dma_addr_t vring_map_one_sg(const struct vring_virtqueue *vq,
> -  struct scatterlist *sg,
> -  enum dma_data_direction direction)
> +static int vring_map_one_sg(const struct vring_virtqueue *vq, struct 
> scatterlist *sg,
> +   enum dma_data_direction direction, dma_addr_t 
> *addr)
>  {
> if (!vq->use_dma_api) {
> /*
> @@ -366,7 +365,8 @@ static dma_addr_t vring_map_one_sg(const struct 
> vring_virtqueue *vq,
>  * depending on the direction.
>  */
> kmsan_handle_dma(sg_page(sg), sg->offset, sg->length, 
> direction);
> -   return (dma_addr_t)sg_phys(sg);
> +   *addr = (dma_addr_t)sg_phys(sg);
> +   return 0;
> }
>
> /*
> @@ -374,9 +374,14 @@ static dma_addr_t vring_map_one_sg(const struct 
> vring_virtqueue *vq,
>  * the way it expects (we don't guarantee that the scatterlist
>  * will exist for the lifetime of the mapping).
>  */
> -   return dma_map_page(vring_dma_dev(vq),
> +   *addr = dma_map_page(vring_dma_dev(vq),
> sg_page(sg), sg->offset, sg->length,
> direction);
> +
> +   if (dma_mapping_error(vring_dma_dev(vq), *addr))
> +   return -ENOMEM;
> +
> +   return 0;
>  }
>
>  static dma_addr_t vring_map_single(const struct vring_virtqueue *vq,
> @@ -588,8 +593,9 @@ static inline int virtqueue_add_split(struct virtqueue 
> *_vq,
>
> for (n = 0; n < out_sgs; n++) {
> for (sg = sgs[n]; sg; sg = sg_next(sg)) {
> -   dma_addr_t addr = vring_map_one_sg(vq, sg, 
> DMA_TO_DEVICE);
> -   if (vring_mapping_error(vq, addr))
> +   dma_addr_t addr;
> +
> +   if (vring_map_one_sg(vq, sg, DMA_TO_DEVICE, &addr))
> goto unmap_release;
>
> prev = i;
> @@ -603,8 +609,9 @@ static inline int virtqueue_add_split(struct virtqueue 
> *_vq,
> }
> for (; n < (out_sgs + in_sgs); n++) {
> for (sg = sgs[n]; sg; sg = sg_next(sg)) {
> -   dma_addr_t addr = vring_map_one_sg(vq, sg, 
> DMA_FROM_DEVICE);
> -   if (vring_mapping_error(vq, addr))
> +   dma_addr_t addr;
> +
> +   if (vring_map_one_sg(vq, sg, DMA_FROM_DEVICE, &addr))
> goto unmap_release;
>
> prev = i;
> @@ -1279,9 +1286,8 @@ static int virtqueue_add_indirect_packed(struct 
> vring_virtqueue *vq,
>
> for (n = 0; n < out_sgs + in_sgs; n++) {
> for (sg = sgs[n]; sg; sg = sg_next(sg)) {
> -   addr = vring_map_one_sg(vq, sg, n < out_sgs ?
> -   DMA_TO_DEVICE : DMA_FROM_DEVICE);
> -   if (vring_mapping_error(vq, addr))
> +   if (vring_map_one_sg(vq, sg, n < out_sgs ?
> +DMA_TO_DEVICE : DMA_FROM_DEVICE, 
> &addr))
> goto unmap_release;
>
> desc[i].flags = cpu_to_le16(n < out_sgs ?
> @@ -1426,9 +1432,10 @@ static inline int virtqueue_add_packed(struct 
> virtqueue *_vq,
> c = 0;
> for (n = 0; n < out_sgs + in_sgs; n++) {
> for (sg = sgs[n]; sg; sg = sg_next(sg)) {
> -   dma_addr_t addr = vring_map_one_sg(vq, sg, n < 
> out_sgs ?
> -   DMA_TO_DEVICE : DMA_FROM_DEVICE);
> -  

Re: [PATCH vhost v10 02/10] virtio_ring: introduce virtqueue_set_premapped()

2023-06-27 Thread Jason Wang
On Fri, Jun 2, 2023 at 5:22 PM Xuan Zhuo  wrote:
>
> This helper allows the driver to change the dma mode to premapped mode.
> Under the premapped mode, the virtio core does not do dma mapping
> internally.
>
> This only works when use_dma_api is true. If use_dma_api is false,
> the dma operations do not go through the DMA APIs, which is not the
> standard way in the linux kernel.
>
> Signed-off-by: Xuan Zhuo 
> ---
>  drivers/virtio/virtio_ring.c | 40 
>  include/linux/virtio.h   |  2 ++
>  2 files changed, 42 insertions(+)
>
> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> index 72ed07a604d4..2afdfb9e3e30 100644
> --- a/drivers/virtio/virtio_ring.c
> +++ b/drivers/virtio/virtio_ring.c
> @@ -172,6 +172,9 @@ struct vring_virtqueue {
> /* Host publishes avail event idx */
> bool event;
>
> +   /* Do DMA mapping by driver */
> +   bool premapped;
> +
> /* Head of free buffer list. */
> unsigned int free_head;
> /* Number we've added since last sync. */
> @@ -2059,6 +2062,7 @@ static struct virtqueue *vring_create_virtqueue_packed(
> vq->packed_ring = true;
> vq->dma_dev = dma_dev;
> vq->use_dma_api = vring_use_dma_api(vdev);
> +   vq->premapped = false;
>
> vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) 
> &&
> !context;
> @@ -2548,6 +2552,7 @@ static struct virtqueue *__vring_new_virtqueue(unsigned 
> int index,
>  #endif
> vq->dma_dev = dma_dev;
> vq->use_dma_api = vring_use_dma_api(vdev);
> +   vq->premapped = false;
>
> vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) 
> &&
> !context;
> @@ -2691,6 +2696,41 @@ int virtqueue_resize(struct virtqueue *_vq, u32 num,
>  }
>  EXPORT_SYMBOL_GPL(virtqueue_resize);
>
> +/**
> + * virtqueue_set_premapped - set the vring premapped mode
> + * @_vq: the struct virtqueue we're talking about.
> + *
> + * Enable the premapped mode of the vq.
> + *
> + * The vring in premapped mode does not do dma internally, so the driver must
> + * do dma mapping in advance. The driver must pass the dma_address through
> + * dma_address of scatterlist. When the driver got a used buffer from
> + * the vring, it has to unmap the dma address. So the driver must call
> + * virtqueue_get_buf_premapped()/virtqueue_detach_unused_buf_premapped().
> + *
> + * This must be called before adding any buf to vring.

And any old buffer should be detached?

> + * So this should be called immediately after init vq or vq reset.

Any way to detect and warn in this case? (not a must if it's too
expensive to do the check)
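
One cheap option might be to require the ring to be fully free at this
point, along these lines (a rough sketch, untested; num_free only
equals the ring size when no buffer is outstanding):

	num = vq->packed_ring ? vq->packed.vring.num : vq->split.vring.num;
	if (vq->vq.num_free != num)
		return -EINVAL;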

> + *
> + * Caller must ensure we don't call this with other virtqueue operations
> + * at the same time (except where noted).
> + *
> + * Returns zero or a negative error.
> + * 0: success.
> + * -EINVAL: vring does not use the dma api, so we can not enable premapped 
> mode.
> + */
> +int virtqueue_set_premapped(struct virtqueue *_vq)
> +{
> +   struct vring_virtqueue *vq = to_vvq(_vq);
> +
> +   if (!vq->use_dma_api)
> +   return -EINVAL;
> +
> +   vq->premapped = true;

I guess there should be a way to disable it. Would it be useful for
the case when AF_XDP sockets were destroyed?
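
E.g. a symmetric helper could work; a hypothetical sketch (the name is
made up, with the same empty-ring precondition as enabling):

	int virtqueue_clear_premapped(struct virtqueue *_vq)
	{
		struct vring_virtqueue *vq = to_vvq(_vq);
		u32 num = vq->packed_ring ? vq->packed.vring.num :
					    vq->split.vring.num;

		/* no buffer may be outstanding when switching modes */
		if (vq->vq.num_free != num)
			return -EINVAL;

		vq->premapped = false;
		return 0;
	}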

Thanks


> +
> +   return 0;
> +}
> +EXPORT_SYMBOL_GPL(virtqueue_set_premapped);
> +
>  /* Only available for split ring */
>  struct virtqueue *vring_new_virtqueue(unsigned int index,
>   unsigned int num,
> diff --git a/include/linux/virtio.h b/include/linux/virtio.h
> index b93238db94e3..1fc0e1023bd4 100644
> --- a/include/linux/virtio.h
> +++ b/include/linux/virtio.h
> @@ -78,6 +78,8 @@ bool virtqueue_enable_cb(struct virtqueue *vq);
>
>  unsigned virtqueue_enable_cb_prepare(struct virtqueue *vq);
>
> +int virtqueue_set_premapped(struct virtqueue *_vq);
> +
>  bool virtqueue_poll(struct virtqueue *vq, unsigned);
>
>  bool virtqueue_enable_cb_delayed(struct virtqueue *vq);
> --
> 2.32.0.3.g01195cf9f
>


Re: [PATCH vhost v10 03/10] virtio_ring: split: support add premapped buf

2023-06-27 Thread Jason Wang
On Fri, Jun 2, 2023 at 5:22 PM Xuan Zhuo  wrote:
>
> If the vq is the premapped mode, use the sg_dma_address() directly.
>
> Signed-off-by: Xuan Zhuo 
> ---
>  drivers/virtio/virtio_ring.c | 46 ++--
>  1 file changed, 28 insertions(+), 18 deletions(-)
>
> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> index 2afdfb9e3e30..18212c3e056b 100644
> --- a/drivers/virtio/virtio_ring.c
> +++ b/drivers/virtio/virtio_ring.c
> @@ -598,8 +598,12 @@ static inline int virtqueue_add_split(struct virtqueue 
> *_vq,
> for (sg = sgs[n]; sg; sg = sg_next(sg)) {
> dma_addr_t addr;
>
> -   if (vring_map_one_sg(vq, sg, DMA_TO_DEVICE, &addr))
> -   goto unmap_release;
> +   if (vq->premapped) {
> +   addr = sg_dma_address(sg);
> +   } else {
> +   if (vring_map_one_sg(vq, sg, DMA_TO_DEVICE, 
> &addr))
> +   goto unmap_release;
> +   }

Btw, I wonder whether it would be simpler to implement the
vq->premapped check inside vring_map_one_sg(), given that the
!use_dma_api case is already handled there.
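
I.e. something along these lines (sketch only, folding the premapped
branch into the helper from patch 01):

	static int vring_map_one_sg(const struct vring_virtqueue *vq,
				    struct scatterlist *sg,
				    enum dma_data_direction direction,
				    dma_addr_t *addr)
	{
		if (vq->premapped) {
			/* the driver already did the mapping */
			*addr = sg_dma_address(sg);
			return 0;
		}

		if (!vq->use_dma_api) {
			kmsan_handle_dma(sg_page(sg), sg->offset, sg->length,
					 direction);
			*addr = (dma_addr_t)sg_phys(sg);
			return 0;
		}

		*addr = dma_map_page(vring_dma_dev(vq), sg_page(sg),
				     sg->offset, sg->length, direction);
		if (dma_mapping_error(vring_dma_dev(vq), *addr))
			return -ENOMEM;

		return 0;
	}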

>
> prev = i;
> /* Note that we trust indirect descriptor
> @@ -614,8 +618,12 @@ static inline int virtqueue_add_split(struct virtqueue 
> *_vq,
> for (sg = sgs[n]; sg; sg = sg_next(sg)) {
> dma_addr_t addr;
>
> -   if (vring_map_one_sg(vq, sg, DMA_FROM_DEVICE, &addr))
> -   goto unmap_release;
> +   if (vq->premapped) {
> +   addr = sg_dma_address(sg);
> +   } else {
> +   if (vring_map_one_sg(vq, sg, DMA_FROM_DEVICE, 
> &addr))
> +   goto unmap_release;
> +   }
>
> prev = i;
> /* Note that we trust indirect descriptor
> @@ -689,21 +697,23 @@ static inline int virtqueue_add_split(struct virtqueue 
> *_vq,
> return 0;
>
>  unmap_release:
> -   err_idx = i;
> +   if (!vq->premapped) {

Can vq->premapped be true here? The label is named "unmap_release",
which implies a "map" beforehand, and that seems not to be the case
for premapping.

Thanks


> +   err_idx = i;
>
> -   if (indirect)
> -   i = 0;
> -   else
> -   i = head;
> -
> -   for (n = 0; n < total_sg; n++) {
> -   if (i == err_idx)
> -   break;
> -   if (indirect) {
> -   vring_unmap_one_split_indirect(vq, &desc[i]);
> -   i = virtio16_to_cpu(_vq->vdev, desc[i].next);
> -   } else
> -   i = vring_unmap_one_split(vq, i);
> +   if (indirect)
> +   i = 0;
> +   else
> +   i = head;
> +
> +   for (n = 0; n < total_sg; n++) {
> +   if (i == err_idx)
> +   break;
> +   if (indirect) {
> +   vring_unmap_one_split_indirect(vq, &desc[i]);
> +   i = virtio16_to_cpu(_vq->vdev, desc[i].next);
> +   } else
> +   i = vring_unmap_one_split(vq, i);
> +   }
> }
>
> if (indirect)
> --
> 2.32.0.3.g01195cf9f
>


Re: [PATCH vhost v10 04/10] virtio_ring: packed: support add premapped buf

2023-06-27 Thread Jason Wang
On Fri, Jun 2, 2023 at 5:22 PM Xuan Zhuo  wrote:
>
> If the vq is the premapped mode, use the sg_dma_address() directly.
>
> Signed-off-by: Xuan Zhuo 
> ---
>  drivers/virtio/virtio_ring.c | 36 ++--
>  1 file changed, 26 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> index 18212c3e056b..dc109fbc05a5 100644
> --- a/drivers/virtio/virtio_ring.c
> +++ b/drivers/virtio/virtio_ring.c
> @@ -1299,9 +1299,13 @@ static int virtqueue_add_indirect_packed(struct 
> vring_virtqueue *vq,
>
> for (n = 0; n < out_sgs + in_sgs; n++) {
> for (sg = sgs[n]; sg; sg = sg_next(sg)) {
> -   if (vring_map_one_sg(vq, sg, n < out_sgs ?
> -DMA_TO_DEVICE : DMA_FROM_DEVICE, 
> &addr))
> -   goto unmap_release;
> +   if (vq->premapped) {
> +   addr = sg_dma_address(sg);
> +   } else {
> +   if (vring_map_one_sg(vq, sg, n < out_sgs ?
> +DMA_TO_DEVICE : 
> DMA_FROM_DEVICE, &addr))
> +   goto unmap_release;
> +   }
>
> desc[i].flags = cpu_to_le16(n < out_sgs ?
> 0 : VRING_DESC_F_WRITE);
> @@ -1369,10 +1373,12 @@ static int virtqueue_add_indirect_packed(struct 
> vring_virtqueue *vq,
> return 0;
>
>  unmap_release:
> -   err_idx = i;
> +   if (!vq->premapped) {
> +   err_idx = i;
>
> -   for (i = 0; i < err_idx; i++)
> -   vring_unmap_desc_packed(vq, &desc[i]);
> +   for (i = 0; i < err_idx; i++)
> +   vring_unmap_desc_packed(vq, &desc[i]);
> +   }
>
> kfree(desc);
>
> @@ -1447,9 +1453,13 @@ static inline int virtqueue_add_packed(struct 
> virtqueue *_vq,
> for (sg = sgs[n]; sg; sg = sg_next(sg)) {
> dma_addr_t addr;
>
> -   if (vring_map_one_sg(vq, sg, n < out_sgs ?
> -DMA_TO_DEVICE : DMA_FROM_DEVICE, 
> &addr))
> -   goto unmap_release;
> +   if (vq->premapped) {
> +   addr = sg_dma_address(sg);
> +   } else {
> +   if (vring_map_one_sg(vq, sg, n < out_sgs ?
> +DMA_TO_DEVICE : 
> DMA_FROM_DEVICE, &addr))
> +   goto unmap_release;
> +   }
>
> flags = cpu_to_le16(vq->packed.avail_used_flags |
> (++c == total_sg ? 0 : VRING_DESC_F_NEXT) 
> |
> @@ -1512,11 +1522,17 @@ static inline int virtqueue_add_packed(struct 
> virtqueue *_vq,
> return 0;
>
>  unmap_release:
> +   vq->packed.avail_used_flags = avail_used_flags;
> +
> +   if (vq->premapped) {

Similar to the split path, I think we can't hit vq->premapped here.

Thanks


> +   END_USE(vq);
> +   return -EIO;
> +   }
> +
> err_idx = i;
> i = head;
> curr = vq->free_head;
>
> -   vq->packed.avail_used_flags = avail_used_flags;
>
> for (n = 0; n < total_sg; n++) {
> if (i == err_idx)
> --
> 2.32.0.3.g01195cf9f
>


Re: [PATCH vhost v10 05/10] virtio_ring: split-detach: support return dma info to driver

2023-06-27 Thread Jason Wang
On Fri, Jun 2, 2023 at 5:22 PM Xuan Zhuo  wrote:
>
> Under the premapped mode, the driver needs to unmap the DMA address
> after receiving the buffer. The virtio core records the DMA address,
> so the driver needs a way to get the dma info from the virtio core.

On second thought, can we simply offload the tracking to the driver
itself? This is the way many other modern NIC drivers work.

In premapped mode, the DMA address is in fact supplied by the driver
itself, so it should have sufficient knowledge. And in some cases the
driver wants to optimize/merge/delay the unmapping, so the DMA
addresses returned by the virtio core are not even of interest in
those cases.
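
For example (a sketch, all names hypothetical): the driver can stash
the mapping in the token it already passes to virtqueue_add_*() as
data, and unmap on its own schedule:

	/* per-buffer metadata kept entirely by the driver */
	struct drv_buf_meta {
		void *buf;
		dma_addr_t addr;
		u32 len;
	};

	/* on completion, the token from virtqueue_get_buf() is the meta,
	 * so the driver can unmap immediately or batch/delay the unmap
	 */
	meta = virtqueue_get_buf(vq, &len);
	if (meta)
		dma_unmap_single(dev, meta->addr, meta->len,
				 DMA_FROM_DEVICE);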

Thanks



>
> A straightforward approach is to pass an array to the virtio core when
> calling virtqueue_get_buf(). However, it is not feasible when there are
> multiple DMA addresses in the descriptor chain, and the array size is
> unknown.
>
> To solve this problem, a helper be introduced. After calling
> virtqueue_get_buf(), the driver can call the helper to
> retrieve a dma info. If the helper function returns -EAGAIN, it means
> that there are more DMA addresses to be processed, and the driver should
> call the helper function again. To keep track of the current position in
> the chain, a cursor must be passed to the helper function, which is
> initialized by virtqueue_get_buf().
>
> Some processes are done inside this helper, so this helper MUST be
> called under the premapped mode.
>
> Signed-off-by: Xuan Zhuo 
> ---
>  drivers/virtio/virtio_ring.c | 118 ---
>  include/linux/virtio.h   |  11 
>  2 files changed, 119 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> index dc109fbc05a5..cdc4349f6066 100644
> --- a/drivers/virtio/virtio_ring.c
> +++ b/drivers/virtio/virtio_ring.c
> @@ -754,8 +754,95 @@ static bool virtqueue_kick_prepare_split(struct 
> virtqueue *_vq)
> return needs_kick;
>  }
>
> -static void detach_buf_split(struct vring_virtqueue *vq, unsigned int head,
> -void **ctx)
> +static void detach_cursor_init_split(struct vring_virtqueue *vq,
> +struct virtqueue_detach_cursor *cursor, 
> u16 head)
> +{
> +   struct vring_desc_extra *extra;
> +
> +   extra = &vq->split.desc_extra[head];
> +
> +   /* Clear data ptr. */
> +   vq->split.desc_state[head].data = NULL;
> +
> +   cursor->head = head;
> +   cursor->done = 0;
> +
> +   if (extra->flags & VRING_DESC_F_INDIRECT) {
> +   cursor->num = extra->len / sizeof(struct vring_desc);
> +   cursor->indirect = true;
> +   cursor->pos = 0;
> +
> +   vring_unmap_one_split(vq, head);
> +
> +   extra->next = vq->free_head;
> +
> +   vq->free_head = head;
> +
> +   /* Plus final descriptor */
> +   vq->vq.num_free++;
> +
> +   } else {
> +   cursor->indirect = false;
> +   cursor->pos = head;
> +   }
> +}
> +
> +static int virtqueue_detach_split(struct virtqueue *_vq, struct 
> virtqueue_detach_cursor *cursor,
> + dma_addr_t *addr, u32 *len, enum 
> dma_data_direction *dir)
> +{
> +   struct vring_virtqueue *vq = to_vvq(_vq);
> +   __virtio16 nextflag = cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_NEXT);
> +   int rc = -EAGAIN;
> +
> +   if (unlikely(cursor->done))
> +   return -EINVAL;
> +
> +   if (!cursor->indirect) {
> +   struct vring_desc_extra *extra;
> +   unsigned int i;
> +
> +   i = cursor->pos;
> +
> +   extra = &vq->split.desc_extra[i];
> +
> +   if (vq->split.vring.desc[i].flags & nextflag) {
> +   cursor->pos = extra->next;
> +   } else {
> +   extra->next = vq->free_head;
> +   vq->free_head = cursor->head;
> +   cursor->done = true;
> +   rc = 0;
> +   }
> +
> +   *addr = extra->addr;
> +   *len = extra->len;
> +   *dir = (extra->flags & VRING_DESC_F_WRITE) ? DMA_FROM_DEVICE 
> : DMA_TO_DEVICE;
> +
> +   vq->vq.num_free++;
> +
> +   } else {
> +   struct vring_desc *indir_desc, *desc;
> +   u16 flags;
> +
> +   indir_desc = vq->split.desc_state[cursor->head].indir_desc;
> +   desc = &indir_desc[cursor->pos];
> +
> +   flags = virtio16_to_cpu(vq->vq.vdev, desc->flags);
> +   *addr = virtio64_to_cpu(vq->vq.vdev, desc->addr);
> +   *len = virtio32_to_cpu(vq->vq.vdev, desc->len);
> +   *dir = (flags & VRING_DESC_F_WRITE) ? DMA_FROM_DEVICE : 
> DMA_TO_DEVICE;
> +
> +   if (++cursor->pos == cursor->num) {
> +   kfree(indir_desc);
> +  
