RE: [RFC] eal: add seqlock

2022-03-27 Thread Ananyev, Konstantin
> diff --git a/lib/eal/include/meson.build b/lib/eal/include/meson.build
> index 9700494816..48df5f1a21 100644
> --- a/lib/eal/include/meson.build
> +++ b/lib/eal/include/meson.build
> @@ -36,6 +36,7 @@ headers += files(
>  'rte_per_lcore.h',
>  'rte_random.h',
>  'rte_reciprocal.h',
> +'rte_seqlock.h',
>  'rte_service.h',
>  'rte_service_component.h',
>  'rte_string_fns.h',
> diff --git a/lib/eal/include/rte_seqlock.h b/lib/eal/include/rte_seqlock.h
> new file mode 100644
> index 00..b975ca848a
> --- /dev/null
> +++ b/lib/eal/include/rte_seqlock.h
> @@ -0,0 +1,84 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2022 Ericsson AB
> + */
> +
> +#ifndef _RTE_SEQLOCK_H_
> +#define _RTE_SEQLOCK_H_
> +
> +#include 
> +#include 
> +
> +#include 
> +#include 
> +#include 
> +
> +struct rte_seqlock {
> + uint64_t sn;
> + rte_spinlock_t lock;
> +};
> +
> +typedef struct rte_seqlock rte_seqlock_t;
> +
> +__rte_experimental
> +void
> +rte_seqlock_init(rte_seqlock_t *seqlock);

Probably worth having a static initializer too.


> +
> +__rte_experimental
> +static inline uint64_t
> +rte_seqlock_read_begin(const rte_seqlock_t *seqlock)
> +{
> + /* __ATOMIC_ACQUIRE to prevent loads after (in program order)
> +  * from happening before the sn load. Syncronizes-with the
> +  * store release in rte_seqlock_end().
> +  */
> + return __atomic_load_n(&seqlock->sn, __ATOMIC_ACQUIRE);
> +}
> +
> +__rte_experimental
> +static inline bool
> +rte_seqlock_read_retry(const rte_seqlock_t *seqlock, uint64_t begin_sn)
> +{
> + uint64_t end_sn;
> +
> + /* make sure the data loads happens before the sn load */
> + rte_atomic_thread_fence(__ATOMIC_ACQUIRE);

That's sort of a 'read_end', correct?
If so, shouldn't it be '__ATOMIC_RELEASE' instead here,
and
end_sn = __atomic_load_n(..., (__ATOMIC_ACQUIRE)
on the line below? 

> +
> + end_sn = __atomic_load_n(&seqlock->sn, __ATOMIC_RELAXED);
> +
> + return unlikely(begin_sn & 1 || begin_sn != end_sn);
> +}
> +
> +__rte_experimental
> +static inline void
> +rte_seqlock_write_begin(rte_seqlock_t *seqlock)
> +{
> + uint64_t sn;
> +
> + /* to synchronize with other writers */
> + rte_spinlock_lock(&seqlock->lock);
> +
> + sn = seqlock->sn + 1;
> +
> + __atomic_store_n(&seqlock->sn, sn, __ATOMIC_RELAXED);
> +
> + /* __ATOMIC_RELEASE to prevent stores after (in program order)
> +  * from happening before the sn store.
> +  */
> + rte_atomic_thread_fence(__ATOMIC_RELEASE);

I think it needs to be '__ATOMIC_ACQUIRE' here instead of '__ATOMIC_RELEASE'.

> +}
> +
> +__rte_experimental
> +static inline void
> +rte_seqlock_write_end(rte_seqlock_t *seqlock)
> +{
> + uint64_t sn;
> +
> + sn = seqlock->sn + 1;
> +
> + /* synchronizes-with the load acquire in rte_seqlock_begin() */
> + __atomic_store_n(&seqlock->sn, sn, __ATOMIC_RELEASE);
> +
> + rte_spinlock_unlock(&seqlock->lock);
> +}
> +



[PATCH v4 00/16] add virtio_blk device support to vdpa/ifc

2022-03-27 Thread Andy Pei
This patch set adds virtio_blk device support to the vdpa/ifc driver.
With a lot of similarities, I re-use part of vdpa/ifc driver.
Distinguish the virtio net and blk device by device id, and implement 
specific features and ops.
Add example to vdpa to support virtio_blk device.
To support blk device live migration, some modification to vhost lib.
Perform dev_conf op only under VHOST_USER_SET_VRING_CALL msg.

v4:
 add args "isblk" to vdpa example to specify a block device, fix some
 issues in the example.
 Make sure code specify for block device does not affect net device.
v3:
 Fix some compile issues.
v2:
  Fix some coding style issues.

Andy Pei (16):
  vdpa/ifc: add support for virtio blk device
  vhost: add vdpa ops for blk device
  vhost: add support for VHOST_USER_GET_CONFIG and VHOST_USER_SET_CONFIG
  vdpa/ifc: add blk ops for ifc device
  vdpa/ifc: add vdpa interrupt for blk device
  vdpa/ifc: add block device SW live-migration
  example/vdpa: add vdpa blk support in example
  usertools: add support for virtio blk device
  vdpa/ifc: set_vring_state op is mandatory, add set_vring_state for blk
device
  vdpa/ifc: add some log at VDPA launch before qemu connect
  vdpa/ifc: read virtio max_queues from hardware
  vdpa: add config space change interrupt register and handle for
virtio_blk
  vdpa/ifc: add is_blk flag to ifcvf_hw, and init is_blk during probe
  vdpa/ifc/base: for blk device, live migration register is different
from net device
  vdpa/ifc: make sure hardware last_avail_idx and last_used_idx is the
same when blk device pause
  vhost: make sure each queue callfd is configured

 drivers/vdpa/ifc/base/ifcvf.c|  42 +++-
 drivers/vdpa/ifc/base/ifcvf.h|  29 ++-
 drivers/vdpa/ifc/ifcvf_vdpa.c| 523 ---
 examples/vdpa/main.c |  61 -
 examples/vdpa/vdpa_blk_compact.h |  72 ++
 examples/vdpa/vhost_user.h   | 169 +
 lib/vhost/vdpa_driver.h  |   8 +-
 lib/vhost/vhost_user.c   |  65 +
 lib/vhost/vhost_user.h   |  15 ++
 usertools/dpdk-devbind.py|   8 +
 10 files changed, 937 insertions(+), 55 deletions(-)
 create mode 100644 examples/vdpa/vdpa_blk_compact.h
 create mode 100644 examples/vdpa/vhost_user.h

-- 
1.8.3.1



[PATCH v4 01/16] vdpa/ifc: add support for virtio blk device

2022-03-27 Thread Andy Pei
Re-use the vdpa/ifc code, distinguish blk and net device by pci_device_id.
Blk and net device are implemented with proper feature and ops.

Signed-off-by: Andy Pei 
Reviewed-by: Maxime Coquelin 
---
 drivers/vdpa/ifc/base/ifcvf.h | 16 +++-
 drivers/vdpa/ifc/ifcvf_vdpa.c | 92 +++
 2 files changed, 98 insertions(+), 10 deletions(-)

diff --git a/drivers/vdpa/ifc/base/ifcvf.h b/drivers/vdpa/ifc/base/ifcvf.h
index 573a35f..01522c6 100644
--- a/drivers/vdpa/ifc/base/ifcvf.h
+++ b/drivers/vdpa/ifc/base/ifcvf.h
@@ -5,8 +5,17 @@
 #ifndef _IFCVF_H_
 #define _IFCVF_H_
 
+#include 
 #include "ifcvf_osdep.h"
 
+#define IFCVF_NET  0
+#define IFCVF_BLK  1
+
+/* for BLK */
+#define IFCVF_BLK_TRANSITIONAL_DEVICE_ID0x1001
+#define IFCVF_BLK_MODERN_DEVICE_ID  0x1042
+#define IFCVF_BLK_DEVICE_ID 0x0002
+
 #define IFCVF_VENDOR_ID0x1AF4
 #define IFCVF_DEVICE_ID0x1041
 #define IFCVF_SUBSYS_VENDOR_ID 0x8086
@@ -57,7 +66,6 @@
 
 #define IFCVF_32_BIT_MASK  0x
 
-
 struct ifcvf_pci_cap {
u8 cap_vndr;/* Generic PCI field: PCI_CAP_ID_VNDR */
u8 cap_next;/* Generic PCI field: next ptr. */
@@ -126,7 +134,11 @@ struct ifcvf_hw {
u8 notify_region;
u32notify_off_multiplier;
struct ifcvf_pci_common_cfg *common_cfg;
-   struct ifcvf_net_config *dev_cfg;
+   union {
+   struct ifcvf_net_config *net_cfg;
+   struct virtio_blk_config *blk_cfg;
+   void *dev_cfg;
+   };
u8 *isr;
u16*notify_base;
u16*notify_addr[IFCVF_MAX_QUEUES * 2];
diff --git a/drivers/vdpa/ifc/ifcvf_vdpa.c b/drivers/vdpa/ifc/ifcvf_vdpa.c
index 9f05595..e3210a8 100644
--- a/drivers/vdpa/ifc/ifcvf_vdpa.c
+++ b/drivers/vdpa/ifc/ifcvf_vdpa.c
@@ -58,6 +58,7 @@ struct ifcvf_internal {
struct rte_vdpa_device *vdev;
uint16_t max_queues;
uint64_t features;
+   int device_type;
rte_atomic32_t started;
rte_atomic32_t dev_attached;
rte_atomic32_t running;
@@ -75,6 +76,12 @@ struct internal_list {
struct ifcvf_internal *internal;
 };
 
+/* vdpa device info includes device features and devcic operation. */
+struct rte_vdpa_dev_info {
+   uint64_t features;
+   struct rte_vdpa_dev_ops *ops;
+};
+
 TAILQ_HEAD(internal_list_head, internal_list);
 static struct internal_list_head internal_list =
TAILQ_HEAD_INITIALIZER(internal_list);
@@ -1167,6 +1174,48 @@ struct internal_list {
return 0;
 }
 
+static int16_t
+ifcvf_pci_get_device_type(struct rte_pci_device *pci_dev)
+{
+   uint16_t pci_device_id = pci_dev->id.device_id;
+   uint16_t device_id;
+
+   if (pci_device_id < 0x1000 || pci_device_id > 0x107f) {
+   DRV_LOG(ERR, "Probe device is not a virtio device\n");
+   return -1;
+   }
+
+   if (pci_device_id < 0x1040) {
+   /* Transitional devices: use the PCI subsystem device id as
+* virtio device id, same as legacy driver always did.
+*/
+   device_id = pci_dev->id.subsystem_device_id;
+   } else {
+   /* Modern devices: simply use PCI device id,
+* but start from 0x1040.
+*/
+   device_id = pci_device_id - 0x1040;
+   }
+
+   return device_id;
+}
+
+struct rte_vdpa_dev_info dev_info[] = {
+   {
+   .features = (1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) |
+   (1ULL << VIRTIO_NET_F_CTRL_VQ) |
+   (1ULL << VIRTIO_NET_F_STATUS) |
+   (1ULL << VHOST_USER_F_PROTOCOL_FEATURES) |
+   (1ULL << VHOST_F_LOG_ALL),
+   .ops = &ifcvf_ops,
+   },
+   {
+   .features = (1ULL << VHOST_USER_F_PROTOCOL_FEATURES) |
+   (1ULL << VHOST_F_LOG_ALL),
+   .ops = NULL,
+   },
+};
+
 static int
 ifcvf_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
struct rte_pci_device *pci_dev)
@@ -1178,6 +1227,7 @@ struct internal_list {
int sw_fallback_lm = 0;
struct rte_kvargs *kvlist = NULL;
int ret = 0;
+   int16_t device_id;
 
if (rte_eal_process_type() != RTE_PROC_PRIMARY)
return 0;
@@ -1227,13 +1277,24 @@ struct internal_list {
internal->configured = 0;
internal->max_queues = IFCVF_MAX_QUEUES;
features = ifcvf_get_features(&internal->hw);
-   internal->features = (features &
-   ~(1ULL << VIRTIO_F_IOMMU_PLATFORM)) |
-   (1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) |
-   (1ULL << VIRTIO_NET_F_CTRL_VQ) |
-   (1ULL << VIRTIO_NET_F_STATUS) |
-   (1ULL << VHOST_USER_F_PROTOCOL_FEATURES) |
-   (1ULL << VHOST_F_LOG_ALL);
+
+   device_id = ifcvf_pci_get_devi

[PATCH v4 02/16] vhost: add vdpa ops for blk device

2022-03-27 Thread Andy Pei
Get_config and set_config are necessary ops for blk device.
Add get_config and set_config ops to vdpa ops.

Signed-off-by: Andy Pei 
Reviewed-by: Maxime Coquelin 
---
 lib/vhost/vdpa_driver.h | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/lib/vhost/vdpa_driver.h b/lib/vhost/vdpa_driver.h
index 88138be..e59a834 100644
--- a/lib/vhost/vdpa_driver.h
+++ b/lib/vhost/vdpa_driver.h
@@ -72,8 +72,12 @@ struct rte_vdpa_dev_ops {
/** Reset statistics of the queue */
int (*reset_stats)(struct rte_vdpa_device *dev, int qid);
 
-   /** Reserved for future extension */
-   void *reserved[2];
+   /** Get the device configuration space */
+   int (*get_config)(int vid, uint8_t *config, uint32_t len);
+
+   /** Set the device configuration space */
+   int (*set_config)(int vid, uint8_t *config, uint32_t offset,
+ uint32_t size, uint32_t flags);
 };
 
 /**
-- 
1.8.3.1



[PATCH v4 03/16] vhost: add support for VHOST_USER_GET_CONFIG and VHOST_USER_SET_CONFIG

2022-03-27 Thread Andy Pei
Add support for VHOST_USER_GET_CONFIG and VHOST_USER_SET_CONFIG.
VHOST_USER_GET_CONFIG and VHOST_USER_SET_CONFIG message is only
supported by virtio blk VDPA device.

Signed-off-by: Andy Pei 
---
 lib/vhost/vhost_user.c | 50 ++
 lib/vhost/vhost_user.h | 15 +++
 2 files changed, 65 insertions(+)

diff --git a/lib/vhost/vhost_user.c b/lib/vhost/vhost_user.c
index 1d39067..b11fafd 100644
--- a/lib/vhost/vhost_user.c
+++ b/lib/vhost/vhost_user.c
@@ -80,6 +80,8 @@
[VHOST_USER_NET_SET_MTU]  = "VHOST_USER_NET_SET_MTU",
[VHOST_USER_SET_SLAVE_REQ_FD]  = "VHOST_USER_SET_SLAVE_REQ_FD",
[VHOST_USER_IOTLB_MSG]  = "VHOST_USER_IOTLB_MSG",
+   [VHOST_USER_GET_CONFIG]  = "VHOST_USER_GET_CONFIG",
+   [VHOST_USER_SET_CONFIG]  = "VHOST_USER_SET_CONFIG",
[VHOST_USER_CRYPTO_CREATE_SESS] = "VHOST_USER_CRYPTO_CREATE_SESS",
[VHOST_USER_CRYPTO_CLOSE_SESS] = "VHOST_USER_CRYPTO_CLOSE_SESS",
[VHOST_USER_POSTCOPY_ADVISE]  = "VHOST_USER_POSTCOPY_ADVISE",
@@ -2542,6 +2544,52 @@ static int is_vring_iotlb(struct virtio_net *dev,
 }
 
 static int
+vhost_user_get_config(struct virtio_net **pdev,
+   struct vhu_msg_context *ctx,
+   int main_fd __rte_unused)
+{
+   struct virtio_net *dev = *pdev;
+   struct rte_vdpa_device *vdpa_dev = dev->vdpa_dev;
+   int ret = 0;
+
+   if (vdpa_dev->ops->get_config) {
+   ret = vdpa_dev->ops->get_config(dev->vid,
+  ctx->msg.payload.cfg.region,
+  ctx->msg.payload.cfg.size);
+   if (ret != 0) {
+   ctx->msg.size = 0;
+   VHOST_LOG_CONFIG(ERR, "get_config() return error!\n");
+   }
+   } else {
+   VHOST_LOG_CONFIG(ERR, "get_config() not supportted!\n");
+   }
+
+   return RTE_VHOST_MSG_RESULT_REPLY;
+}
+
+static int
+vhost_user_set_config(struct virtio_net **pdev,
+   struct vhu_msg_context *ctx,
+   int main_fd __rte_unused)
+{
+   struct virtio_net *dev = *pdev;
+   struct rte_vdpa_device *vdpa_dev = dev->vdpa_dev;
+   int ret = 0;
+
+   if (vdpa_dev->ops->set_config) {
+   ret = vdpa_dev->ops->set_config(dev->vid,
+   ctx->msg.payload.cfg.region,
+   ctx->msg.payload.cfg.offset,
+   ctx->msg.payload.cfg.size,
+   ctx->msg.payload.cfg.flags);
+   } else {
+   VHOST_LOG_CONFIG(ERR, "set_config() not supportted!\n");
+   }
+
+   return ret == 0 ? RTE_VHOST_MSG_RESULT_OK : RTE_VHOST_MSG_RESULT_ERR;
+}
+
+static int
 vhost_user_iotlb_msg(struct virtio_net **pdev,
struct vhu_msg_context *ctx,
int main_fd __rte_unused)
@@ -2782,6 +2830,8 @@ typedef int (*vhost_message_handler_t)(struct virtio_net 
**pdev,
[VHOST_USER_NET_SET_MTU] = vhost_user_net_set_mtu,
[VHOST_USER_SET_SLAVE_REQ_FD] = vhost_user_set_req_fd,
[VHOST_USER_IOTLB_MSG] = vhost_user_iotlb_msg,
+   [VHOST_USER_GET_CONFIG] = vhost_user_get_config,
+   [VHOST_USER_SET_CONFIG] = vhost_user_set_config,
[VHOST_USER_POSTCOPY_ADVISE] = vhost_user_set_postcopy_advise,
[VHOST_USER_POSTCOPY_LISTEN] = vhost_user_set_postcopy_listen,
[VHOST_USER_POSTCOPY_END] = vhost_user_postcopy_end,
diff --git a/lib/vhost/vhost_user.h b/lib/vhost/vhost_user.h
index c946cc2..d3f014e 100644
--- a/lib/vhost/vhost_user.h
+++ b/lib/vhost/vhost_user.h
@@ -50,6 +50,8 @@
VHOST_USER_NET_SET_MTU = 20,
VHOST_USER_SET_SLAVE_REQ_FD = 21,
VHOST_USER_IOTLB_MSG = 22,
+   VHOST_USER_GET_CONFIG = 24,
+   VHOST_USER_SET_CONFIG = 25,
VHOST_USER_CRYPTO_CREATE_SESS = 26,
VHOST_USER_CRYPTO_CLOSE_SESS = 27,
VHOST_USER_POSTCOPY_ADVISE = 28,
@@ -125,6 +127,18 @@
uint16_t queue_size;
 } VhostUserInflight;
 
+#ifndef VHOST_USER_MAX_CONFIG_SIZE
+#define VHOST_USER_MAX_CONFIG_SIZE 256
+#endif
+
+/** Get/set config msg payload */
+struct vhost_user_config {
+   uint32_t offset;
+   uint32_t size;
+   uint32_t flags;
+   uint8_t region[VHOST_USER_MAX_CONFIG_SIZE];
+};
+
 typedef struct VhostUserMsg {
union {
uint32_t master; /* a VhostUserRequest value */
@@ -148,6 +162,7 @@
VhostUserCryptoSessionParam crypto_session;
VhostUserVringArea area;
VhostUserInflight inflight;
+   struct vhost_user_config cfg;
} payload;
/* Nothing should be added after the payload */
 } __rte_packed VhostUserMsg;
-- 
1.8.3.1



[PATCH v4 04/16] vdpa/ifc: add blk ops for ifc device

2022-03-27 Thread Andy Pei
For virtio blk device, re-use part of ifc driver ops.
Implement ifcvf_blk_get_config for virtio blk device.
Support VHOST_USER_PROTOCOL_F_CONFIG feature for virtio
blk device.

Signed-off-by: Andy Pei 
Reviewed-by: Maxime Coquelin 
---
 drivers/vdpa/ifc/base/ifcvf.h |  4 ++
 drivers/vdpa/ifc/ifcvf_vdpa.c | 85 ++-
 2 files changed, 88 insertions(+), 1 deletion(-)

diff --git a/drivers/vdpa/ifc/base/ifcvf.h b/drivers/vdpa/ifc/base/ifcvf.h
index 01522c6..769c603 100644
--- a/drivers/vdpa/ifc/base/ifcvf.h
+++ b/drivers/vdpa/ifc/base/ifcvf.h
@@ -66,6 +66,10 @@
 
 #define IFCVF_32_BIT_MASK  0x
 
+#ifndef VHOST_USER_PROTOCOL_F_CONFIG
+#define VHOST_USER_PROTOCOL_F_CONFIG   9
+#endif
+
 struct ifcvf_pci_cap {
u8 cap_vndr;/* Generic PCI field: PCI_CAP_ID_VNDR */
u8 cap_next;/* Generic PCI field: next ptr. */
diff --git a/drivers/vdpa/ifc/ifcvf_vdpa.c b/drivers/vdpa/ifc/ifcvf_vdpa.c
index e3210a8..8ee041f 100644
--- a/drivers/vdpa/ifc/ifcvf_vdpa.c
+++ b/drivers/vdpa/ifc/ifcvf_vdpa.c
@@ -1088,6 +1088,10 @@ struct rte_vdpa_dev_info {
 1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER | \
 1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD | \
 1ULL << VHOST_USER_PROTOCOL_F_STATUS)
+
+#define VDPA_BLK_PROTOCOL_FEATURES \
+   (1ULL << VHOST_USER_PROTOCOL_F_CONFIG)
+
 static int
 ifcvf_get_protocol_features(struct rte_vdpa_device *vdev, uint64_t *features)
 {
@@ -1200,6 +1204,85 @@ struct rte_vdpa_dev_info {
return device_id;
 }
 
+static int
+ifcvf_blk_get_config(int vid, uint8_t *config, uint32_t len)
+{
+   struct virtio_blk_config *dev_cfg;
+   struct ifcvf_internal *internal;
+   struct rte_vdpa_device *vdev;
+   struct internal_list *list;
+   uint32_t i;
+   uint64_t capacity = 0;
+   uint8_t *byte;
+
+   if (len < sizeof(struct virtio_blk_config)) {
+   DRV_LOG(ERR, "Invalid len: %u, required: %u",
+   len, (uint32_t)sizeof(struct virtio_blk_config));
+   return -1;
+   }
+
+   vdev = rte_vhost_get_vdpa_device(vid);
+   list = find_internal_resource_by_vdev(vdev);
+   if (list == NULL) {
+   DRV_LOG(ERR, "Invalid vDPA device: %p", vdev);
+   return -1;
+   }
+
+   internal = list->internal;
+
+   for (i = 0; i < sizeof(struct virtio_blk_config); i++)
+   config[i] = *((u8 *)internal->hw.blk_cfg + i);
+
+   dev_cfg = (struct virtio_blk_config *)internal->hw.blk_cfg;
+
+   /* cannot read 64-bit register in one attempt, so read byte by byte. */
+   for (i = 0; i < sizeof(internal->hw.blk_cfg->capacity); i++) {
+   byte = (uint8_t *)&internal->hw.blk_cfg->capacity + i;
+   capacity |= (uint64_t)*byte << (i * 8);
+   }
+   DRV_LOG(INFO, "capacity  : %"PRIu64"G", capacity >> 21);
+
+   DRV_LOG(INFO, "size_max  : 0x%08x", dev_cfg->size_max);
+   DRV_LOG(INFO, "seg_max   : 0x%08x", dev_cfg->seg_max);
+   DRV_LOG(INFO, "blk_size  : 0x%08x", dev_cfg->blk_size);
+   DRV_LOG(INFO, "geometry");
+   DRV_LOG(INFO, "  cylinders: %u", dev_cfg->geometry.cylinders);
+   DRV_LOG(INFO, "  heads: %u", dev_cfg->geometry.heads);
+   DRV_LOG(INFO, "  sectors  : %u", dev_cfg->geometry.sectors);
+   DRV_LOG(INFO, "num_queues: 0x%08x", dev_cfg->num_queues);
+
+   DRV_LOG(INFO, "config: [%x] [%x] [%x] [%x] [%x] [%x] [%x] [%x]\n",
+   config[0], config[1], config[2], config[3], config[4],
+   config[5], config[6], config[7]);
+   return 0;
+}
+
+static int
+ifcvf_blk_get_protocol_features(struct rte_vdpa_device *vdev,
+   uint64_t *features)
+{
+   RTE_SET_USED(vdev);
+
+   *features = VDPA_SUPPORTED_PROTOCOL_FEATURES;
+   *features |= VDPA_BLK_PROTOCOL_FEATURES;
+   return 0;
+}
+
+static struct rte_vdpa_dev_ops ifcvf_blk_ops = {
+   .get_queue_num = ifcvf_get_queue_num,
+   .get_features = ifcvf_get_vdpa_features,
+   .set_features = ifcvf_set_features,
+   .get_protocol_features = ifcvf_blk_get_protocol_features,
+   .dev_conf = ifcvf_dev_config,
+   .dev_close = ifcvf_dev_close,
+   .set_vring_state = NULL,
+   .migration_done = NULL,
+   .get_vfio_group_fd = ifcvf_get_vfio_group_fd,
+   .get_vfio_device_fd = ifcvf_get_vfio_device_fd,
+   .get_notify_area = ifcvf_get_notify_area,
+   .get_config = ifcvf_blk_get_config,
+};
+
 struct rte_vdpa_dev_info dev_info[] = {
{
.features = (1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) |
@@ -1212,7 +1295,7 @@ struct rte_vdpa_dev_info dev_info[] = {
{
.features = (1ULL << VHOST_USER_F_PROTOCOL_FEATURES) |
(1ULL << VHOST_F_LOG_ALL),
-   .ops = NULL,
+   .ops = &ifcvf_blk_ops,
},
 };
 
-- 
1.8.3.1



[PATCH v4 05/16] vdpa/ifc: add vdpa interrupt for blk device

2022-03-27 Thread Andy Pei
For the block device type, we have to relay
the commands on all queues.

Signed-off-by: Andy Pei 
---
 drivers/vdpa/ifc/ifcvf_vdpa.c | 46 ---
 1 file changed, 35 insertions(+), 11 deletions(-)

diff --git a/drivers/vdpa/ifc/ifcvf_vdpa.c b/drivers/vdpa/ifc/ifcvf_vdpa.c
index 8ee041f..8d104b7 100644
--- a/drivers/vdpa/ifc/ifcvf_vdpa.c
+++ b/drivers/vdpa/ifc/ifcvf_vdpa.c
@@ -370,24 +370,48 @@ struct rte_vdpa_dev_info {
irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
irq_set->start = 0;
fd_ptr = (int *)&irq_set->data;
+   /* The first interrupt is for the configure space change notification */
fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] =
rte_intr_fd_get(internal->pdev->intr_handle);
 
for (i = 0; i < nr_vring; i++)
internal->intr_fd[i] = -1;
 
-   for (i = 0; i < nr_vring; i++) {
-   rte_vhost_get_vhost_vring(internal->vid, i, &vring);
-   fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] = vring.callfd;
-   if ((i & 1) == 0 && m_rx == true) {
-   fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
-   if (fd < 0) {
-   DRV_LOG(ERR, "can't setup eventfd: %s",
-   strerror(errno));
-   return -1;
+   if (internal->device_type == IFCVF_NET) {
+   for (i = 0; i < nr_vring; i++) {
+   rte_vhost_get_vhost_vring(internal->vid, i, &vring);
+   fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] = vring.callfd;
+   if ((i & 1) == 0 && m_rx == true) {
+   /* For the net we only need to relay rx queue,
+* which will change the mem of VM.
+*/
+   fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
+   if (fd < 0) {
+   DRV_LOG(ERR, "can't setup eventfd: %s",
+   strerror(errno));
+   return -1;
+   }
+   internal->intr_fd[i] = fd;
+   fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] = fd;
+   }
+   }
+   } else if (internal->device_type == IFCVF_BLK) {
+   for (i = 0; i < nr_vring; i++) {
+   rte_vhost_get_vhost_vring(internal->vid, i, &vring);
+   fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] = vring.callfd;
+   if (m_rx == true) {
+   /* For the blk we need to relay all the read cmd
+* of each queue
+*/
+   fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
+   if (fd < 0) {
+   DRV_LOG(ERR, "can't setup eventfd: %s",
+   strerror(errno));
+   return -1;
+   }
+   internal->intr_fd[i] = fd;
+   fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] = fd;
}
-   internal->intr_fd[i] = fd;
-   fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] = fd;
}
}
 
-- 
1.8.3.1



[PATCH v4 06/16] vdpa/ifc: add block device SW live-migration

2022-03-27 Thread Andy Pei
Add SW live-migration support to block device.
Add dirty page logging to block device.

Signed-off-by: Andy Pei 
---
 drivers/vdpa/ifc/base/ifcvf.c |   4 +-
 drivers/vdpa/ifc/base/ifcvf.h |   6 ++
 drivers/vdpa/ifc/ifcvf_vdpa.c | 128 +++---
 3 files changed, 115 insertions(+), 23 deletions(-)

diff --git a/drivers/vdpa/ifc/base/ifcvf.c b/drivers/vdpa/ifc/base/ifcvf.c
index d10c1fd..e417c50 100644
--- a/drivers/vdpa/ifc/base/ifcvf.c
+++ b/drivers/vdpa/ifc/base/ifcvf.c
@@ -191,7 +191,7 @@
IFCVF_WRITE_REG32(val >> 32, hi);
 }
 
-STATIC int
+int
 ifcvf_hw_enable(struct ifcvf_hw *hw)
 {
struct ifcvf_pci_common_cfg *cfg;
@@ -240,7 +240,7 @@
return 0;
 }
 
-STATIC void
+void
 ifcvf_hw_disable(struct ifcvf_hw *hw)
 {
u32 i;
diff --git a/drivers/vdpa/ifc/base/ifcvf.h b/drivers/vdpa/ifc/base/ifcvf.h
index 769c603..6dd7925 100644
--- a/drivers/vdpa/ifc/base/ifcvf.h
+++ b/drivers/vdpa/ifc/base/ifcvf.h
@@ -179,4 +179,10 @@ struct ifcvf_hw {
 u64
 ifcvf_get_queue_notify_off(struct ifcvf_hw *hw, int qid);
 
+int
+ifcvf_hw_enable(struct ifcvf_hw *hw);
+
+void
+ifcvf_hw_disable(struct ifcvf_hw *hw);
+
 #endif /* _IFCVF_H_ */
diff --git a/drivers/vdpa/ifc/ifcvf_vdpa.c b/drivers/vdpa/ifc/ifcvf_vdpa.c
index 8d104b7..a23dc2d 100644
--- a/drivers/vdpa/ifc/ifcvf_vdpa.c
+++ b/drivers/vdpa/ifc/ifcvf_vdpa.c
@@ -345,6 +345,56 @@ struct rte_vdpa_dev_info {
}
 }
 
+static void
+vdpa_ifcvf_blk_pause(struct ifcvf_internal *internal)
+{
+   struct ifcvf_hw *hw = &internal->hw;
+   struct rte_vhost_vring vq;
+   int i, vid;
+   uint64_t features = 0;
+   uint64_t log_base = 0, log_size = 0;
+   uint64_t len;
+
+   vid = internal->vid;
+
+   if (internal->device_type == IFCVF_BLK) {
+   for (i = 0; i < hw->nr_vring; i++) {
+   rte_vhost_get_vhost_vring(internal->vid, i, &vq);
+   while (vq.avail->idx != vq.used->idx) {
+   ifcvf_notify_queue(hw, i);
+   usleep(10);
+   }
+   hw->vring[i].last_avail_idx = vq.avail->idx;
+   hw->vring[i].last_used_idx = vq.used->idx;
+   }
+   }
+
+   ifcvf_hw_disable(hw);
+
+   for (i = 0; i < hw->nr_vring; i++)
+   rte_vhost_set_vring_base(vid, i, hw->vring[i].last_avail_idx,
+   hw->vring[i].last_used_idx);
+
+   if (internal->sw_lm)
+   return;
+
+   rte_vhost_get_negotiated_features(vid, &features);
+   if (RTE_VHOST_NEED_LOG(features)) {
+   ifcvf_disable_logging(hw);
+   rte_vhost_get_log_base(internal->vid, &log_base, &log_size);
+   rte_vfio_container_dma_unmap(internal->vfio_container_fd,
+   log_base, IFCVF_LOG_BASE, log_size);
+   /*
+* IFCVF marks dirty memory pages for only packet buffer,
+* SW helps to mark the used ring as dirty after device stops.
+*/
+   for (i = 0; i < hw->nr_vring; i++) {
+   len = IFCVF_USED_RING_LEN(hw->vring[i].size);
+   rte_vhost_log_used_vring(vid, i, 0, len);
+   }
+   }
+}
+
 #define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \
sizeof(int) * (IFCVF_MAX_QUEUES * 2 + 1))
 static int
@@ -659,15 +709,22 @@ struct rte_vdpa_dev_info {
}
hw->vring[i].avail = gpa;
 
-   /* Direct I/O for Tx queue, relay for Rx queue */
-   if (i & 1) {
-   gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.used);
-   if (gpa == 0) {
-   DRV_LOG(ERR, "Fail to get GPA for used ring.");
-   return -1;
+   if (internal->device_type == IFCVF_NET) {
+   /* Direct I/O for Tx queue, relay for Rx queue */
+   if (i & 1) {
+   gpa = hva_to_gpa(vid, 
(uint64_t)(uintptr_t)vq.used);
+   if (gpa == 0) {
+   DRV_LOG(ERR, "Fail to get GPA for used 
ring.");
+   return -1;
+   }
+   hw->vring[i].used = gpa;
+   } else {
+   hw->vring[i].used = m_vring_iova +
+   (char *)internal->m_vring[i].used -
+   (char *)internal->m_vring[i].desc;
}
-   hw->vring[i].used = gpa;
-   } else {
+   } else if (internal->device_type == IFCVF_BLK) {
+   /* BLK: relay every queue */
hw->vring[i].used = m_vring_iova +
(ch

[PATCH v4 07/16] example/vdpa: add vdpa blk support in example

2022-03-27 Thread Andy Pei
Add virtio blk device support to vdpa example.

Signed-off-by: Andy Pei 
---
 examples/vdpa/main.c |  61 +-
 examples/vdpa/vdpa_blk_compact.h |  72 +
 examples/vdpa/vhost_user.h   | 169 +++
 3 files changed, 301 insertions(+), 1 deletion(-)
 create mode 100644 examples/vdpa/vdpa_blk_compact.h
 create mode 100644 examples/vdpa/vhost_user.h

diff --git a/examples/vdpa/main.c b/examples/vdpa/main.c
index 5ab0765..1c809ab 100644
--- a/examples/vdpa/main.c
+++ b/examples/vdpa/main.c
@@ -20,6 +20,7 @@
 #include 
 #include 
 #include 
+#include "vdpa_blk_compact.h"
 
 #define MAX_PATH_LEN 128
 #define MAX_VDPA_SAMPLE_PORTS 1024
@@ -41,6 +42,7 @@ struct vdpa_port {
 static int devcnt;
 static int interactive;
 static int client_mode;
+static int isblk;
 
 /* display usage */
 static void
@@ -49,7 +51,8 @@ struct vdpa_port {
printf("Usage: %s [EAL options] -- "
 "  --interactive|-i: run in interactive 
mode.\n"
 "  --iface : specify the path prefix 
of the socket files, e.g. /tmp/vhost-user-.\n"
-"  --client: register a vhost-user socket 
as client mode.\n",
+"  --client: register a vhost-user socket 
as client mode.\n"
+"  --isblk: device is a block device, e.g. 
virtio_blk device.\n",
 prgname);
 }
 
@@ -61,6 +64,7 @@ struct vdpa_port {
{"iface", required_argument, NULL, 0},
{"interactive", no_argument, &interactive, 1},
{"client", no_argument, &client_mode, 1},
+   {"isblk", no_argument, &isblk, 1},
{NULL, 0, 0, 0},
};
int opt, idx;
@@ -159,6 +163,52 @@ struct vdpa_port {
 };
 
 static int
+vdpa_blk_device_set_features_and_protocol(const char *path)
+{
+   uint64_t protocol_features = 0;
+   int ret;
+
+   ret = rte_vhost_driver_set_features(path, VHOST_BLK_FEATURES_BASE);
+   if (ret != 0) {
+   RTE_LOG(ERR, VDPA,
+   "rte_vhost_driver_set_features for %s failed.\n",
+   path);
+   goto out;
+   }
+
+   ret = rte_vhost_driver_disable_features(path,
+   VHOST_VDPA_BLK_DISABLED_FEATURES);
+   if (ret != 0) {
+   RTE_LOG(ERR, VDPA,
+   "rte_vhost_driver_disable_features for %s failed.\n",
+   path);
+   goto out;
+   }
+
+   ret = rte_vhost_driver_get_protocol_features(path, &protocol_features);
+   if (ret != 0) {
+   RTE_LOG(ERR, VDPA,
+   "rte_vhost_driver_get_protocol_features for %s 
failed.\n",
+   path);
+   goto out;
+   }
+
+   protocol_features |= (1ULL << VHOST_USER_PROTOCOL_F_CONFIG);
+   protocol_features |= (1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD);
+
+   ret = rte_vhost_driver_set_protocol_features(path, protocol_features);
+   if (ret != 0) {
+   RTE_LOG(ERR, VDPA,
+   "rte_vhost_driver_set_protocol_features for %s 
failed.\n",
+   path);
+   goto out;
+   }
+
+out:
+   return ret;
+}
+
+static int
 start_vdpa(struct vdpa_port *vport)
 {
int ret;
@@ -192,6 +242,15 @@ struct vdpa_port {
"attach vdpa device failed: %s\n",
socket_path);
 
+   if (isblk) {
+   RTE_LOG(NOTICE, VDPA, "is a blk device\n");
+   ret = vdpa_blk_device_set_features_and_protocol(socket_path);
+   if (ret != 0)
+   rte_exit(EXIT_FAILURE,
+   "set vhost blk driver features and protocol 
features failed: %s\n",
+   socket_path);
+   }
+
if (rte_vhost_driver_start(socket_path) < 0)
rte_exit(EXIT_FAILURE,
"start vhost driver failed: %s\n",
diff --git a/examples/vdpa/vdpa_blk_compact.h b/examples/vdpa/vdpa_blk_compact.h
new file mode 100644
index 000..e7c0f22
--- /dev/null
+++ b/examples/vdpa/vdpa_blk_compact.h
@@ -0,0 +1,72 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2022 Intel Corporation
+ */
+
+#ifndef _VDPA_BLK_COMPACT_H_
+#define _VDPA_BLK_COMPACT_H_
+
+/**
+ * @file
+ *
+ * Device specific vhost lib
+ */
+
+#include 
+
+#include 
+#include 
+
+/* Feature bits */
+#define VIRTIO_BLK_F_SIZE_MAX 1/* Indicates maximum segment size */
+#define VIRTIO_BLK_F_SEG_MAX  2/* Indicates maximum # of segments */
+#define VIRTIO_BLK_F_GEOMETRY 4/* Legacy geometry available  */
+#define VIRTIO_BLK_F_RO   5/* Disk is read-only */
+#define VIRTIO_BLK_F_BLK_SIZE 6/* Block size of disk is available */
+#define VIRTIO_BLK_F_TOPOLOGY 10   /* Top

[PATCH v4 08/16] usertools: add support for virtio blk device

2022-03-27 Thread Andy Pei
Signed-off-by: Andy Pei 
---
 usertools/dpdk-devbind.py | 8 
 1 file changed, 8 insertions(+)

diff --git a/usertools/dpdk-devbind.py b/usertools/dpdk-devbind.py
index ace4627..cbe336f 100755
--- a/usertools/dpdk-devbind.py
+++ b/usertools/dpdk-devbind.py
@@ -14,6 +14,8 @@
 from os.path import join as path_join
 
 # The PCI base class for all devices
+virtio_blk_class = {'Class': '01', 'Vendor': "1af4", 'Device': '1001',
+'SVendor': '8086', 'SDevice': '0002'}
 network_class = {'Class': '02', 'Vendor': None, 'Device': None,
  'SVendor': None, 'SDevice': None}
 acceleration_class = {'Class': '12', 'Vendor': None, 'Device': None,
@@ -72,6 +74,7 @@
 cn9k_ree = {'Class': '08', 'Vendor': '177d', 'Device': 'a0f4',
  'SVendor': None, 'SDevice': None}
 
+virtio_blk_devices = [virtio_blk_class]
 network_devices = [network_class, cavium_pkx, avp_vnic, ifpga_class]
 baseband_devices = [acceleration_class]
 crypto_devices = [encryption_class, intel_processor_class]
@@ -587,6 +590,9 @@ def show_status():
 Displays to the user what devices are bound to the igb_uio driver, the
 kernel driver or to no driver'''
 
+if status_dev in ["virtio_blk", "all"]:
+show_device_status(virtio_blk_devices, "virtio_blk")
+
 if status_dev in ["net", "all"]:
 show_device_status(network_devices, "Network", if_field=True)
 
@@ -746,6 +752,7 @@ def do_arg_actions():
 if b_flag is not None:
 clear_data()
 # refresh if we have changed anything
+get_device_details(virtio_blk_devices)
 get_device_details(network_devices)
 get_device_details(baseband_devices)
 get_device_details(crypto_devices)
@@ -769,6 +776,7 @@ def main():
 parse_args()
 check_modules()
 clear_data()
+get_device_details(virtio_blk_devices)
 get_device_details(network_devices)
 get_device_details(baseband_devices)
 get_device_details(crypto_devices)
-- 
1.8.3.1



[PATCH v4 09/16] vdpa/ifc: set_vring_state op is mandatory, add set_vring_state for blk device

2022-03-27 Thread Andy Pei
Signed-off-by: Andy Pei 
---
 drivers/vdpa/ifc/ifcvf_vdpa.c | 12 +++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/drivers/vdpa/ifc/ifcvf_vdpa.c b/drivers/vdpa/ifc/ifcvf_vdpa.c
index a23dc2d..28191e4 100644
--- a/drivers/vdpa/ifc/ifcvf_vdpa.c
+++ b/drivers/vdpa/ifc/ifcvf_vdpa.c
@@ -1368,6 +1368,16 @@ struct rte_vdpa_dev_info {
 }
 
 static int
+ifcvf_blk_set_vring_state(int vid, int vring, int state)
+{
+   RTE_SET_USED(vid);
+   RTE_SET_USED(vring);
+   RTE_SET_USED(state);
+
+   return 0;
+}
+
+static int
 ifcvf_blk_get_protocol_features(struct rte_vdpa_device *vdev,
uint64_t *features)
 {
@@ -1385,7 +1395,7 @@ struct rte_vdpa_dev_info {
.get_protocol_features = ifcvf_blk_get_protocol_features,
.dev_conf = ifcvf_dev_config,
.dev_close = ifcvf_dev_close,
-   .set_vring_state = NULL,
+   .set_vring_state = ifcvf_blk_set_vring_state,
.migration_done = NULL,
.get_vfio_group_fd = ifcvf_get_vfio_group_fd,
.get_vfio_device_fd = ifcvf_get_vfio_device_fd,
-- 
1.8.3.1



[PATCH v4 10/16] vdpa/ifc: add some log at VDPA launch before qemu connect

2022-03-27 Thread Andy Pei
Signed-off-by: Andy Pei 
---
 drivers/vdpa/ifc/ifcvf_vdpa.c | 29 +
 1 file changed, 29 insertions(+)

diff --git a/drivers/vdpa/ifc/ifcvf_vdpa.c b/drivers/vdpa/ifc/ifcvf_vdpa.c
index 28191e4..9bc2f47 100644
--- a/drivers/vdpa/ifc/ifcvf_vdpa.c
+++ b/drivers/vdpa/ifc/ifcvf_vdpa.c
@@ -1431,6 +1431,9 @@ struct rte_vdpa_dev_info dev_info[] = {
struct rte_kvargs *kvlist = NULL;
int ret = 0;
int16_t device_id;
+   __u64 capacity = 0;
+   uint8_t *byte;
+   uint32_t i;
 
if (rte_eal_process_type() != RTE_PROC_PRIMARY)
return 0;
@@ -1497,6 +1500,32 @@ struct rte_vdpa_dev_info dev_info[] = {
internal->features = features &
~(1ULL << VIRTIO_F_IOMMU_PLATFORM);
internal->features |= dev_info[IFCVF_BLK].features;
+
+   /**
+   ** cannot read 64-bit register in one attempt,
+   ** so read byte by byte.
+   **/
+   for (i = 0; i < sizeof(internal->hw.blk_cfg->capacity); i++) {
+   byte = (uint8_t *)&internal->hw.blk_cfg->capacity + i;
+   capacity |= (__u64)*byte << (i * 8);
+   }
+   DRV_LOG(INFO, "capacity  : %quG", capacity >> 21);
+
+   DRV_LOG(INFO, "size_max  : 0x%08x",
+   internal->hw.blk_cfg->size_max);
+   DRV_LOG(INFO, "seg_max   : 0x%08x",
+   internal->hw.blk_cfg->seg_max);
+   DRV_LOG(INFO, "blk_size  : 0x%08x",
+   internal->hw.blk_cfg->blk_size);
+   DRV_LOG(INFO, "geometry");
+   DRV_LOG(INFO, "cylinders: %u",
+   internal->hw.blk_cfg->geometry.cylinders);
+   DRV_LOG(INFO, "heads: %u",
+   internal->hw.blk_cfg->geometry.heads);
+   DRV_LOG(INFO, "sectors  : %u",
+   internal->hw.blk_cfg->geometry.sectors);
+   DRV_LOG(INFO, "num_queues: 0x%08x",
+   internal->hw.blk_cfg->num_queues);
}
 
list->internal = internal;
-- 
1.8.3.1



[PATCH v4 11/16] vdpa/ifc: read virtio max_queues from hardware

2022-03-27 Thread Andy Pei
In the original code, max_queues is set to IFCVF_MAX_QUEUES.
In the new code, max_queues is the minimum of IFCVF_MAX_QUEUES and hardware num_queues.

Signed-off-by: Andy Pei 
---
 drivers/vdpa/ifc/ifcvf_vdpa.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/drivers/vdpa/ifc/ifcvf_vdpa.c b/drivers/vdpa/ifc/ifcvf_vdpa.c
index 9bc2f47..20a0b01 100644
--- a/drivers/vdpa/ifc/ifcvf_vdpa.c
+++ b/drivers/vdpa/ifc/ifcvf_vdpa.c
@@ -1526,6 +1526,10 @@ struct rte_vdpa_dev_info dev_info[] = {
internal->hw.blk_cfg->geometry.sectors);
DRV_LOG(INFO, "num_queues: 0x%08x",
internal->hw.blk_cfg->num_queues);
+
+   /* reset max_queue here, to minimum modification */
+   internal->max_queues = RTE_MIN(IFCVF_MAX_QUEUES,
+   internal->hw.blk_cfg->num_queues);
}
 
list->internal = internal;
-- 
1.8.3.1



[PATCH v4 12/16] vdpa: add config space change interrupt register and handle for virtio_blk

2022-03-27 Thread Andy Pei
Create a thread to poll and relay config space change interrupt.
Use VHOST_USER_SLAVE_CONFIG_CHANGE_MSG to inform qemu.

Signed-off-by: Andy Pei 
---
 drivers/vdpa/ifc/ifcvf_vdpa.c | 113 ++
 1 file changed, 113 insertions(+)

diff --git a/drivers/vdpa/ifc/ifcvf_vdpa.c b/drivers/vdpa/ifc/ifcvf_vdpa.c
index 20a0b01..826b408 100644
--- a/drivers/vdpa/ifc/ifcvf_vdpa.c
+++ b/drivers/vdpa/ifc/ifcvf_vdpa.c
@@ -53,7 +53,9 @@ struct ifcvf_internal {
int vfio_group_fd;
int vfio_dev_fd;
pthread_t tid;  /* thread for notify relay */
+   pthread_t intr_tid; /* thread for intr relay */
int epfd;
+   int csc_fd;
int vid;
struct rte_vdpa_device *vdev;
uint16_t max_queues;
@@ -608,6 +610,108 @@ struct rte_vdpa_dev_info {
return 0;
 }
 
+static void
+virtio_interrupt_handler(struct ifcvf_internal *internal)
+{
+   int vid = internal->vid;
+   int ret;
+
+   ret = rte_vhost_slave_config_change(vid, 1);
+   if (ret)
+   DRV_LOG(ERR, "failed to notify the guest about configuration 
space change.");
+
+   return;
+}
+
+static void *
+intr_relay(void *arg)
+{
+   struct ifcvf_internal *internal = (struct ifcvf_internal *)arg;
+   struct epoll_event csc_event;
+   struct epoll_event ev;
+   uint64_t buf;
+   int nbytes;
+   int csc_fd, csc_val = 0;
+
+   csc_fd = epoll_create(1);
+   if (csc_fd < 0) {
+   DRV_LOG(ERR, "failed to create epoll for config space change.");
+   return NULL;
+   }
+
+   ev.events = EPOLLIN | EPOLLPRI | EPOLLRDHUP | EPOLLHUP;
+   ev.data.fd = rte_intr_fd_get(internal->pdev->intr_handle);
+   if (epoll_ctl(csc_fd, EPOLL_CTL_ADD,
+   rte_intr_fd_get(internal->pdev->intr_handle), &ev) < 0) {
+   DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
+   return NULL;
+   }
+
+   internal->csc_fd = csc_fd;
+
+   for (;;) {
+   csc_val = epoll_wait(csc_fd, &csc_event, 1, -1);
+   if (csc_val < 0) {
+   if (errno == EINTR)
+   continue;
+   DRV_LOG(ERR, "epoll_wait return fail\n");
+   return NULL;
+   } else if (csc_val == 0) {
+   continue;
+   } else {
+   /* csc_val > 0 */
+   nbytes = read(csc_event.data.fd, &buf, 8);
+   if (nbytes < 0) {
+   if (errno == EINTR || errno == EWOULDBLOCK)
+   continue;
+   DRV_LOG(ERR, "Error reading from file 
descriptor %d: %s\n",
+   csc_event.data.fd,
+   strerror(errno));
+   return NULL;
+   } else if (nbytes == 0) {
+   DRV_LOG(ERR, "Read nothing from file descriptor 
%d\n",
+   csc_event.data.fd);
+   continue;
+   } else {
+   virtio_interrupt_handler(internal);
+   }
+   }
+   }
+   return NULL;
+}
+
+static int
+setup_intr_relay(struct ifcvf_internal *internal)
+{
+   int ret;
+
+   ret = pthread_create(&internal->intr_tid, NULL, intr_relay,
+   (void *)internal);
+   if (ret) {
+   DRV_LOG(ERR, "failed to create notify relay pthread.");
+   return -1;
+   }
+   return 0;
+}
+
+static int
+unset_intr_relay(struct ifcvf_internal *internal)
+{
+   void *status;
+
+   if (internal->intr_tid) {
+   pthread_cancel(internal->intr_tid);
+   pthread_join(internal->intr_tid, &status);
+   }
+   internal->intr_tid = 0;
+
+   if (internal->csc_fd >= 0)
+   close(internal->csc_fd);
+   internal->csc_fd = -1;
+
+   return 0;
+}
+
 static int
 update_datapath(struct ifcvf_internal *internal)
 {
@@ -634,10 +738,16 @@ struct rte_vdpa_dev_info {
if (ret)
goto err;
 
+   ret = setup_intr_relay(internal);
+   if (ret)
+   goto err;
+
rte_atomic32_set(&internal->running, 1);
} else if (rte_atomic32_read(&internal->running) &&
   (!rte_atomic32_read(&internal->started) ||
!rte_atomic32_read(&internal->dev_attached))) {
+   ret = unset_intr_relay(internal);
+
ret = unset_notify_relay(internal);
if (ret)
goto err;
@@ -958,6 +1068,9 @@ struct rte_vdpa_dev_info {
vdpa_ifcvf_stop(internal);
else if (internal->device_type == IFCVF_BLK)
vdpa_ifcvf_blk_pause(inte

[PATCH v4 13/16] vdpa/ifc: add is_blk flag to ifcvf_hw, and init is_blk during probe

2022-03-27 Thread Andy Pei
Signed-off-by: Andy Pei 
---
 drivers/vdpa/ifc/base/ifcvf.h | 1 +
 drivers/vdpa/ifc/ifcvf_vdpa.c | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/drivers/vdpa/ifc/base/ifcvf.h b/drivers/vdpa/ifc/base/ifcvf.h
index 6dd7925..8e602af 100644
--- a/drivers/vdpa/ifc/base/ifcvf.h
+++ b/drivers/vdpa/ifc/base/ifcvf.h
@@ -149,6 +149,7 @@ struct ifcvf_hw {
u8 *lm_cfg;
struct vring_info vring[IFCVF_MAX_QUEUES * 2];
u8 nr_vring;
+   u8 is_blk;
struct ifcvf_pci_mem_resource mem_resource[IFCVF_PCI_MAX_RESOURCE];
 };
 
diff --git a/drivers/vdpa/ifc/ifcvf_vdpa.c b/drivers/vdpa/ifc/ifcvf_vdpa.c
index 826b408..95538c1 100644
--- a/drivers/vdpa/ifc/ifcvf_vdpa.c
+++ b/drivers/vdpa/ifc/ifcvf_vdpa.c
@@ -1605,11 +1605,13 @@ struct rte_vdpa_dev_info dev_info[] = {
 
if (device_id == VIRTIO_ID_NET) {
internal->device_type = IFCVF_NET;
+   internal->hw.is_blk = IFCVF_NET;
internal->features = features &
~(1ULL << VIRTIO_F_IOMMU_PLATFORM);
internal->features |= dev_info[IFCVF_NET].features;
} else if (device_id == VIRTIO_ID_BLOCK) {
internal->device_type = IFCVF_BLK;
+   internal->hw.is_blk = IFCVF_BLK;
internal->features = features &
~(1ULL << VIRTIO_F_IOMMU_PLATFORM);
internal->features |= dev_info[IFCVF_BLK].features;
-- 
1.8.3.1



[PATCH v4 14/16] vdpa/ifc/base: for blk device, live migration register is different from net device

2022-03-27 Thread Andy Pei
1. last_avail_idx is the lower 16 bits of the register.
2. The address of the ring_state register is different between net and blk devices.

Signed-off-by: Andy Pei 
---
 drivers/vdpa/ifc/base/ifcvf.c | 36 +---
 drivers/vdpa/ifc/base/ifcvf.h |  1 +
 2 files changed, 30 insertions(+), 7 deletions(-)

diff --git a/drivers/vdpa/ifc/base/ifcvf.c b/drivers/vdpa/ifc/base/ifcvf.c
index e417c50..d923266 100644
--- a/drivers/vdpa/ifc/base/ifcvf.c
+++ b/drivers/vdpa/ifc/base/ifcvf.c
@@ -218,10 +218,18 @@
&cfg->queue_used_hi);
IFCVF_WRITE_REG16(hw->vring[i].size, &cfg->queue_size);
 
-   *(u32 *)(lm_cfg + IFCVF_LM_RING_STATE_OFFSET +
-   (i / 2) * IFCVF_LM_CFG_SIZE + (i % 2) * 4) =
-   (u32)hw->vring[i].last_avail_idx |
-   ((u32)hw->vring[i].last_used_idx << 16);
+   if (hw->is_blk == IFCVF_BLK) {
+   *(u32 *)(lm_cfg + IFCVF_LM_RING_STATE_OFFSET +
+   i * IFCVF_LM_CFG_SIZE) =
+   (u32)hw->vring[i].last_avail_idx |
+   ((u32)hw->vring[i].last_used_idx << 16);
+   } else if (hw->is_blk == IFCVF_NET) {
+   *(u32 *)(lm_cfg + IFCVF_LM_RING_STATE_OFFSET +
+   (i / 2) * IFCVF_LM_CFG_SIZE +
+   (i % 2) * 4) =
+   (u32)hw->vring[i].last_avail_idx |
+   ((u32)hw->vring[i].last_used_idx << 16);
+   }
 
IFCVF_WRITE_REG16(i + 1, &cfg->queue_msix_vector);
if (IFCVF_READ_REG16(&cfg->queue_msix_vector) ==
@@ -254,9 +262,23 @@
IFCVF_WRITE_REG16(i, &cfg->queue_select);
IFCVF_WRITE_REG16(0, &cfg->queue_enable);
IFCVF_WRITE_REG16(IFCVF_MSI_NO_VECTOR, &cfg->queue_msix_vector);
-   ring_state = *(u32 *)(hw->lm_cfg + IFCVF_LM_RING_STATE_OFFSET +
-   (i / 2) * IFCVF_LM_CFG_SIZE + (i % 2) * 4);
-   hw->vring[i].last_avail_idx = (u16)(ring_state >> 16);
+
+   if (hw->is_blk) {
+   ring_state = *(u32 *)(hw->lm_cfg +
+   IFCVF_LM_RING_STATE_OFFSET +
+   i * IFCVF_LM_CFG_SIZE);
+   } else if (hw->is_blk == IFCVF_NET) {
+   ring_state = *(u32 *)(hw->lm_cfg +
+   IFCVF_LM_RING_STATE_OFFSET +
+   (i / 2) * IFCVF_LM_CFG_SIZE +
+   (i % 2) * 4);
+   }
+
+   if (hw->is_blk == IFCVF_BLK)
+   hw->vring[i].last_avail_idx =
+   (u16)(ring_state & IFCVF_16_BIT_MASK);
+   else if (hw->is_blk == IFCVF_NET)
+   hw->vring[i].last_avail_idx = (u16)(ring_state >> 16);
hw->vring[i].last_used_idx = (u16)(ring_state >> 16);
}
 }
diff --git a/drivers/vdpa/ifc/base/ifcvf.h b/drivers/vdpa/ifc/base/ifcvf.h
index 8e602af..7367094 100644
--- a/drivers/vdpa/ifc/base/ifcvf.h
+++ b/drivers/vdpa/ifc/base/ifcvf.h
@@ -65,6 +65,7 @@
 #define IFCVF_MEDIATED_VRING   0x2000
 
 #define IFCVF_32_BIT_MASK  0x
+#define IFCVF_16_BIT_MASK  0x
 
 #ifndef VHOST_USER_PROTOCOL_F_CONFIG
 #define VHOST_USER_PROTOCOL_F_CONFIG   9
-- 
1.8.3.1



[PATCH v4 15/16] vdpa/ifc: make sure hardware last_avail_idx and last_used_idx is the same when blk device pause

2022-03-27 Thread Andy Pei
Signed-off-by: Andy Pei 
---
 drivers/vdpa/ifc/base/ifcvf.c |  2 +-
 drivers/vdpa/ifc/base/ifcvf.h |  3 +++
 drivers/vdpa/ifc/ifcvf_vdpa.c | 32 +++-
 3 files changed, 27 insertions(+), 10 deletions(-)

diff --git a/drivers/vdpa/ifc/base/ifcvf.c b/drivers/vdpa/ifc/base/ifcvf.c
index d923266..d89cb73 100644
--- a/drivers/vdpa/ifc/base/ifcvf.c
+++ b/drivers/vdpa/ifc/base/ifcvf.c
@@ -118,7 +118,7 @@
IFCVF_WRITE_REG8(status, &hw->common_cfg->device_status);
 }
 
-STATIC void
+void
 ifcvf_reset(struct ifcvf_hw *hw)
 {
ifcvf_set_status(hw, 0);
diff --git a/drivers/vdpa/ifc/base/ifcvf.h b/drivers/vdpa/ifc/base/ifcvf.h
index 7367094..f22d18b 100644
--- a/drivers/vdpa/ifc/base/ifcvf.h
+++ b/drivers/vdpa/ifc/base/ifcvf.h
@@ -157,6 +157,9 @@ struct ifcvf_hw {
 int
 ifcvf_init_hw(struct ifcvf_hw *hw, PCI_DEV *dev);
 
+void
+ifcvf_reset(struct ifcvf_hw *hw);
+
 u64
 ifcvf_get_features(struct ifcvf_hw *hw);
 
diff --git a/drivers/vdpa/ifc/ifcvf_vdpa.c b/drivers/vdpa/ifc/ifcvf_vdpa.c
index 95538c1..36fd850 100644
--- a/drivers/vdpa/ifc/ifcvf_vdpa.c
+++ b/drivers/vdpa/ifc/ifcvf_vdpa.c
@@ -351,23 +351,32 @@ struct rte_vdpa_dev_info {
 vdpa_ifcvf_blk_pause(struct ifcvf_internal *internal)
 {
struct ifcvf_hw *hw = &internal->hw;
-   struct rte_vhost_vring vq;
int i, vid;
uint64_t features = 0;
uint64_t log_base = 0, log_size = 0;
uint64_t len;
+   u32 ring_state = 0;
 
vid = internal->vid;
 
if (internal->device_type == IFCVF_BLK) {
for (i = 0; i < hw->nr_vring; i++) {
-   rte_vhost_get_vhost_vring(internal->vid, i, &vq);
-   while (vq.avail->idx != vq.used->idx) {
-   ifcvf_notify_queue(hw, i);
-   usleep(10);
-   }
-   hw->vring[i].last_avail_idx = vq.avail->idx;
-   hw->vring[i].last_used_idx = vq.used->idx;
+   do {
+   if (hw->lm_cfg != NULL)
+   ring_state = *(u32 *)(hw->lm_cfg +
+   IFCVF_LM_RING_STATE_OFFSET +
+   i * IFCVF_LM_CFG_SIZE);
+   hw->vring[i].last_avail_idx =
+   (u16)(ring_state & IFCVF_16_BIT_MASK);
+   hw->vring[i].last_used_idx =
+   (u16)(ring_state >> 16);
+   if (hw->vring[i].last_avail_idx !=
+   hw->vring[i].last_used_idx) {
+   ifcvf_notify_queue(hw, i);
+   usleep(10);
+   }
+   } while (hw->vring[i].last_avail_idx !=
+   hw->vring[i].last_used_idx);
}
}
 
@@ -752,7 +761,12 @@ struct rte_vdpa_dev_info {
if (ret)
goto err;
 
-   vdpa_ifcvf_stop(internal);
+   if (internal->device_type == IFCVF_BLK) {
+   vdpa_ifcvf_blk_pause(internal);
+   ifcvf_reset(&internal->hw);
+   } else {
+   vdpa_ifcvf_stop(internal);
+   }
 
ret = vdpa_disable_vfio_intr(internal);
if (ret)
-- 
1.8.3.1



[PATCH v4 16/16] vhost: make sure each queue callfd is configured

2022-03-27 Thread Andy Pei
During the vhost data path building process, qemu will create
a call fd at first, and create another call fd in the end.
The final call fd will be used to relay notify.
In the original code, after the kick fd is set, dev_conf will
set the first call fd. Even though the actual call fd is set later,
the data path will not work correctly.

Signed-off-by: Andy Pei 
---
 lib/vhost/vhost_user.c | 15 +++
 1 file changed, 15 insertions(+)

diff --git a/lib/vhost/vhost_user.c b/lib/vhost/vhost_user.c
index b11fafd..8c5904f 100644
--- a/lib/vhost/vhost_user.c
+++ b/lib/vhost/vhost_user.c
@@ -3197,12 +3197,27 @@ typedef int (*vhost_message_handler_t)(struct 
virtio_net **pdev,
if (!vdpa_dev)
goto out;
 
+   if (request != VHOST_USER_SET_VRING_CALL)
+   goto out;
+
if (!(dev->flags & VIRTIO_DEV_VDPA_CONFIGURED)) {
if (vdpa_dev->ops->dev_conf(dev->vid))
VHOST_LOG_CONFIG(ERR, "(%s) failed to configure vDPA 
device\n",
dev->ifname);
else
dev->flags |= VIRTIO_DEV_VDPA_CONFIGURED;
+   } else {
+   /**
+   ** VIRTIO_DEV_VDPA_CONFIGURED already configured
+   ** close the device and open the device again,
+   ** make sure the call fd of each queue is configed to haedware.
+   **/
+   if (vdpa_dev->ops->dev_close(dev->vid))
+   VHOST_LOG_CONFIG(ERR,
+"Failed to close vDPA device\n");
+   if (vdpa_dev->ops->dev_conf(dev->vid))
+   VHOST_LOG_CONFIG(ERR,
+"Failed to re-config vDPA device\n");
}
 
 out:
-- 
1.8.3.1



Re: [RFC] eal: add seqlock

2022-03-27 Thread Mattias Rönnblom
On 2022-03-27 16:49, Ananyev, Konstantin wrote:
>> diff --git a/lib/eal/include/meson.build b/lib/eal/include/meson.build
>> index 9700494816..48df5f1a21 100644
>> --- a/lib/eal/include/meson.build
>> +++ b/lib/eal/include/meson.build
>> @@ -36,6 +36,7 @@ headers += files(
>>   'rte_per_lcore.h',
>>   'rte_random.h',
>>   'rte_reciprocal.h',
>> +'rte_seqlock.h',
>>   'rte_service.h',
>>   'rte_service_component.h',
>>   'rte_string_fns.h',
>> diff --git a/lib/eal/include/rte_seqlock.h b/lib/eal/include/rte_seqlock.h
>> new file mode 100644
>> index 00..b975ca848a
>> --- /dev/null
>> +++ b/lib/eal/include/rte_seqlock.h
>> @@ -0,0 +1,84 @@
>> +/* SPDX-License-Identifier: BSD-3-Clause
>> + * Copyright(c) 2022 Ericsson AB
>> + */
>> +
>> +#ifndef _RTE_SEQLOCK_H_
>> +#define _RTE_SEQLOCK_H_
>> +
>> +#include 
>> +#include 
>> +
>> +#include 
>> +#include 
>> +#include 
>> +
>> +struct rte_seqlock {
>> +uint64_t sn;
>> +rte_spinlock_t lock;
>> +};
>> +
>> +typedef struct rte_seqlock rte_seqlock_t;
>> +
>> +__rte_experimental
>> +void
>> +rte_seqlock_init(rte_seqlock_t *seqlock);
> Probably worth to have static initializer too.
>

I will add that in the next version, thanks.

>> +
>> +__rte_experimental
>> +static inline uint64_t
>> +rte_seqlock_read_begin(const rte_seqlock_t *seqlock)
>> +{
>> +/* __ATOMIC_ACQUIRE to prevent loads after (in program order)
>> + * from happening before the sn load. Syncronizes-with the
>> + * store release in rte_seqlock_end().
>> + */
>> +return __atomic_load_n(&seqlock->sn, __ATOMIC_ACQUIRE);
>> +}
>> +
>> +__rte_experimental
>> +static inline bool
>> +rte_seqlock_read_retry(const rte_seqlock_t *seqlock, uint64_t begin_sn)
>> +{
>> +uint64_t end_sn;
>> +
>> +/* make sure the data loads happens before the sn load */
>> +rte_atomic_thread_fence(__ATOMIC_ACQUIRE);
> That's sort of 'read_end' correct?
> If so, shouldn't it be '__ATOMIC_RELEASE' instead here,
> and
> end_sn = __atomic_load_n(..., (__ATOMIC_ACQUIRE)
> on the line below?

A release fence prevents reordering of stores. The reader doesn't do any 
stores, so I don't understand why you would use a release fence here. 
Could you elaborate?

>> +
>> +end_sn = __atomic_load_n(&seqlock->sn, __ATOMIC_RELAXED);
>> +
>> +return unlikely(begin_sn & 1 || begin_sn != end_sn);
>> +}
>> +
>> +__rte_experimental
>> +static inline void
>> +rte_seqlock_write_begin(rte_seqlock_t *seqlock)
>> +{
>> +uint64_t sn;
>> +
>> +/* to synchronize with other writers */
>> +rte_spinlock_lock(&seqlock->lock);
>> +
>> +sn = seqlock->sn + 1;
>> +
>> +__atomic_store_n(&seqlock->sn, sn, __ATOMIC_RELAXED);
>> +
>> +/* __ATOMIC_RELEASE to prevent stores after (in program order)
>> + * from happening before the sn store.
>> + */
>> +rte_atomic_thread_fence(__ATOMIC_RELEASE);
> I think it needs to be '__ATOMIC_ACQUIRE' here instead of '__ATOMIC_RELEASE'.

Please elaborate on why.

>> +}
>> +
>> +__rte_experimental
>> +static inline void
>> +rte_seqlock_write_end(rte_seqlock_t *seqlock)
>> +{
>> +uint64_t sn;
>> +
>> +sn = seqlock->sn + 1;
>> +
>> +/* synchronizes-with the load acquire in rte_seqlock_begin() */
>> +__atomic_store_n(&seqlock->sn, sn, __ATOMIC_RELEASE);
>> +
>> +rte_spinlock_unlock(&seqlock->lock);
>> +}
>> +



Re: [PATCH v4 08/16] usertools: add support for virtio blk device

2022-03-27 Thread Stephen Hemminger
On Sun, 27 Mar 2022 22:51:31 +0800
Andy Pei  wrote:

> Signed-off-by: Andy Pei 

Shouldn't we just recommend driverctl instead?

I had patches for devbind to use vmbus rejected because of that.




[PATCH v2] net/ice: fix raw flow input pattern value change in FDIR

2022-03-27 Thread Ting Xu
When parsing raw flow pattern in FDIR, the input parameters spec and
mask are used directly and their original values will be changed. It
will cause errors if these values are used in other functions. In this
patch, temporary variables are created to store the spec and mask.

Fixes: 25be39cc1760 ("net/ice: enable protocol agnostic flow offloading in 
FDIR")

Cc: sta...@dpdk.org

Signed-off-by: Ting Xu 
---
 drivers/net/ice/ice_fdir_filter.c | 25 +++--
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ice/ice_fdir_filter.c 
b/drivers/net/ice/ice_fdir_filter.c
index 7954c6d8ea..5ff1afac90 100644
--- a/drivers/net/ice/ice_fdir_filter.c
+++ b/drivers/net/ice/ice_fdir_filter.c
@@ -1868,10 +1868,11 @@ ice_fdir_parse_pattern(__rte_unused struct ice_adapter 
*ad,
break;
 
/* convert raw spec & mask from byte string to int */
-   unsigned char *tmp_spec =
+   unsigned char *spec_pattern =
(uint8_t *)(uintptr_t)raw_spec->pattern;
-   unsigned char *tmp_mask =
+   unsigned char *mask_pattern =
(uint8_t *)(uintptr_t)raw_mask->pattern;
+   uint8_t *tmp_spec, *tmp_mask;
uint16_t udp_port = 0;
uint16_t tmp_val = 0;
uint8_t pkt_len = 0;
@@ -1883,8 +1884,18 @@ ice_fdir_parse_pattern(__rte_unused struct ice_adapter 
*ad,
pkt_len)
return -rte_errno;
 
+   tmp_spec = rte_zmalloc(NULL, pkt_len / 2, 0);
+   if (!tmp_spec)
+   return -rte_errno;
+
+   tmp_mask = rte_zmalloc(NULL, pkt_len / 2, 0);
+   if (!tmp_mask) {
+   rte_free(tmp_spec);
+   return -rte_errno;
+   }
+
for (i = 0, j = 0; i < pkt_len; i += 2, j++) {
-   tmp = tmp_spec[i];
+   tmp = spec_pattern[i];
if (tmp >= 'a' && tmp <= 'f')
tmp_val = tmp - 'a' + 10;
if (tmp >= 'A' && tmp <= 'F')
@@ -1893,7 +1904,7 @@ ice_fdir_parse_pattern(__rte_unused struct ice_adapter 
*ad,
tmp_val = tmp - '0';
 
tmp_val *= 16;
-   tmp = tmp_spec[i + 1];
+   tmp = spec_pattern[i + 1];
if (tmp >= 'a' && tmp <= 'f')
tmp_spec[j] = tmp_val + tmp - 'a' + 10;
if (tmp >= 'A' && tmp <= 'F')
@@ -1901,7 +1912,7 @@ ice_fdir_parse_pattern(__rte_unused struct ice_adapter 
*ad,
if (tmp >= '0' && tmp <= '9')
tmp_spec[j] = tmp_val + tmp - '0';
 
-   tmp = tmp_mask[i];
+   tmp = mask_pattern[i];
if (tmp >= 'a' && tmp <= 'f')
tmp_val = tmp - 'a' + 10;
if (tmp >= 'A' && tmp <= 'F')
@@ -1910,7 +1921,7 @@ ice_fdir_parse_pattern(__rte_unused struct ice_adapter 
*ad,
tmp_val = tmp - '0';
 
tmp_val *= 16;
-   tmp = tmp_mask[i + 1];
+   tmp = mask_pattern[i + 1];
if (tmp >= 'a' && tmp <= 'f')
tmp_mask[j] = tmp_val + tmp - 'a' + 10;
if (tmp >= 'A' && tmp <= 'F')
@@ -1953,6 +1964,8 @@ ice_fdir_parse_pattern(__rte_unused struct ice_adapter 
*ad,
 
filter->parser_ena = true;
 
+   rte_free(tmp_spec);
+   rte_free(tmp_mask);
break;
}
 
-- 
2.17.1



[PATCH] vhost: fix null pointer dereference

2022-03-27 Thread Jiayu Hu
NULL check for vq->async must be protected by lock. Otherwise, it is
possible that the data plane thread dereferences vq->async with NULL
value, since the control plane thread is freeing vq->async.

Fixes: ee8024b3d4ad (vhost: move async data in dedicated structure)
Cc: sta...@dpdk.org

Signed-off-by: Jiayu Hu 
---
 lib/vhost/vhost.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/lib/vhost/vhost.c b/lib/vhost/vhost.c
index bc88148347..7f60c2824f 100644
--- a/lib/vhost/vhost.c
+++ b/lib/vhost/vhost.c
@@ -1887,9 +1887,6 @@ rte_vhost_async_get_inflight(int vid, uint16_t queue_id)
if (vq == NULL)
return ret;
 
-   if (!vq->async)
-   return ret;
-
if (!rte_spinlock_trylock(&vq->access_lock)) {
VHOST_LOG_CONFIG(DEBUG,
"(%s) failed to check in-flight packets. virtqueue 
busy.\n",
@@ -1897,6 +1894,9 @@ rte_vhost_async_get_inflight(int vid, uint16_t queue_id)
return ret;
}
 
+   if (!vq->async)
+   return ret;
+
ret = vq->async->pkts_inflight_n;
rte_spinlock_unlock(&vq->access_lock);
 
-- 
2.25.1



[Bug 944] [dpdk 22.03-rc2] inline_ipsec/test_Ipsec_Encryption_Rss: run dpdk-ipsec-secgw with assigned parameter fail

2022-03-27 Thread bugzilla
https://bugs.dpdk.org/show_bug.cgi?id=944

jiang,yu (yux.ji...@intel.com) changed:

   What|Removed |Added

 Resolution|--- |INVALID
 CC||yux.ji...@intel.com
 Status|UNCONFIRMED |RESOLVED

--- Comment #2 from jiang,yu (yux.ji...@intel.com) ---
Thanks, we will modify config to test.

-- 
You are receiving this mail because:
You are the assignee for the bug.

RE: [PATCH] usertools: enhance logic to display NUMA

2022-03-27 Thread Varghese, Vipin
[AMD Official Use Only]

Hi Thomas,

 

26/03/2022 08:32, Vipin Varghese:
> enhance python logic to accommodate NUMA information. Current logic 
> considers physical socket with CPU threads to core map. With new AMD 
> SKU variants NUMA is no longer the same as SOCKET. Single physical socket 
> can be partitioned to variants of 1, 2 and 4.
>
> The changes address the new mapping with Socket-NUMA to CPU cores.
>
> Signed-off-by: Vipin Varghese 
> ---
>  usertools/cpu_layout.py | 76 
> +
>  1 file changed, 47 insertions(+), 29 deletions(-)

Honestly, I'm not sure it is a good idea to keep this script in the DPDK repo.
Can it be replaced with hwloc usage?

Thanks for the suggestion, it is genuine and useful. Following are my 
observations

- It takes some effort to identify the NUMA with `Group`
 - One needs to install `lstopo-no-graphics` on a distro, and manually build and 
add it on custom Linux.


What is the output on the new AMD SKU for this command?
lstopo-no-graphics --merge

I have tried ` lstopo-no-graphics --merge` on a ` 2 Socket AMD EPYC 7713 
64-Core Processor` with
possible NUMA configuration such as 1, 2 and 4. 

```
$ lstopo-no-graphics --merge
Machine (503GB total)
  Package L#0
Group0 L#0
  NUMANode L#0 (P#0 126GB)
  L3 L#0 (32MB)
Core L#0
  PU L#0 (P#0)
  PU L#1 (P#128)
Core L#1
  PU L#2 (P#1)
  PU L#3 (P#129)
Core L#2
  PU L#4 (P#2)
  PU L#5 (P#130)
Core L#3
  PU L#6 (P#3)
  PU L#7 (P#131)
Core L#4
  PU L#8 (P#4)
  PU L#9 (P#132)
Core L#5
  PU L#10 (P#5)
  PU L#11 (P#133)
Core L#6
  PU L#12 (P#6)
  PU L#13 (P#134)
Core L#7
  PU L#14 (P#7)
  PU L#15 (P#135)
  L3 L#1 (32MB)
Core L#8
  PU L#16 (P#8)
  PU L#17 (P#136)
Core L#9
  PU L#18 (P#9)
  PU L#19 (P#137)
Core L#10
  PU L#20 (P#10)
  PU L#21 (P#138)
Core L#11
  PU L#22 (P#11)
  PU L#23 (P#139)
Core L#12
  PU L#24 (P#12)
  PU L#25 (P#140)
Core L#13
  PU L#26 (P#13)
  PU L#27 (P#141)
Core L#14
  PU L#28 (P#14)
  PU L#29 (P#142)
Core L#15
  PU L#30 (P#15)
  PU L#31 (P#143)
  L3 L#2 (32MB)
Core L#16
  PU L#32 (P#16)
  PU L#33 (P#144)
Core L#17
  PU L#34 (P#17)
  PU L#35 (P#145)
Core L#18
  PU L#36 (P#18)
  PU L#37 (P#146)
Core L#19
  PU L#38 (P#19)
  PU L#39 (P#147)
Core L#20
  PU L#40 (P#20)
  PU L#41 (P#148)
Core L#21
  PU L#42 (P#21)
  PU L#43 (P#149)
Core L#22
  PU L#44 (P#22)
  PU L#45 (P#150)
Core L#23
  PU L#46 (P#23)
  PU L#47 (P#151)
  L3 L#3 (32MB)
Core L#24
  PU L#48 (P#24)
  PU L#49 (P#152)
Core L#25
  PU L#50 (P#25)
  PU L#51 (P#153)
Core L#26
  PU L#52 (P#26)
  PU L#53 (P#154)
Core L#27
  PU L#54 (P#27)
  PU L#55 (P#155)
Core L#28
  PU L#56 (P#28)
  PU L#57 (P#156)
Core L#29
  PU L#58 (P#29)
  PU L#59 (P#157)
Core L#30
  PU L#60 (P#30)
  PU L#61 (P#158)
Core L#31
  PU L#62 (P#31)
  PU L#63 (P#159)
  HostBridge
PCIBridge
  PCI 41:00.0 (Ethernet)
Net "ens1f0"
OpenFabrics "mlx5_0"
  PCI 41:00.1 (Ethernet)
Net "ens1f1"
OpenFabrics "mlx5_1"
  HostBridge
PCIBridge
  PCI 63:00.0 (Ethernet)
Net "eno12399np0"
  PCI 63:00.1 (Ethernet)
Net "eno12409np1"
PCIBridge
  PCIBridge
PCI 62:00.0 (VGA)
Group0 L#1
  NUMANode L#1 (P#1 126GB)
  L3 L#4 (32MB)
Core L#32
  PU L#64 (P#32)
  PU L#65 (P#160)
Core L#33
  PU L#66 (P#33)
  PU L#67 (P#161)
Core L#34
  PU L#68 (P#34)
  PU L#69 (P#162)
Core L#35
  PU L#70 (P#35)
  PU L#71 (P#163)
Core L#36
  PU L#72 (P#36)
  PU L#73 (P#164)
Core L#37
  PU L#74 (P#37)
  PU L#75 (P#165)
Core L#38
  PU L#76 (P#38)
  PU L#77 (P#166)
Core L#39
  PU L#78 (P#39)
  PU L#79 (P#167)
  L3 L#5 (32MB)
Core L#40
  PU L#80 (P#40)
  PU L#81 (P#168)
Core L#41
  PU L#82 (P#41)
  PU L#83 (P#169)
Core L#42
  PU L#84 (P#42)
  PU L#85 (P#170)
Core L#43
  PU L#86 (P#43)
  PU L#87 (P#171)
Core L#44
  PU L#88 (P#44)
  PU L#89 (P#172)
Core L#45

RE: [PATCH] meson: update doc logic for Windows

2022-03-27 Thread Varghese, Vipin
[AMD Official Use Only]

Hi Thomas,



Thank you for looking at this problem.

26/03/2022 03:59, Vipin Varghese:
> Support for shell scripts doxy-html-custom, generate_doxygen and 
> generate_examples are absent. The current patch address the same by 
> disabling document build notifying the user.

It should not prevent generating guides with sphinx.

We did get error from `doc/meson` stating ` echo command not available on 
windows` for both cmd and powershell for the line
```
run_target('doc', command: [echo, message, doc_target_names],
depends: doc_targets)
```

> Steps to reproduce the error:
>  - Install dependencies doxygen & sphinix build on Windwos server 2019.
>  - Build DPDK with option enable_docs=true for API or User Guide.
>
> This produces error
> ```
> FAILED: doc/api/examples.dox
> sh -e dpdk/doc/api/generate_examples.sh dpdk/examples 
> doc/api/examples.dox ```

I suppose we could replace shell scripts with Python equivalent.

I am trying to minimize the changes as a first step: fix the build error by 
disabling on Windows.
As a next step we can convert to Python to make it platform independent.


RE: [RFC v2 1/2] vhost: support clear in-flight packets for async dequeue

2022-03-27 Thread Pai G, Sunil
Hi Yuan,

Thanks for the patch, comment inline.

> +uint16_t
> +rte_vhost_clear_queue(int vid, uint16_t queue_id, struct rte_mbuf **pkts,
> + uint16_t count, int16_t dma_id, uint16_t vchan_id) {
> + struct virtio_net *dev = get_device(vid);
> + struct vhost_virtqueue *vq;
> + uint16_t n_pkts_cpl = 0;
> +
> + if (!dev)
> + return 0;
> +
> + VHOST_LOG_DATA(DEBUG, "(%s) %s\n", dev->ifname, __func__);
> + if (unlikely(queue_id >= dev->nr_vring)) {
> + VHOST_LOG_DATA(ERR, "(%s) %s: invalid virtqueue idx %d.\n",
> + dev->ifname, __func__, queue_id);
> + return 0;
> + }
> +
> + vq = dev->virtqueue[queue_id];
> +

I think the following checks must be protected by spinlock.
Similar to : 
https://patches.dpdk.org/project/dpdk/patch/20220328020754.1155063-1-jiayu...@intel.com/
 

> + if (unlikely(!vq->async)) {
> + VHOST_LOG_DATA(ERR, "(%s) %s: async not registered for queue
> id %d.\n",
> + dev->ifname, __func__, queue_id);
> + return 0;
> + }
> +
> + if (unlikely(!dma_copy_track[dma_id].vchans ||
> +
>   !dma_copy_track[dma_id].vchans[vchan_id].pkts_cmpl_flag_addr)) {
> + VHOST_LOG_DATA(ERR, "(%s) %s: invalid channel %d:%u.\n", dev-
> >ifname, __func__,
> + dma_id, vchan_id);
> + return 0;
> + }
> +
> + if (!rte_spinlock_trylock(&vq->access_lock)) {
> + VHOST_LOG_DATA(ERR,
> + "(%d) %s: failed to clear async queue id %d, virtqueue
> busy.\n",
> + dev->vid, __func__, queue_id);
> + return 0;
> + }
> +



Thanks and regards,
Sunil


[PATCH 4/4] net/cnxk: add barrier after meta batch free in scalar

2022-03-27 Thread Nithin Dabilpuram
Add barrier after meta batch free in scalar routine when
lmt lines are exactly full, to make sure that the next LMT line user
in Tx starts writing the lines only when the previous steorl's
are complete.

Fixes: 4382a7ccf781 ("net/cnxk: support Rx security offload on cn10k")
Cc: sta...@dpdk.org

Signed-off-by: Nithin Dabilpuram 
---
 drivers/net/cnxk/cn10k_rx.h | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/net/cnxk/cn10k_rx.h b/drivers/net/cnxk/cn10k_rx.h
index e4f5a55..94c1f1e 100644
--- a/drivers/net/cnxk/cn10k_rx.h
+++ b/drivers/net/cnxk/cn10k_rx.h
@@ -1007,10 +1007,11 @@ cn10k_nix_recv_pkts(void *rx_queue, struct rte_mbuf 
**rx_pkts, uint16_t pkts,
plt_write64((wdata | nb_pkts), rxq->cq_door);
 
/* Free remaining meta buffers if any */
-   if (flags & NIX_RX_OFFLOAD_SECURITY_F && loff) {
+   if (flags & NIX_RX_OFFLOAD_SECURITY_F && loff)
nix_sec_flush_meta(laddr, lmt_id + lnum, loff, aura_handle);
-   plt_io_wmb();
-   }
+
+   if (flags & NIX_RX_OFFLOAD_SECURITY_F)
+   rte_io_wmb();
 
return nb_pkts;
 }
-- 
2.8.4



[PATCH 1/4] common/cnxk: use aggregate level rr prio from mbox

2022-03-27 Thread Nithin Dabilpuram
Use aggregate level Round Robin Priority from mbox response instead of
fixing it to single macro. This is useful when kernel AF driver
changes the constant.

Signed-off-by: Nithin Dabilpuram 
---
 drivers/common/cnxk/roc_nix_priv.h | 5 +++--
 drivers/common/cnxk/roc_nix_tm.c   | 3 ++-
 drivers/common/cnxk/roc_nix_tm_utils.c | 8 
 3 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/drivers/common/cnxk/roc_nix_priv.h 
b/drivers/common/cnxk/roc_nix_priv.h
index 9b9ffae..cc69d71 100644
--- a/drivers/common/cnxk/roc_nix_priv.h
+++ b/drivers/common/cnxk/roc_nix_priv.h
@@ -181,6 +181,7 @@ struct nix {
uint16_t tm_root_lvl;
uint16_t tm_flags;
uint16_t tm_link_cfg_lvl;
+   uint8_t tm_aggr_lvl_rr_prio;
uint16_t contig_rsvd[NIX_TXSCH_LVL_CNT];
uint16_t discontig_rsvd[NIX_TXSCH_LVL_CNT];
uint64_t tm_markfmt_en;
@@ -284,7 +285,6 @@ void nix_unregister_irqs(struct nix *nix);
 
 /* Default TL1 priority and Quantum from AF */
 #define NIX_TM_TL1_DFLT_RR_QTM ((1 << 24) - 1)
-#define NIX_TM_TL1_DFLT_RR_PRIO 1
 
 struct nix_tm_shaper_data {
uint64_t burst_exponent;
@@ -432,7 +432,8 @@ bool nix_tm_child_res_valid(struct nix_tm_node_list *list,
struct nix_tm_node *parent);
 uint16_t nix_tm_resource_estimate(struct nix *nix, uint16_t *schq_contig,
  uint16_t *schq, enum roc_nix_tm_tree tree);
-uint8_t nix_tm_tl1_default_prep(uint32_t schq, volatile uint64_t *reg,
+uint8_t nix_tm_tl1_default_prep(struct nix *nix, uint32_t schq,
+   volatile uint64_t *reg,
volatile uint64_t *regval);
 uint8_t nix_tm_topology_reg_prep(struct nix *nix, struct nix_tm_node *node,
 volatile uint64_t *reg,
diff --git a/drivers/common/cnxk/roc_nix_tm.c b/drivers/common/cnxk/roc_nix_tm.c
index 5b70c7b..84815d0 100644
--- a/drivers/common/cnxk/roc_nix_tm.c
+++ b/drivers/common/cnxk/roc_nix_tm.c
@@ -55,7 +55,7 @@ nix_tm_node_reg_conf(struct nix *nix, struct nix_tm_node 
*node)
req = mbox_alloc_msg_nix_txschq_cfg(mbox);
req->lvl = NIX_TXSCH_LVL_TL1;
 
-   k = nix_tm_tl1_default_prep(node->parent_hw_id, req->reg,
+   k = nix_tm_tl1_default_prep(nix, node->parent_hw_id, req->reg,
req->regval);
req->num_regs = k;
rc = mbox_process(mbox);
@@ -1281,6 +1281,7 @@ nix_tm_alloc_txschq(struct nix *nix, enum roc_nix_tm_tree 
tree)
} while (pend);
 
nix->tm_link_cfg_lvl = rsp->link_cfg_lvl;
+   nix->tm_aggr_lvl_rr_prio = rsp->aggr_lvl_rr_prio;
return 0;
 alloc_err:
for (i = 0; i < NIX_TXSCH_LVL_CNT; i++) {
diff --git a/drivers/common/cnxk/roc_nix_tm_utils.c 
b/drivers/common/cnxk/roc_nix_tm_utils.c
index bcdf990..b9b605f 100644
--- a/drivers/common/cnxk/roc_nix_tm_utils.c
+++ b/drivers/common/cnxk/roc_nix_tm_utils.c
@@ -478,7 +478,7 @@ nix_tm_child_res_valid(struct nix_tm_node_list *list,
 }
 
 uint8_t
-nix_tm_tl1_default_prep(uint32_t schq, volatile uint64_t *reg,
+nix_tm_tl1_default_prep(struct nix *nix, uint32_t schq, volatile uint64_t *reg,
volatile uint64_t *regval)
 {
uint8_t k = 0;
@@ -496,7 +496,7 @@ nix_tm_tl1_default_prep(uint32_t schq, volatile uint64_t 
*reg,
k++;
 
reg[k] = NIX_AF_TL1X_TOPOLOGY(schq);
-   regval[k] = (NIX_TM_TL1_DFLT_RR_PRIO << 1);
+   regval[k] = (nix->tm_aggr_lvl_rr_prio << 1);
k++;
 
reg[k] = NIX_AF_TL1X_CIR(schq);
@@ -540,7 +540,7 @@ nix_tm_topology_reg_prep(struct nix *nix, struct 
nix_tm_node *node,
 * Static Priority is disabled
 */
if (hw_lvl == NIX_TXSCH_LVL_TL1 && nix->tm_flags & NIX_TM_TL1_NO_SP) {
-   rr_prio = NIX_TM_TL1_DFLT_RR_PRIO;
+   rr_prio = nix->tm_aggr_lvl_rr_prio;
child = 0;
}
 
@@ -662,7 +662,7 @@ nix_tm_sched_reg_prep(struct nix *nix, struct nix_tm_node 
*node,
 */
if (hw_lvl == NIX_TXSCH_LVL_TL2 &&
(!nix_tm_have_tl1_access(nix) || nix->tm_flags & NIX_TM_TL1_NO_SP))
-   strict_prio = NIX_TM_TL1_DFLT_RR_PRIO;
+   strict_prio = nix->tm_aggr_lvl_rr_prio;
 
plt_tm_dbg("Schedule config node %s(%u) lvl %u id %u, "
   "prio 0x%" PRIx64 ", rr_quantum/rr_wt 0x%" PRIx64 " (%p)",
-- 
2.8.4



[PATCH 2/4] net/cnxk: support loopback mode on AF VF's

2022-03-27 Thread Nithin Dabilpuram
Support internal loopback mode on AF VFs using RoC by setting
the Tx channel the same as the Rx channel.

Signed-off-by: Nithin Dabilpuram 
---
 drivers/net/cnxk/cnxk_ethdev.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/drivers/net/cnxk/cnxk_ethdev.c b/drivers/net/cnxk/cnxk_ethdev.c
index 1fa4131..7f8479a 100644
--- a/drivers/net/cnxk/cnxk_ethdev.c
+++ b/drivers/net/cnxk/cnxk_ethdev.c
@@ -1116,6 +1116,9 @@ cnxk_nix_configure(struct rte_eth_dev *eth_dev)
nb_rxq = RTE_MAX(data->nb_rx_queues, 1);
nb_txq = RTE_MAX(data->nb_tx_queues, 1);
 
+   if (roc_nix_is_lbk(nix))
+   nix->enable_loop = eth_dev->data->dev_conf.lpbk_mode;
+
/* Alloc a nix lf */
rc = roc_nix_lf_alloc(nix, nb_rxq, nb_txq, rx_cfg);
if (rc) {
@@ -1239,6 +1242,9 @@ cnxk_nix_configure(struct rte_eth_dev *eth_dev)
}
}
 
+   if (roc_nix_is_lbk(nix))
+   goto skip_lbk_setup;
+
/* Configure loop back mode */
rc = roc_nix_mac_loopback_enable(nix,
 eth_dev->data->dev_conf.lpbk_mode);
@@ -1247,6 +1253,7 @@ cnxk_nix_configure(struct rte_eth_dev *eth_dev)
goto cq_fini;
}
 
+skip_lbk_setup:
/* Setup Inline security support */
rc = nix_security_setup(dev);
if (rc)
-- 
2.8.4



[PATCH 3/4] net/cnxk: update LBK ethdev link info

2022-03-27 Thread Nithin Dabilpuram
Update the link info of the LBK ethdev (i.e. AF's VFs) as always up
and 100G. This is because there is no PHY for the LBK interfaces
and we won't get a link update notification for them.

Signed-off-by: Nithin Dabilpuram 
---
 drivers/net/cnxk/cnxk_link.c | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/drivers/net/cnxk/cnxk_link.c b/drivers/net/cnxk/cnxk_link.c
index f10a502..b1d59e3 100644
--- a/drivers/net/cnxk/cnxk_link.c
+++ b/drivers/net/cnxk/cnxk_link.c
@@ -12,6 +12,17 @@ cnxk_nix_toggle_flag_link_cfg(struct cnxk_eth_dev *dev, bool 
set)
else
dev->flags &= ~CNXK_LINK_CFG_IN_PROGRESS_F;
 
+   /* Update link info for LBK */
+   if (!set && roc_nix_is_lbk(&dev->nix)) {
+   struct rte_eth_link link;
+
+   link.link_status = RTE_ETH_LINK_UP;
+   link.link_speed = RTE_ETH_SPEED_NUM_100G;
+   link.link_autoneg = RTE_ETH_LINK_FIXED;
+   link.link_duplex = RTE_ETH_LINK_FULL_DUPLEX;
+   rte_eth_linkstatus_set(dev->eth_dev, &link);
+   }
+
rte_wmb();
 }
 
-- 
2.8.4



[PATCH 4/4] net/cnxk: add barrier after meta batch free in scalar

2022-03-27 Thread Nithin Dabilpuram
Add a barrier after the meta batch free in the scalar routine when
LMT lines are exactly full, to make sure that the next LMT line user
in Tx starts writing the lines only when the previous STEORL
(store-release) operations are complete.

Fixes: 4382a7ccf781 ("net/cnxk: support Rx security offload on cn10k")
Cc: stable@dpdk.org

Signed-off-by: Nithin Dabilpuram 
---
 drivers/net/cnxk/cn10k_rx.h | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/net/cnxk/cn10k_rx.h b/drivers/net/cnxk/cn10k_rx.h
index e4f5a55..94c1f1e 100644
--- a/drivers/net/cnxk/cn10k_rx.h
+++ b/drivers/net/cnxk/cn10k_rx.h
@@ -1007,10 +1007,11 @@ cn10k_nix_recv_pkts(void *rx_queue, struct rte_mbuf 
**rx_pkts, uint16_t pkts,
plt_write64((wdata | nb_pkts), rxq->cq_door);
 
/* Free remaining meta buffers if any */
-   if (flags & NIX_RX_OFFLOAD_SECURITY_F && loff) {
+   if (flags & NIX_RX_OFFLOAD_SECURITY_F && loff)
nix_sec_flush_meta(laddr, lmt_id + lnum, loff, aura_handle);
-   plt_io_wmb();
-   }
+
+   if (flags & NIX_RX_OFFLOAD_SECURITY_F)
+   rte_io_wmb();
 
return nb_pkts;
 }
-- 
2.8.4