Re: [dpdk-dev] [PATCH v6 7/7] vhost: convert inflight data to DPDK allocation API
Hi Maxime, > -Original Message- > From: Maxime Coquelin > Sent: Friday, June 18, 2021 10:04 PM > To: dev@dpdk.org; david.march...@redhat.com; Xia, Chenbo > > Cc: Maxime Coquelin > Subject: [PATCH v6 7/7] vhost: convert inflight data to DPDK allocation API > > Inflight metadata are allocated using glibc's calloc. > This patch converts them to rte_zmalloc_socket to take > care of the NUMA affinity. > > Signed-off-by: Maxime Coquelin > --- > lib/vhost/vhost.c | 4 +-- > lib/vhost/vhost_user.c | 67 +++--- > 2 files changed, 58 insertions(+), 13 deletions(-) > > diff --git a/lib/vhost/vhost.c b/lib/vhost/vhost.c > index cd3297..53a470f547 100644 > --- a/lib/vhost/vhost.c > +++ b/lib/vhost/vhost.c [...] > @@ -1779,15 +1820,17 @@ vhost_check_queue_inflights_split(struct virtio_net > *dev, > vq->last_avail_idx += resubmit_num; > > if (resubmit_num) { > - resubmit = calloc(1, sizeof(struct rte_vhost_resubmit_info)); > + resubmit = rte_zmalloc_socket("resubmit", sizeof(struct > rte_vhost_resubmit_info), > + 0, vq->numa_node); > if (!resubmit) { > VHOST_LOG_CONFIG(ERR, > "failed to allocate memory for resubmit > info.\n"); > return RTE_VHOST_MSG_RESULT_ERR; > } > > - resubmit->resubmit_list = calloc(resubmit_num, > - sizeof(struct rte_vhost_resubmit_desc)); > + resubmit->resubmit_list = rte_zmalloc_socket("resubmit_list", > + resubmit_num * sizeof(struct > rte_vhost_resubmit_desc), > + 0, vq->numa_node); > if (!resubmit->resubmit_list) { > VHOST_LOG_CONFIG(ERR, > "failed to allocate memory for inflight > desc.\n"); > @@ -1873,15 +1916,17 @@ vhost_check_queue_inflights_packed(struct virtio_net > *dev, > } > > if (resubmit_num) { > - resubmit = calloc(1, sizeof(struct rte_vhost_resubmit_info)); > + resubmit = rte_zmalloc_socket("resubmit", sizeof(struct > rte_vhost_resubmit_info), > + 0, vq->numa_node); There are still two 'free(resubmit)' in vhost_check_queue_inflights_split and vhost_check_queue_inflights_packed, which should be replaced with rte_free() Thanks, Chenbo > if (resubmit == NULL) { > VHOST_LOG_CONFIG(ERR, > "failed to allocate memory for resubmit > info.\n"); > return RTE_VHOST_MSG_RESULT_ERR; > } > > - resubmit->resubmit_list = calloc(resubmit_num, > - sizeof(struct rte_vhost_resubmit_desc)); > + resubmit->resubmit_list = rte_zmalloc_socket("resubmit_list", > + resubmit_num * sizeof(struct > rte_vhost_resubmit_desc), > + 0, vq->numa_node); > if (resubmit->resubmit_list == NULL) { > VHOST_LOG_CONFIG(ERR, > "failed to allocate memory for resubmit > desc.\n"); > -- > 2.31.1
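As a side note on the pattern above: memory obtained from rte_zmalloc_socket() must be released with rte_free(), never libc free(). A minimal sketch of the corrected allocation/error path, assuming the rte_vhost_resubmit_info/rte_vhost_resubmit_desc structs quoted in the patch:

#include <rte_malloc.h>
#include <rte_vhost.h>

/* Sketch only: rte_* heap allocations pair with rte_free(). */
static struct rte_vhost_resubmit_info *
resubmit_info_alloc(uint16_t resubmit_num, int numa_node)
{
	struct rte_vhost_resubmit_info *resubmit;

	resubmit = rte_zmalloc_socket("resubmit", sizeof(*resubmit),
			0, numa_node);
	if (resubmit == NULL)
		return NULL;

	resubmit->resubmit_list = rte_zmalloc_socket("resubmit_list",
			resubmit_num * sizeof(struct rte_vhost_resubmit_desc),
			0, numa_node);
	if (resubmit->resubmit_list == NULL) {
		rte_free(resubmit);	/* not free(): Chenbo's point above */
		return NULL;
	}

	return resubmit;
}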
Re: [dpdk-dev] [PATCH v6 5/7] vhost: improve NUMA reallocation
> -Original Message- > From: Maxime Coquelin > Sent: Friday, June 18, 2021 10:04 PM > To: dev@dpdk.org; david.march...@redhat.com; Xia, Chenbo > > Cc: Maxime Coquelin > Subject: [PATCH v6 5/7] vhost: improve NUMA reallocation > > This patch improves the numa_realloc() function by making use > of rte_realloc_socket(), which takes care of the memory copy > and freeing of the old data. > > Suggested-by: David Marchand > Signed-off-by: Maxime Coquelin > --- > lib/vhost/vhost_user.c | 186 ++--- > 1 file changed, 81 insertions(+), 105 deletions(-) > > diff --git a/lib/vhost/vhost_user.c b/lib/vhost/vhost_user.c > index 51b96a0716..d6ec4000c3 100644 > --- a/lib/vhost/vhost_user.c > +++ b/lib/vhost/vhost_user.c > @@ -480,16 +480,17 @@ vhost_user_set_vring_num(struct virtio_net **pdev, > static struct virtio_net* > numa_realloc(struct virtio_net *dev, int index) > { > - int oldnode, newnode; > + int node, dev_node; > struct virtio_net *old_dev; > - struct vhost_virtqueue *old_vq, *vq; > - struct vring_used_elem *new_shadow_used_split; > - struct vring_used_elem_packed *new_shadow_used_packed; > - struct batch_copy_elem *new_batch_copy_elems; > + struct vhost_virtqueue *vq; > + struct batch_copy_elem *bce; > + struct guest_page *gp; > + struct rte_vhost_memory *mem; > + size_t mem_size; > int ret; > > old_dev = dev; > - vq = old_vq = dev->virtqueue[index]; > + vq = dev->virtqueue[index]; > > /* >* If VQ is ready, it is too late to reallocate, it certainly already > @@ -498,128 +499,103 @@ numa_realloc(struct virtio_net *dev, int index) > if (vq->ready) > return dev; > > - ret = get_mempolicy(&newnode, NULL, 0, old_vq->desc, > - MPOL_F_NODE | MPOL_F_ADDR); > - > - /* check if we need to reallocate vq */ > - ret |= get_mempolicy(&oldnode, NULL, 0, old_vq, > - MPOL_F_NODE | MPOL_F_ADDR); > + ret = get_mempolicy(&node, NULL, 0, vq->desc, MPOL_F_NODE | > MPOL_F_ADDR); > if (ret) { > - VHOST_LOG_CONFIG(ERR, > - "Unable to get vq numa information.\n"); > + VHOST_LOG_CONFIG(ERR, "Unable to get virtqueue %d numa > information.\n", index); > return dev; > } > - if (oldnode != newnode) { > - VHOST_LOG_CONFIG(INFO, > - "reallocate vq from %d to %d node\n", oldnode, newnode); > - vq = rte_malloc_socket(NULL, sizeof(*vq), 0, newnode); > - if (!vq) > - return dev; > > - memcpy(vq, old_vq, sizeof(*vq)); > + vq = rte_realloc_socket(vq, sizeof(*vq), 0, node); > + if (!vq) { > + VHOST_LOG_CONFIG(ERR, "Failed to realloc virtqueue %d on > node %d\n", > + index, node); > + return dev; > + } > > - if (vq_is_packed(dev)) { > - new_shadow_used_packed = rte_malloc_socket(NULL, > - vq->size * > - sizeof(struct vring_used_elem_packed), > - RTE_CACHE_LINE_SIZE, > - newnode); > - if (new_shadow_used_packed) { > - rte_free(vq->shadow_used_packed); > - vq->shadow_used_packed = new_shadow_used_packed; > - } > - } else { > - new_shadow_used_split = rte_malloc_socket(NULL, > - vq->size * > - sizeof(struct vring_used_elem), > - RTE_CACHE_LINE_SIZE, > - newnode); > - if (new_shadow_used_split) { > - rte_free(vq->shadow_used_split); > - vq->shadow_used_split = new_shadow_used_split; > - } > + if (vq != dev->virtqueue[index]) { > + VHOST_LOG_CONFIG(INFO, "reallocated virtqueue on node %d\n", > node); > + dev->virtqueue[index] = vq; > + vhost_user_iotlb_init(dev, index); > + } > + > + if (vq_is_packed(dev)) { > + struct vring_used_elem_packed *sup; > + > + sup = rte_realloc_socket(vq->shadow_used_packed, vq->size * > sizeof(*sup), > + RTE_CACHE_LINE_SIZE, node); > + if (!sup) { > + VHOST_LOG_CONFIG(ERR, "Failed to realloc shadow packed > 
on > node %d\n", node); > + return dev; > } > + vq->shadow_used_packed = sup; > + } else { > + struct vring_used_elem *sus; > > - new_batch_copy_elems = rte_malloc_socket(NULL, > - vq->size * sizeof(struct batch_c
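For readers following the rework: the simplification hinges on the semantics of rte_realloc_socket(), which copies the data to the requested NUMA node and frees the old buffer on success, while leaving the original allocation untouched on failure. A minimal illustrative wrapper (not part of the patch):

#include <rte_malloc.h>

/* Sketch only: migrate a buffer to "node"; the old buffer stays
 * valid if the reallocation fails, so callers can keep using it --
 * which is why numa_realloc() above can simply "return dev" on
 * error. The same pointer may come back when no move was needed,
 * which the patch uses to detect whether the virtqueue moved. */
static void *
realloc_on_node(void *old, size_t size, int node)
{
	void *new = rte_realloc_socket(old, size, 0, node);

	return new == NULL ? old : new;
}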
Re: [dpdk-dev] [PATCH v6 6/7] vhost: allocate all data on same node as virtqueue
> -Original Message- > From: Maxime Coquelin > Sent: Friday, June 18, 2021 10:04 PM > To: dev@dpdk.org; david.march...@redhat.com; Xia, Chenbo > > Cc: Maxime Coquelin > Subject: [PATCH v6 6/7] vhost: allocate all data on same node as virtqueue > > This patch saves the NUMA node the virtqueue is allocated > on at init time, in order to allocate all other data on the > same node. > > While most of the data are allocated before numa_realloc() > is called and so the data will be reallocated properly, some > data like the log cache are most likely allocated after. > > For the virtio device metadata, we decide to allocate them > on the same node as the VQ 0. > > Signed-off-by: Maxime Coquelin > --- > lib/vhost/vhost.c | 34 -- > lib/vhost/vhost.h | 1 + > lib/vhost/vhost_user.c | 41 - > 3 files changed, 45 insertions(+), 31 deletions(-) > > diff --git a/lib/vhost/vhost.c b/lib/vhost/vhost.c > index c96f6335c8..cd3297 100644 > --- a/lib/vhost/vhost.c > +++ b/lib/vhost/vhost.c > @@ -261,7 +261,7 @@ vhost_alloc_copy_ind_table(struct virtio_net *dev, struct > vhost_virtqueue *vq, > uint64_t src, dst; > uint64_t len, remain = desc_len; > > - idesc = rte_malloc(__func__, desc_len, 0); > + idesc = rte_malloc_socket(__func__, desc_len, 0, vq->numa_node); > if (unlikely(!idesc)) > return NULL; > > @@ -549,6 +549,7 @@ static void > init_vring_queue(struct virtio_net *dev, uint32_t vring_idx) > { > struct vhost_virtqueue *vq; > + int numa_node = SOCKET_ID_ANY; > > if (vring_idx >= VHOST_MAX_VRING) { > VHOST_LOG_CONFIG(ERR, > @@ -570,6 +571,15 @@ init_vring_queue(struct virtio_net *dev, uint32_t > vring_idx) > vq->callfd = VIRTIO_UNINITIALIZED_EVENTFD; > vq->notif_enable = VIRTIO_UNINITIALIZED_NOTIF; > > +#ifdef RTE_LIBRTE_VHOST_NUMA > + if (get_mempolicy(&numa_node, NULL, 0, vq, MPOL_F_NODE | MPOL_F_ADDR)) { > + VHOST_LOG_CONFIG(ERR, "(%d) failed to query numa node: %s\n", > + dev->vid, rte_strerror(errno)); > + numa_node = SOCKET_ID_ANY; > + } > +#endif > + vq->numa_node = numa_node; > + > vhost_user_iotlb_init(dev, vring_idx); > } > > @@ -1616,7 +1626,6 @@ int rte_vhost_async_channel_register(int vid, uint16_t > queue_id, > struct vhost_virtqueue *vq; > struct virtio_net *dev = get_device(vid); > struct rte_vhost_async_features f; > - int node; > > if (dev == NULL || ops == NULL) > return -1; > @@ -1651,20 +1660,9 @@ int rte_vhost_async_channel_register(int vid, uint16_t > queue_id, > goto reg_out; > } > > -#ifdef RTE_LIBRTE_VHOST_NUMA > - if (get_mempolicy(&node, NULL, 0, vq, MPOL_F_NODE | MPOL_F_ADDR)) { > - VHOST_LOG_CONFIG(ERR, > - "unable to get numa information in async register. 
" > - "allocating async buffer memory on the caller thread > node\n"); > - node = SOCKET_ID_ANY; > - } > -#else > - node = SOCKET_ID_ANY; > -#endif > - > vq->async_pkts_info = rte_malloc_socket(NULL, > vq->size * sizeof(struct async_inflight_info), > - RTE_CACHE_LINE_SIZE, node); > + RTE_CACHE_LINE_SIZE, vq->numa_node); > if (!vq->async_pkts_info) { > vhost_free_async_mem(vq); > VHOST_LOG_CONFIG(ERR, > @@ -1675,7 +1673,7 @@ int rte_vhost_async_channel_register(int vid, uint16_t > queue_id, > > vq->it_pool = rte_malloc_socket(NULL, > VHOST_MAX_ASYNC_IT * sizeof(struct rte_vhost_iov_iter), > - RTE_CACHE_LINE_SIZE, node); > + RTE_CACHE_LINE_SIZE, vq->numa_node); > if (!vq->it_pool) { > vhost_free_async_mem(vq); > VHOST_LOG_CONFIG(ERR, > @@ -1686,7 +1684,7 @@ int rte_vhost_async_channel_register(int vid, uint16_t > queue_id, > > vq->vec_pool = rte_malloc_socket(NULL, > VHOST_MAX_ASYNC_VEC * sizeof(struct iovec), > - RTE_CACHE_LINE_SIZE, node); > + RTE_CACHE_LINE_SIZE, vq->numa_node); > if (!vq->vec_pool) { > vhost_free_async_mem(vq); > VHOST_LOG_CONFIG(ERR, > @@ -1698,7 +1696,7 @@ int rte_vhost_async_channel_register(int vid, uint16_t > queue_id, > if (vq_is_packed(dev)) { > vq->async_buffers_packed = rte_malloc_socket(NULL, > vq->size * sizeof(struct vring_used_elem_packed), > - RTE_CACHE_LINE_SIZE, node); > + RTE_CACHE_LINE_SIZE, vq->numa_node); > if (!vq->async_buffers_packed) { > vhost_free_async_mem(vq); > VHOST_LOG_CONFIG(ERR, > @@ -1709,7 +1707,7 @@ int rte_vhost_async_channel_register(int vid, uint16_t >
Re: [dpdk-dev] [dpdk-stable] [PATCH] vdpa/mlx5: fix TSO offload without CSUM
> -Original Message- > From: stable On Behalf Of Xueming Li > Sent: Sunday, June 13, 2021 8:52 PM > Cc: dev@dpdk.org; xuemi...@nvidia.com; ma...@nvidia.com; sta...@dpdk.org; > Viacheslav Ovsiienko ; Maxime Coquelin > > Subject: [dpdk-stable] [PATCH] vdpa/mlx5: fix TSO offload without CSUM > > Packet was corrupted when TSO requested without CSUM update. > > Enables CSUM automatically if only TSO requested. > > Fixes: 2aa8444b0084 ("vdpa/mlx5: support stateless offloads") > Cc: ma...@nvidia.com > Cc: sta...@dpdk.org > > Signed-off-by: Xueming Li > --- > drivers/vdpa/mlx5/mlx5_vdpa_virtq.c | 7 +++ > 1 file changed, 7 insertions(+) > > diff --git a/drivers/vdpa/mlx5/mlx5_vdpa_virtq.c > b/drivers/vdpa/mlx5/mlx5_vdpa_virtq.c > index 024c5c4180..f530646058 100644 > --- a/drivers/vdpa/mlx5/mlx5_vdpa_virtq.c > +++ b/drivers/vdpa/mlx5/mlx5_vdpa_virtq.c > @@ -442,6 +442,13 @@ mlx5_vdpa_virtqs_prepare(struct mlx5_vdpa_priv *priv) > DRV_LOG(ERR, "Failed to configure negotiated features."); > return -1; > } > + if ((priv->features & (1ULL << VIRTIO_NET_F_CSUM)) == 0 && > + ((priv->features & (1ULL << VIRTIO_NET_F_HOST_TSO4)) > 0 || > + (priv->features & (1ULL << VIRTIO_NET_F_HOST_TSO6)) > 0)) { > + /* Packet may be corrupted if TSO is enabled without CSUM. */ > + DRV_LOG(INFO, "TSO is enabled without CSUM, force CSUM."); > + priv->features |= (1ULL << VIRTIO_NET_F_CSUM); > + } > if (nr_vring > priv->caps.max_num_virtio_queues * 2) { > DRV_LOG(ERR, "Do not support more than %d virtqs(%d).", > (int)priv->caps.max_num_virtio_queues * 2, > -- > 2.25.1 Reviewed-by: Chenbo Xia
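Background on why the fix is correct: the virtio spec lists VIRTIO_NET_F_CSUM as a prerequisite of VIRTIO_NET_F_HOST_TSO4/6, so a feature set with TSO but no CSUM violates that dependency and the device may emit corrupted packets. A generic, illustrative form of the check (the helper is a sketch; feature-bit names follow the virtio headers):

#include <stdint.h>

#define VIRTIO_FEATURE_BIT(f)	(1ULL << (f))

/* Sketch only: true when "feat" is enabled without its dependency
 * "dep", e.g. HOST_TSO4 without CSUM; the driver then forces the
 * dependency on, as the patch above does. */
static inline int
virtio_feature_dep_missing(uint64_t features, unsigned int feat,
		unsigned int dep)
{
	return (features & VIRTIO_FEATURE_BIT(feat)) &&
		!(features & VIRTIO_FEATURE_BIT(dep));
}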
Re: [dpdk-dev] Experimental symbols in bbdev lib
On Thu, Jun 24, 2021 at 12:35 PM Kinsella, Ray wrote: > > Hi Nicolas > > The following bbdev experimental symbols are present in both v21.05 and > v19.11 release. These symbols should be considered for promotion to stable as > part of the v22 ABI in DPDK 21.11, as they have been experimental for >= 2yrs > at this point. > > * rte_bbdev_allocate > * rte_bbdev_callback_register > * rte_bbdev_callback_unregister > * rte_bbdev_close > * rte_bbdev_count > * rte_bbdev_dec_op_alloc_bulk > * rte_bbdev_dec_op_free_bulk > * rte_bbdev_dequeue_dec_ops > * rte_bbdev_dequeue_enc_ops > * rte_bbdev_devices > * rte_bbdev_enc_op_alloc_bulk > * rte_bbdev_enc_op_free_bulk > * rte_bbdev_enqueue_dec_ops > * rte_bbdev_enqueue_enc_ops > * rte_bbdev_find_next > * rte_bbdev_get_named_dev > * rte_bbdev_info_get > * rte_bbdev_intr_enable > * rte_bbdev_is_valid > * rte_bbdev_op_pool_create > * rte_bbdev_op_type_str > * rte_bbdev_pmd_callback_process > * rte_bbdev_queue_configure > * rte_bbdev_queue_info_get > * rte_bbdev_queue_intr_ctl > * rte_bbdev_queue_intr_disable > * rte_bbdev_queue_intr_enable > * rte_bbdev_queue_start > * rte_bbdev_queue_stop > * rte_bbdev_release > * rte_bbdev_setup_queues > * rte_bbdev_start > * rte_bbdev_stats_get > * rte_bbdev_stats_reset > * rte_bbdev_stop Regardless of removing the experimental status on this API, part of the symbols listed here are driver-only and should be marked internal. -- David Marchand
Re: [dpdk-dev] Experimental symbols in compressdev lib
On Thu, Jun 24, 2021 at 12:33 PM Kinsella, Ray wrote: > > Hi Fiona & Ashish, > > The following compressdev experimental symbols are present in both v21.05 and > v19.11 release. These symbols should be considered for promotion to stable as > part of the v22 ABI in DPDK 21.11, as they have been experimental for >= 2yrs > at this point. > > * rte_compressdev_capability_get > * rte_compressdev_close > * rte_compressdev_configure > * rte_compressdev_count > * rte_compressdev_dequeue_burst > * rte_compressdev_devices_get > * rte_compressdev_enqueue_burst > * rte_compressdev_get_dev_id > * rte_compressdev_get_feature_name > * rte_compressdev_info_get > * rte_compressdev_name_get > * rte_compressdev_pmd_allocate > * rte_compressdev_pmd_create > * rte_compressdev_pmd_destroy > * rte_compressdev_pmd_get_named_dev > * rte_compressdev_pmd_parse_input_args > * rte_compressdev_pmd_release_device > * rte_compressdev_private_xform_create > * rte_compressdev_private_xform_free > * rte_compressdev_queue_pair_count > * rte_compressdev_queue_pair_setup > * rte_compressdev_socket_id > * rte_compressdev_start > * rte_compressdev_stats_get > * rte_compressdev_stats_reset > * rte_compressdev_stop > * rte_compressdev_stream_create > * rte_compressdev_stream_free > * rte_comp_get_feature_name > * rte_comp_op_alloc > * rte_comp_op_bulk_alloc > * rte_comp_op_bulk_free > * rte_comp_op_free > * rte_comp_op_pool_create > Part of the symbols listed here are driver-only (at least the *_pmd_* symbols) and should be marked internal. -- David Marchand
Re: [dpdk-dev] [PATCH v1] doc: update ABI in MAINTAINERS file
On 6/22/2021 4:50 PM, Ray Kinsella wrote: > Update to ABI MAINTAINERS. > > Signed-off-by: Ray Kinsella > --- > MAINTAINERS | 1 - > 1 file changed, 1 deletion(-) > > diff --git a/MAINTAINERS b/MAINTAINERS > index 5877a16971..dab8883a4f 100644 > --- a/MAINTAINERS > +++ b/MAINTAINERS > @@ -117,7 +117,6 @@ F: .ci/ > > ABI Policy & Versioning > M: Ray Kinsella > -M: Neil Horman > F: lib/eal/include/rte_compat.h > F: lib/eal/include/rte_function_versioning.h > F: doc/guides/contributing/abi_*.rst > Acked-by: Ferruh Yigit I tried to reach out to Neil multiple times about ABI issues, without success.
Re: [dpdk-dev] [PATCH] eal/windows: ensure all the CPUs in the set are checked
2021-06-24 17:27 (UTC-0700), Narcisa Ana Maria Vasile: > From: Narcisa Vasile > > Fix count_cpu() to ensure it iterates through all the CPUs in a set. > count_cpu() iterates through the CPUs in the set 's' and counts the > selected ones. > > Previously, it was incorrectly using the number of CPUSETS to iterate > through the CPUs. > > Signed-off-by: Narcisa Vasile > --- > lib/eal/windows/include/sched.h | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/lib/eal/windows/include/sched.h b/lib/eal/windows/include/sched.h > index ff572b5dcb..bc31cc8465 100644 > --- a/lib/eal/windows/include/sched.h > +++ b/lib/eal/windows/include/sched.h > @@ -49,7 +49,7 @@ count_cpu(rte_cpuset_t *s) > unsigned int _i; > int count = 0; > > - for (_i = 0; _i < _NUM_SETS(CPU_SETSIZE); _i++) > + for (_i = 0; _i < CPU_SETSIZE; _i++) > if (CPU_ISSET(_i, s) != 0LL) > count++; > return count; Hi Naty, Thank you for the fix, but we also need a proper commit message: https://doc.dpdk.org/guides/contributing/patches.html#commit-messages-body Specifically, please, describe what was the observable issue (usually first comes what was wrong, then how it is fixed now) and add "Fixes" tag and Cc. Also, "number of CPUSETS" sounds unclear, as there's no "CPUSET". Suggestion: "number of bitset limbs" or maybe if you describe what was wrong with the result you won't need to describe its reason precisely at all.
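To make the bug concrete: rte_cpuset_t stores CPU_SETSIZE bits as an array of 64-bit "limbs", and _NUM_SETS() yields the limb count, not the CPU count. A small standalone illustration (macros mirrored from the Windows EAL sched.h; the CPU_SETSIZE value is assumed, DPDK derives it from RTE_MAX_LCORE):

#include <stdio.h>

#define CPU_SETSIZE	256	/* illustrative value */
#define _BITS_PER_SET	(sizeof(long long) * 8)
#define _NUM_SETS(b)	(((b) + (_BITS_PER_SET - 1)) / _BITS_PER_SET)

int
main(void)
{
	/* The buggy loop bound visited only indices 0.._NUM_SETS-1
	 * (here 0..3), so CPU_ISSET() was never evaluated for CPUs
	 * with index >= 4 and those CPUs were silently not counted. */
	printf("limbs: %zu, CPUs: %d\n",
			(size_t)_NUM_SETS(CPU_SETSIZE), CPU_SETSIZE);
	return 0;
}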
Re: [dpdk-dev] Experimental symbols in compressdev lib
On 25/06/2021 08:49, David Marchand wrote: > On Thu, Jun 24, 2021 at 12:33 PM Kinsella, Ray wrote: >> >> Hi Fiona & Ashish, >> >> The following compressdev experimental symbols are present in both v21.05 >> and v19.11 release. These symbols should be considered for promotion to >> stable as part of the v22 ABI in DPDK 21.11, as they have been experimental >> for >= 2yrs at this point. >> >> * rte_compressdev_capability_get >> * rte_compressdev_close >> * rte_compressdev_configure >> * rte_compressdev_count >> * rte_compressdev_dequeue_burst >> * rte_compressdev_devices_get >> * rte_compressdev_enqueue_burst >> * rte_compressdev_get_dev_id >> * rte_compressdev_get_feature_name >> * rte_compressdev_info_get >> * rte_compressdev_name_get >> * rte_compressdev_pmd_allocate >> * rte_compressdev_pmd_create >> * rte_compressdev_pmd_destroy >> * rte_compressdev_pmd_get_named_dev >> * rte_compressdev_pmd_parse_input_args >> * rte_compressdev_pmd_release_device >> * rte_compressdev_private_xform_create >> * rte_compressdev_private_xform_free >> * rte_compressdev_queue_pair_count >> * rte_compressdev_queue_pair_setup >> * rte_compressdev_socket_id >> * rte_compressdev_start >> * rte_compressdev_stats_get >> * rte_compressdev_stats_reset >> * rte_compressdev_stop >> * rte_compressdev_stream_create >> * rte_compressdev_stream_free >> * rte_comp_get_feature_name >> * rte_comp_op_alloc >> * rte_comp_op_bulk_alloc >> * rte_comp_op_bulk_free >> * rte_comp_op_free >> * rte_comp_op_pool_create >> > > Part of the symbols listed here are driver-only (at least the *_pmd_* > symbols) and should be marked internal. > +1 agreed.
[dpdk-dev] [PATCH v3 0/5] Enable ETS-based Tx QoS for VF in DCF
This patch enables the ETS-based Tx QoS for IAVF. Kernel tool is used to configure ETS first. DCF is used to set bandwidth limit for VFs of each TC. IAVF is supported to query QoS capability and set queue TC mapping. Traffic Management API is utilized to configure the QoS hierarchy scheduler tree. The scheduler tree will be passed to hardware to enable all above functions. Ting Xu (5): common/iavf: support ETS-based QoS offload configuration net/ice/base: support DCF query port ETS adminq net/ice: support DCF link status event handling net/ice: support QoS config VF bandwidth in DCF net/iavf: query QoS cap and set queue TC mapping drivers/common/iavf/iavf_type.h | 2 + drivers/common/iavf/virtchnl.h | 131 ++ drivers/net/iavf/iavf.h | 45 ++ drivers/net/iavf/iavf_ethdev.c | 31 ++ drivers/net/iavf/iavf_tm.c | 667 + drivers/net/iavf/iavf_vchnl.c| 56 ++- drivers/net/iavf/meson.build | 1 + drivers/net/ice/base/ice_dcb.c | 3 +- drivers/net/ice/ice_dcf.c| 6 +- drivers/net/ice/ice_dcf.h| 53 +++ drivers/net/ice/ice_dcf_ethdev.c | 67 ++- drivers/net/ice/ice_dcf_ethdev.h | 3 + drivers/net/ice/ice_dcf_parent.c | 81 drivers/net/ice/ice_dcf_sched.c | 697 +++ drivers/net/ice/meson.build | 3 +- 15 files changed, 1839 insertions(+), 7 deletions(-) create mode 100644 drivers/net/iavf/iavf_tm.c create mode 100644 drivers/net/ice/ice_dcf_sched.c -- 2.25.1
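For context on how an application drives the hierarchy this series exposes, a minimal rte_tm sketch: add a shaper profile, build a root/TC hierarchy, then commit it so the PMD can program hardware. Node IDs, levels and the rate are illustrative; a real DCF setup would also add VSI/queue nodes, and error handling is trimmed:

#include <rte_tm.h>

/* Sketch only: cap TC 0 of "port_id" at ~1 Gbps. */
static int
tc0_shaping_setup(uint16_t port_id)
{
	struct rte_tm_error err;
	struct rte_tm_shaper_params sp = {
		.peak = { .rate = 125000000 },	/* bytes/s */
	};
	struct rte_tm_node_params np = {
		.shaper_profile_id = RTE_TM_SHAPER_PROFILE_ID_NONE,
	};
	int ret;

	ret = rte_tm_shaper_profile_add(port_id, 1, &sp, &err);
	if (ret != 0)
		return ret;

	/* Root (port-level) node, no parent. */
	np.nonleaf.n_sp_priorities = 1;
	ret = rte_tm_node_add(port_id, 100, RTE_TM_NODE_ID_NULL,
			0, 1, 0, &np, &err);
	if (ret != 0)
		return ret;

	/* TC node under the root, shaped by profile 1. */
	np.shaper_profile_id = 1;
	ret = rte_tm_node_add(port_id, 200, 100, 0, 1, 1, &np, &err);
	if (ret != 0)
		return ret;

	return rte_tm_hierarchy_commit(port_id, 1 /* clear on fail */, &err);
}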
[dpdk-dev] [PATCH v3 1/5] common/iavf: support ETS-based QoS offload configuration
This patch adds new virtchnl opcodes and structures for QoS configuration, which includes: 1. VIRTCHNL_VF_OFFLOAD_TC, to negotiate the capability supporting QoS configuration. If VF and PF both have this flag, then the ETS-based QoS offload function is supported. 2. VIRTCHNL_OP_DCF_CONFIG_BW, DCF is supposed to configure min and max bandwidth for each VF per enabled TCs. To make the VSI node bandwidth configuration work, DCF also needs to configure TC node bandwidth directly. 3. VIRTCHNL_OP_GET_QOS_CAPS, VF queries current QoS configuration, such as enabled TCs, arbiter type, up2tc and bandwidth of VSI node. The configuration is previously set by DCB and DCF, and now is the potential QoS capability of VF. VF can take it as reference to configure queue TC mapping. 4. VIRTCHNL_OP_CONFIG_TC_MAP, set VF queues to TC mapping for all Tx and Rx queues. Queues mapping to one TC should be continuous and all allocated queues should be mapped. Signed-off-by: Ting Xu --- drivers/common/iavf/iavf_type.h | 2 + drivers/common/iavf/virtchnl.h | 131 2 files changed, 133 insertions(+) diff --git a/drivers/common/iavf/iavf_type.h b/drivers/common/iavf/iavf_type.h index f3815d523b..73dfb47e70 100644 --- a/drivers/common/iavf/iavf_type.h +++ b/drivers/common/iavf/iavf_type.h @@ -141,6 +141,8 @@ enum iavf_debug_mask { #define IAVF_PHY_LED_MODE_MASK 0x #define IAVF_PHY_LED_MODE_ORIG 0x8000 +#define IAVF_MAX_TRAFFIC_CLASS 8 + /* Memory types */ enum iavf_memset_type { IAVF_NONDMA_MEM = 0, diff --git a/drivers/common/iavf/virtchnl.h b/drivers/common/iavf/virtchnl.h index 197edce8a1..1cf0866124 100644 --- a/drivers/common/iavf/virtchnl.h +++ b/drivers/common/iavf/virtchnl.h @@ -85,6 +85,10 @@ enum virtchnl_rx_hsplit { VIRTCHNL_RX_HSPLIT_SPLIT_SCTP= 8, }; +enum virtchnl_bw_limit_type { + VIRTCHNL_BW_SHAPER = 0, +}; + #define VIRTCHNL_ETH_LENGTH_OF_ADDRESS 6 /* END GENERIC DEFINES */ @@ -130,6 +134,7 @@ enum virtchnl_ops { VIRTCHNL_OP_ADD_CLOUD_FILTER = 32, VIRTCHNL_OP_DEL_CLOUD_FILTER = 33, /* opcodes 34, 35, 36, and 37 are reserved */ + VIRTCHNL_OP_DCF_CONFIG_BW = 37, VIRTCHNL_OP_DCF_VLAN_OFFLOAD = 38, VIRTCHNL_OP_DCF_CMD_DESC = 39, VIRTCHNL_OP_DCF_CMD_BUFF = 40, @@ -152,6 +157,8 @@ enum virtchnl_ops { VIRTCHNL_OP_DISABLE_VLAN_INSERTION_V2 = 57, VIRTCHNL_OP_ENABLE_VLAN_FILTERING_V2 = 58, VIRTCHNL_OP_DISABLE_VLAN_FILTERING_V2 = 59, + VIRTCHNL_OP_GET_QOS_CAPS = 66, + VIRTCHNL_OP_CONFIG_QUEUE_TC_MAP = 67, VIRTCHNL_OP_ENABLE_QUEUES_V2 = 107, VIRTCHNL_OP_DISABLE_QUEUES_V2 = 108, VIRTCHNL_OP_MAP_QUEUE_VECTOR = 111, @@ -398,6 +405,7 @@ VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_vsi_resource); #define VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC BIT(26) #define VIRTCHNL_VF_OFFLOAD_ADV_RSS_PF BIT(27) #define VIRTCHNL_VF_OFFLOAD_FDIR_PFBIT(28) +#define VIRTCHNL_VF_OFFLOAD_QOSBIT(29) #define VIRTCHNL_VF_CAP_DCFBIT(30) /* BIT(31) is reserved */ @@ -1285,6 +1293,14 @@ struct virtchnl_filter { VIRTCHNL_CHECK_STRUCT_LEN(272, virtchnl_filter); +struct virtchnl_shaper_bw { + /* Unit is Kbps */ + u32 committed; + u32 peak; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(8, virtchnl_shaper_bw); + /* VIRTCHNL_OP_DCF_GET_VSI_MAP * VF sends this message to get VSI mapping table. 
* PF responds with an indirect message containing VF's @@ -1357,6 +1373,37 @@ struct virtchnl_dcf_vlan_offload { VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_dcf_vlan_offload); +struct virtchnl_dcf_bw_cfg { + u8 tc_num; +#define VIRTCHNL_DCF_BW_CIRBIT(0) +#define VIRTCHNL_DCF_BW_PIRBIT(1) + u8 bw_type; + u8 pad[2]; + enum virtchnl_bw_limit_type type; + union { + struct virtchnl_shaper_bw shaper; + u8 pad2[32]; + }; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(40, virtchnl_dcf_bw_cfg); + +/* VIRTCHNL_OP_DCF_CONFIG_BW + * VF send this message to set the bandwidth configuration of each + * TC with a specific vf id. The flag node_type is to indicate that + * this message is to configure VSI node or TC node bandwidth. + */ +struct virtchnl_dcf_bw_cfg_list { + u16 vf_id; + u8 num_elem; +#define VIRTCHNL_DCF_TARGET_TC_BW 0 +#define VIRTCHNL_DCF_TARGET_VF_BW 1 + u8 node_type; + struct virtchnl_dcf_bw_cfg cfg[1]; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(44, virtchnl_dcf_bw_cfg_list); + struct virtchnl_supported_rxdids { /* see enum virtchnl_rx_desc_id_bitmasks */ u64 supported_rxdids; @@ -1768,6 +1815,62 @@ struct virtchnl_fdir_del { VIRTCHNL_CHECK_STRUCT_LEN(12, virtchnl_fdir_del); +/* VIRTCHNL_OP_GET_QOS_CAPS + * VF sends this message to get its QoS Caps, such as + * TC number, Arbiter and Bandwidth. + */ +struct virtchnl_qos_cap_elem { + u8 tc_num; +
[dpdk-dev] [PATCH v3 2/5] net/ice/base: support DCF query port ETS adminq
In the adminq command query port ETS function, the root node teid is needed. However, for DCF, the root node is not initialized, which will cause error when we refer to the variable. In this patch, we will check whether the root node is available or not first. Signed-off-by: Ting Xu --- drivers/net/ice/base/ice_dcb.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ice/base/ice_dcb.c b/drivers/net/ice/base/ice_dcb.c index c73fc095ff..9c9675f6ef 100644 --- a/drivers/net/ice/base/ice_dcb.c +++ b/drivers/net/ice/base/ice_dcb.c @@ -1524,7 +1524,8 @@ ice_aq_query_port_ets(struct ice_port_info *pi, return ICE_ERR_PARAM; cmd = &desc.params.port_ets; ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_query_port_ets); - cmd->port_teid = pi->root->info.node_teid; + if (pi->root) + cmd->port_teid = pi->root->info.node_teid; status = ice_aq_send_cmd(pi->hw, &desc, buf, buf_size, cd); return status; -- 2.25.1
[dpdk-dev] [PATCH v3 3/5] net/ice: support DCF link status event handling
When link status changes, DCF will receive virtchnl PF event message. Add support to handle this event, change link status and update link info. Signed-off-by: Ting Xu --- drivers/net/ice/ice_dcf.h| 6 drivers/net/ice/ice_dcf_ethdev.c | 54 ++-- drivers/net/ice/ice_dcf_parent.c | 51 ++ 3 files changed, 108 insertions(+), 3 deletions(-) diff --git a/drivers/net/ice/ice_dcf.h b/drivers/net/ice/ice_dcf.h index 0cb90b5e9f..587093b909 100644 --- a/drivers/net/ice/ice_dcf.h +++ b/drivers/net/ice/ice_dcf.h @@ -60,6 +60,10 @@ struct ice_dcf_hw { uint16_t nb_msix; uint16_t rxq_map[16]; struct virtchnl_eth_stats eth_stats_offset; + + /* Link status */ + bool link_up; + uint32_t link_speed; }; int ice_dcf_execute_virtchnl_cmd(struct ice_dcf_hw *hw, @@ -77,5 +81,7 @@ int ice_dcf_disable_queues(struct ice_dcf_hw *hw); int ice_dcf_query_stats(struct ice_dcf_hw *hw, struct virtchnl_eth_stats *pstats); int ice_dcf_add_del_all_mac_addr(struct ice_dcf_hw *hw, bool add); +int ice_dcf_link_update(struct rte_eth_dev *dev, + __rte_unused int wait_to_complete); #endif /* _ICE_DCF_H_ */ diff --git a/drivers/net/ice/ice_dcf_ethdev.c b/drivers/net/ice/ice_dcf_ethdev.c index f73dc80bd9..0b40ebbec6 100644 --- a/drivers/net/ice/ice_dcf_ethdev.c +++ b/drivers/net/ice/ice_dcf_ethdev.c @@ -881,11 +881,59 @@ ice_dcf_dev_close(struct rte_eth_dev *dev) return 0; } -static int -ice_dcf_link_update(__rte_unused struct rte_eth_dev *dev, +int +ice_dcf_link_update(struct rte_eth_dev *dev, __rte_unused int wait_to_complete) { - return 0; + struct ice_dcf_adapter *ad = dev->data->dev_private; + struct ice_dcf_hw *hw = &ad->real_hw; + struct rte_eth_link new_link; + + memset(&new_link, 0, sizeof(new_link)); + + /* Only read status info stored in VF, and the info is updated +* when receive LINK_CHANGE event from PF by virtchnl. +*/ + switch (hw->link_speed) { + case 10: + new_link.link_speed = ETH_SPEED_NUM_10M; + break; + case 100: + new_link.link_speed = ETH_SPEED_NUM_100M; + break; + case 1000: + new_link.link_speed = ETH_SPEED_NUM_1G; + break; + case 10000: + new_link.link_speed = ETH_SPEED_NUM_10G; + break; + case 20000: + new_link.link_speed = ETH_SPEED_NUM_20G; + break; + case 25000: + new_link.link_speed = ETH_SPEED_NUM_25G; + break; + case 40000: + new_link.link_speed = ETH_SPEED_NUM_40G; + break; + case 50000: + new_link.link_speed = ETH_SPEED_NUM_50G; + break; + case 100000: + new_link.link_speed = ETH_SPEED_NUM_100G; + break; + default: + new_link.link_speed = ETH_SPEED_NUM_NONE; + break; + } + + new_link.link_duplex = ETH_LINK_FULL_DUPLEX; + new_link.link_status = hw->link_up ?
ETH_LINK_UP : +ETH_LINK_DOWN; + new_link.link_autoneg = !(dev->data->dev_conf.link_speeds & + ETH_LINK_SPEED_FIXED); + + return rte_eth_linkstatus_set(dev, &new_link); } /* Add UDP tunneling port */ diff --git a/drivers/net/ice/ice_dcf_parent.c b/drivers/net/ice/ice_dcf_parent.c index 19420a0f58..788f6dd2a0 100644 --- a/drivers/net/ice/ice_dcf_parent.c +++ b/drivers/net/ice/ice_dcf_parent.c @@ -178,6 +178,44 @@ start_vsi_reset_thread(struct ice_dcf_hw *dcf_hw, bool vfr, uint16_t vf_id) } } +static uint32_t +ice_dcf_convert_link_speed(enum virtchnl_link_speed virt_link_speed) +{ + uint32_t speed; + + switch (virt_link_speed) { + case VIRTCHNL_LINK_SPEED_100MB: + speed = 100; + break; + case VIRTCHNL_LINK_SPEED_1GB: + speed = 1000; + break; + case VIRTCHNL_LINK_SPEED_10GB: + speed = 10000; + break; + case VIRTCHNL_LINK_SPEED_40GB: + speed = 40000; + break; + case VIRTCHNL_LINK_SPEED_20GB: + speed = 20000; + break; + case VIRTCHNL_LINK_SPEED_25GB: + speed = 25000; + break; + case VIRTCHNL_LINK_SPEED_2_5GB: + speed = 2500; + break; + case VIRTCHNL_LINK_SPEED_5GB: + speed = 5000; + break; + default: + speed = 0; + break; + } + + return speed; +} + void ice_dcf_handle_pf_event_msg(struct ice_dcf_hw *dcf_hw, uint8_t *msg, uint16_t msglen) @@ -196,6 +234,19 @@ ice_dcf_handle_pf_event_msg(struct ice_dcf_hw *dcf_hw,
[dpdk-dev] [PATCH v3 4/5] net/ice: support QoS config VF bandwidth in DCF
This patch supports the ETS-based QoS configuration. It enables the DCF to configure bandwidth limits for each VF VSI of different TCs. A hierarchy scheduler tree is built with port, TC and VSI nodes. Signed-off-by: Qiming Yang Signed-off-by: Ting Xu --- drivers/net/ice/ice_dcf.c| 6 +- drivers/net/ice/ice_dcf.h| 47 +++ drivers/net/ice/ice_dcf_ethdev.c | 13 + drivers/net/ice/ice_dcf_ethdev.h | 3 + drivers/net/ice/ice_dcf_parent.c | 30 ++ drivers/net/ice/ice_dcf_sched.c | 697 +++ drivers/net/ice/meson.build | 3 +- 7 files changed, 797 insertions(+), 2 deletions(-) create mode 100644 drivers/net/ice/ice_dcf_sched.c diff --git a/drivers/net/ice/ice_dcf.c b/drivers/net/ice/ice_dcf.c index d72a6f357e..4ff2216a5c 100644 --- a/drivers/net/ice/ice_dcf.c +++ b/drivers/net/ice/ice_dcf.c @@ -235,7 +235,8 @@ ice_dcf_get_vf_resource(struct ice_dcf_hw *hw) caps = VIRTCHNL_VF_OFFLOAD_WB_ON_ITR | VIRTCHNL_VF_OFFLOAD_RX_POLLING | VIRTCHNL_VF_CAP_ADV_LINK_SPEED | VIRTCHNL_VF_CAP_DCF | VIRTCHNL_VF_OFFLOAD_VLAN_V2 | - VF_BASE_MODE_OFFLOADS | VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC; + VF_BASE_MODE_OFFLOADS | VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC | + VIRTCHNL_VF_OFFLOAD_QOS; err = ice_dcf_send_cmd_req_no_irq(hw, VIRTCHNL_OP_GET_VF_RESOURCES, (uint8_t *)&caps, sizeof(caps)); @@ -668,6 +669,9 @@ ice_dcf_init_hw(struct rte_eth_dev *eth_dev, struct ice_dcf_hw *hw) } } + if (hw->vf_res->vf_cap_flags & VIRTCHNL_VF_OFFLOAD_QOS) + ice_dcf_tm_conf_init(eth_dev); + hw->eth_dev = eth_dev; rte_intr_callback_register(&pci_dev->intr_handle, ice_dcf_dev_interrupt_handler, hw); diff --git a/drivers/net/ice/ice_dcf.h b/drivers/net/ice/ice_dcf.h index 587093b909..e74e5d7e81 100644 --- a/drivers/net/ice/ice_dcf.h +++ b/drivers/net/ice/ice_dcf.h @@ -6,6 +6,7 @@ #define _ICE_DCF_H_ #include +#include #include #include @@ -30,6 +31,49 @@ struct dcf_virtchnl_cmd { volatile int pending; }; +struct ice_dcf_tm_shaper_profile { + TAILQ_ENTRY(ice_dcf_tm_shaper_profile) node; + uint32_t shaper_profile_id; + uint32_t reference_count; + struct rte_tm_shaper_params profile; +}; + +TAILQ_HEAD(ice_dcf_shaper_profile_list, ice_dcf_tm_shaper_profile); + +/* Struct to store Traffic Manager node configuration. */ +struct ice_dcf_tm_node { + TAILQ_ENTRY(ice_dcf_tm_node) node; + uint32_t id; + uint32_t tc; + uint32_t priority; + uint32_t weight; + uint32_t reference_count; + struct ice_dcf_tm_node *parent; + struct ice_dcf_tm_shaper_profile *shaper_profile; + struct rte_tm_node_params params; +}; + +TAILQ_HEAD(ice_dcf_tm_node_list, ice_dcf_tm_node); + +/* node type of Traffic Manager */ +enum ice_dcf_tm_node_type { + ICE_DCF_TM_NODE_TYPE_PORT, + ICE_DCF_TM_NODE_TYPE_TC, + ICE_DCF_TM_NODE_TYPE_VSI, + ICE_DCF_TM_NODE_TYPE_MAX, +}; + +/* Struct to store all the Traffic Manager configuration. 
*/ +struct ice_dcf_tm_conf { + struct ice_dcf_shaper_profile_list shaper_profile_list; + struct ice_dcf_tm_node *root; /* root node - port */ + struct ice_dcf_tm_node_list tc_list; /* node list for all the TCs */ + struct ice_dcf_tm_node_list vsi_list; /* node list for all the queues */ + uint32_t nb_tc_node; + uint32_t nb_vsi_node; + bool committed; +}; + struct ice_dcf_hw { struct iavf_hw avf; @@ -45,6 +89,8 @@ struct ice_dcf_hw { uint16_t *vf_vsi_map; uint16_t pf_vsi_id; + struct ice_dcf_tm_conf tm_conf; + struct ice_aqc_port_ets_elem *ets_config; struct virtchnl_version_info virtchnl_version; struct virtchnl_vf_resource *vf_res; /* VF resource */ struct virtchnl_vsi_resource *vsi_res; /* LAN VSI */ @@ -83,5 +129,6 @@ int ice_dcf_query_stats(struct ice_dcf_hw *hw, int ice_dcf_add_del_all_mac_addr(struct ice_dcf_hw *hw, bool add); int ice_dcf_link_update(struct rte_eth_dev *dev, __rte_unused int wait_to_complete); +void ice_dcf_tm_conf_init(struct rte_eth_dev *dev); #endif /* _ICE_DCF_H_ */ diff --git a/drivers/net/ice/ice_dcf_ethdev.c b/drivers/net/ice/ice_dcf_ethdev.c index 0b40ebbec6..cab7c4da87 100644 --- a/drivers/net/ice/ice_dcf_ethdev.c +++ b/drivers/net/ice/ice_dcf_ethdev.c @@ -994,6 +994,18 @@ ice_dcf_dev_udp_tunnel_port_del(struct rte_eth_dev *dev, return ret; } +static int +ice_dcf_tm_ops_get(struct rte_eth_dev *dev __rte_unused, + void *arg) +{ + if (!arg) + return -EINVAL; + + *(const void **)arg = &ice_dcf_tm_ops; + + return 0; +} + static const struct eth_dev_ops ice_dcf_eth_dev_ops = { .dev_start = ice_dcf_dev_start, .dev_stop= ic
[dpdk-dev] [PATCH v3 5/5] net/iavf: query QoS cap and set queue TC mapping
This patch added the support for VF to config the ETS-based Tx QoS, including querying current QoS configuration from PF and config queue TC mapping. PF QoS is configured in advance and the queried info is provided to the user for future usage. VF queues are mapped to different TCs in PF through virtchnl. Signed-off-by: Qiming Yang Signed-off-by: Ting Xu --- drivers/net/iavf/iavf.h| 45 +++ drivers/net/iavf/iavf_ethdev.c | 31 ++ drivers/net/iavf/iavf_tm.c | 667 + drivers/net/iavf/iavf_vchnl.c | 56 ++- drivers/net/iavf/meson.build | 1 + 5 files changed, 799 insertions(+), 1 deletion(-) create mode 100644 drivers/net/iavf/iavf_tm.c diff --git a/drivers/net/iavf/iavf.h b/drivers/net/iavf/iavf.h index 4f5811ae87..77ddf15f42 100644 --- a/drivers/net/iavf/iavf.h +++ b/drivers/net/iavf/iavf.h @@ -6,6 +6,8 @@ #define _IAVF_ETHDEV_H_ #include +#include + #include #include #include @@ -82,6 +84,8 @@ #define IAVF_RX_DESC_EXT_STATUS_FLEXBH_MASK 0x03 #define IAVF_RX_DESC_EXT_STATUS_FLEXBH_FD_ID 0x01 +#define IAVF_BITS_PER_BYTE 8 + struct iavf_adapter; struct iavf_rx_queue; struct iavf_tx_queue; @@ -129,6 +133,38 @@ enum iavf_aq_result { IAVF_MSG_CMD, /* Read async command result */ }; +/* Struct to store Traffic Manager node configuration. */ +struct iavf_tm_node { + TAILQ_ENTRY(iavf_tm_node) node; + uint32_t id; + uint32_t tc; + uint32_t priority; + uint32_t weight; + uint32_t reference_count; + struct iavf_tm_node *parent; + struct rte_tm_node_params params; +}; + +TAILQ_HEAD(iavf_tm_node_list, iavf_tm_node); + +/* node type of Traffic Manager */ +enum iavf_tm_node_type { + IAVF_TM_NODE_TYPE_PORT, + IAVF_TM_NODE_TYPE_TC, + IAVF_TM_NODE_TYPE_QUEUE, + IAVF_TM_NODE_TYPE_MAX, +}; + +/* Struct to store all the Traffic Manager configuration. */ +struct iavf_tm_conf { + struct iavf_tm_node *root; /* root node - vf vsi */ + struct iavf_tm_node_list tc_list; /* node list for all the TCs */ + struct iavf_tm_node_list queue_list; /* node list for all the queues */ + uint32_t nb_tc_node; + uint32_t nb_queue_node; + bool committed; +}; + /* Structure to store private data specific for VF instance. 
*/ struct iavf_info { uint16_t num_queue_pairs; @@ -175,6 +211,9 @@ struct iavf_info { struct iavf_fdir_info fdir; /* flow director info */ /* indicate large VF support enabled or not */ bool lv_enabled; + + struct virtchnl_qos_cap_list *qos_cap; + struct iavf_tm_conf tm_conf; }; #define IAVF_MAX_PKT_TYPE 1024 @@ -344,4 +383,10 @@ int iavf_add_del_mc_addr_list(struct iavf_adapter *adapter, uint32_t mc_addrs_num, bool add); int iavf_request_queues(struct iavf_adapter *adapter, uint16_t num); int iavf_get_max_rss_queue_region(struct iavf_adapter *adapter); +int iavf_get_qos_cap(struct iavf_adapter *adapter); +int iavf_set_q_tc_map(struct rte_eth_dev *dev, + struct virtchnl_queue_tc_mapping *q_tc_mapping, + uint16_t size); +void iavf_tm_conf_init(struct rte_eth_dev *dev); +extern const struct rte_tm_ops iavf_tm_ops; #endif /* _IAVF_ETHDEV_H_ */ diff --git a/drivers/net/iavf/iavf_ethdev.c b/drivers/net/iavf/iavf_ethdev.c index 5290588b17..c8ee1a834b 100644 --- a/drivers/net/iavf/iavf_ethdev.c +++ b/drivers/net/iavf/iavf_ethdev.c @@ -122,6 +122,7 @@ static int iavf_dev_flow_ops_get(struct rte_eth_dev *dev, static int iavf_set_mc_addr_list(struct rte_eth_dev *dev, struct rte_ether_addr *mc_addrs, uint32_t mc_addrs_num); +static int iavf_tm_ops_get(struct rte_eth_dev *dev __rte_unused, void *arg); static const struct rte_pci_id pci_id_iavf_map[] = { { RTE_PCI_DEVICE(IAVF_INTEL_VENDOR_ID, IAVF_DEV_ID_ADAPTIVE_VF) }, @@ -200,8 +201,21 @@ static const struct eth_dev_ops iavf_eth_dev_ops = { .flow_ops_get = iavf_dev_flow_ops_get, .tx_done_cleanup= iavf_dev_tx_done_cleanup, .get_monitor_addr = iavf_get_monitor_addr, + .tm_ops_get = iavf_tm_ops_get, }; +static int +iavf_tm_ops_get(struct rte_eth_dev *dev __rte_unused, + void *arg) +{ + if (!arg) + return -EINVAL; + + *(const void **)arg = &iavf_tm_ops; + + return 0; +} + static int iavf_set_mc_addr_list(struct rte_eth_dev *dev, struct rte_ether_addr *mc_addrs, @@ -806,6 +820,11 @@ iavf_dev_start(struct rte_eth_dev *dev) dev->data->nb_tx_queues); num_queue_pairs = vf->num_queue_pairs; + if (iavf_get_qos_cap(adapter)) { + PMD_INIT_LOG(ERR, "Failed to get qos capability"); + return -1; + } + if (iavf_init_queues(dev) != 0) { PMD_DRV_LOG(ERR, "failed t
[dpdk-dev] Re: [PATCH v1 1/2] net/i40e: improve performance for scalar Tx
> > int n = txq->tx_rs_thresh;
> > int32_t i = 0, j = 0;
> > const int32_t k = RTE_ALIGN_FLOOR(n, RTE_I40E_TX_MAX_FREE_BUF_SZ);
> > const int32_t m = n % RTE_I40E_TX_MAX_FREE_BUF_SZ;
> > struct rte_mbuf *free[RTE_I40E_TX_MAX_FREE_BUF_SZ];
> >
> > For FAST_FREE_MODE:
> >
> > if (k) {
> > 	for (j = 0; j != k - RTE_I40E_TX_MAX_FREE_BUF_SZ;
> > 			j += RTE_I40E_TX_MAX_FREE_BUF_SZ) {
> > 		for (i = 0; i < RTE_I40E_TX_MAX_FREE_BUF_SZ; ++i, ++txep) {
> > 			free[i] = txep->mbuf;
> > 			txep->mbuf = NULL;
> > 		}
> > 		rte_mempool_put_bulk(free[0]->pool, (void **)free,
> > 				RTE_I40E_TX_MAX_FREE_BUF_SZ);
> > 	}
> > }
> >
> > if (m) {
> > 	for (i = 0; i < m; ++i, ++txep) {
> > 		free[i] = txep->mbuf;
> > 		txep->mbuf = NULL;
> > 	}
> > 	rte_mempool_put_bulk(free[0]->pool, (void **)free, m);
> > }
>
> Seems no logical problem, but the code looks heavy due to the for loops.
> Did you run performance with this change when tx_rs_thresh >
> RTE_I40E_TX_MAX_FREE_BUF_SZ?

Sorry for my late reply. It took me some time to run the tests for this path, and the following are my results:

First, I came up with another way to solve this bug and compared it with the "loop" approach (size of 'free' is 64). That is, set the size of 'free' to a large constant. We know: tx_rs_thresh < ring_desc_size < I40E_MAX_RING_DESC (4096), so we can directly define:
struct rte_mbuf *free[I40E_MAX_RING_DESC];

[1] Test Config:
MRR Test: two ports & bi-directional flows & one core
RX API: i40e_recv_pkts_bulk_alloc
TX API: i40e_xmit_pkts_simple
ring_descs_size: 1024
Ring_I40E_TX_MAX_FREE_SZ: 64

[2] Scheme:
tx_rs_thresh = I40E_DEFAULT_TX_RSBIT_THRESH
tx_free_thresh = I40E_DEFAULT_TX_FREE_THRESH
tx_rs_thresh <= tx_free_thresh < nb_tx_desc
So we change the value of 'tx_rs_thresh' by adjusting I40E_DEFAULT_TX_RSBIT_THRESH.

[3] Test Results (performance improvement):

In X86:
tx_rs_thresh / tx_free_thresh               32/32   256/256  512/512
1. mempool_put (base)                       0       0        0
2. mempool_put_bulk: loop                   +4.7%   +5.6%    +7.0%
3. mempool_put_bulk: large size for 'free'  +3.8%   +2.3%    -2.0%
   (free[I40E_MAX_RING_DESC])

In Arm:
N1SDP:
tx_rs_thresh / tx_free_thresh               32/32   256/256  512/512
1. mempool_put (base)                       0       0        0
2. mempool_put_bulk: loop                   +7.9%   +9.1%    +2.9%
3. mempool_put_bulk: large size for 'free'  +7.1%   +8.7%    +3.4%
   (free[I40E_MAX_RING_DESC])

Thunderx2:
tx_rs_thresh / tx_free_thresh               32/32   256/256  512/512
1. mempool_put (base)                       0       0        0
2. mempool_put_bulk: loop                   +7.6%   +10.5%   +7.6%
3. mempool_put_bulk: large size for 'free'  +1.7%   +18.4%   +10.2%
   (free[I40E_MAX_RING_DESC])

As a result, I feel the 'loop' variant is better, and according to the test it does not seem very heavy. What are your views? Looking forward to your reply. Thanks a lot.
[dpdk-dev] [PATCH RESEND] eal: allow hugetlbfs sub-directories
get_hugepage_dir() was implemented in such a way that a --huge-dir option had to exactly match the mountpoint, but there's no reason for this restriction. Fix the implementation to allow a sub-directory within a suitable hugetlbfs mountpoint to be specified. Signed-off-by: John Levon --- lib/eal/linux/eal_hugepage_info.c | 25 +++-- 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/lib/eal/linux/eal_hugepage_info.c b/lib/eal/linux/eal_hugepage_info.c index d97792cad..d7e9918f8 100644 --- a/lib/eal/linux/eal_hugepage_info.c +++ b/lib/eal/linux/eal_hugepage_info.c @@ -226,16 +226,29 @@ get_hugepage_dir(uint64_t hugepage_sz, char *hugedir, int len) default_size = get_default_hp_size(); while (fgets(buf, sizeof(buf), fd)){ + const char *dir; + if (rte_strsplit(buf, sizeof(buf), splitstr, _FIELDNAME_MAX, split_tok) != _FIELDNAME_MAX) { RTE_LOG(ERR, EAL, "Error parsing %s\n", proc_mounts); break; /* return NULL */ } - /* we have a specified --huge-dir option, only examine that dir */ - if (internal_conf->hugepage_dir != NULL && - strcmp(splitstr[MOUNTPT], internal_conf->hugepage_dir) != 0) - continue; + dir = splitstr[MOUNTPT]; + + /* +* If a --huge-dir option has been specified, only examine +* mounts that contain that directory, and make sure to return +* the directory, not the mount. +*/ + if (internal_conf->hugepage_dir != NULL) { + if (strncmp(internal_conf->hugepage_dir, + splitstr[MOUNTPT], + strlen(splitstr[MOUNTPT])) != 0) + continue; + + dir = internal_conf->hugepage_dir; + } if (strncmp(splitstr[FSTYPE], hugetlbfs_str, htlbfs_str_len) == 0){ const char *pagesz_str = strstr(splitstr[OPTIONS], pagesize_opt); @@ -243,7 +256,7 @@ get_hugepage_dir(uint64_t hugepage_sz, char *hugedir, int len) /* if no explicit page size, the default page size is compared */ if (pagesz_str == NULL){ if (hugepage_sz == default_size){ - strlcpy(hugedir, splitstr[MOUNTPT], len); + strlcpy(hugedir, dir, len); retval = 0; break; } @@ -252,7 +265,7 @@ get_hugepage_dir(uint64_t hugepage_sz, char *hugedir, int len) else { uint64_t pagesz = rte_str_to_size(&pagesz_str[pagesize_opt_len]); if (pagesz == hugepage_sz) { - strlcpy(hugedir, splitstr[MOUNTPT], len); + strlcpy(hugedir, dir, len); retval = 0; break; } -- 2.25.1
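In effect, the new rule is a prefix comparison between the configured --huge-dir and each hugetlbfs mountpoint; a condensed sketch (paths illustrative):

#include <stdio.h>
#include <string.h>

int
main(void)
{
	const char *mountpt = "/dev/hugepages";		/* from /proc/mounts */
	const char *huge_dir = "/dev/hugepages/app1";	/* --huge-dir value */

	/* Same test as the patch: accept huge_dir when the mountpoint
	 * is a prefix of it, then return huge_dir itself, not the mount. */
	if (strncmp(huge_dir, mountpt, strlen(mountpt)) == 0)
		printf("using %s inside mount %s\n", huge_dir, mountpt);

	return 0;
}

One caveat worth noting: a bare prefix test also matches sibling paths such as /dev/hugepages2, so a stricter variant would additionally require the character following the prefix to be '/' or NUL.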
Re: [dpdk-dev] [RFC] bus/auxiliary: introduce auxiliary bus
> -Original Message- > From: Stephen Hemminger > Sent: Friday, June 25, 2021 12:34 PM > To: Xueming(Steven) Li > Cc: NBU-Contact-Thomas Monjalon ; dev@dpdk.org; Parav > Pandit ; Ray Kinsella > ; Neil Horman > Subject: Re: [dpdk-dev] [RFC] bus/auxiliary: introduce auxiliary bus > > On Thu, 11 Mar 2021 21:01:13 +0800 > Xueming Li wrote: > > > + AUXILIAR_LOG(DEBUG, "Auxiliary device %s on NUMA socket %i\n", > > + dev->name, dev->device.numa_node); > > Your log messages will be double spaced. > The macro is already adding a new line. Thanks, I will update this in the next version. > > > + > > +#define AUXILIAR_LOG(level, fmt, args...) \ > > + rte_log(RTE_LOG_ ## level, auxiliary_logtype_bus, "%s(): " fmt "\n", \ > > + __func__, ##args)
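For clarity, the double spacing Stephen points out comes from the format string already carrying a newline; the fix is simply to drop '\n' from call sites. A self-contained sketch (macro quoted from the RFC; the logtype declaration and wrapper function are illustrative):

#include <rte_log.h>

extern int auxiliary_logtype_bus;	/* registered elsewhere in the RFC */

/* Macro as in the RFC: it already appends "\n" to every message. */
#define AUXILIAR_LOG(level, fmt, args...) \
	rte_log(RTE_LOG_ ## level, auxiliary_logtype_bus, "%s(): " fmt "\n", \
		__func__, ##args)

/* Call sites therefore must not add their own newline: */
static void
log_device(const char *name, int numa_node)
{
	AUXILIAR_LOG(DEBUG, "Auxiliary device %s on NUMA socket %i",
			name, numa_node);
}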
Re: [dpdk-dev] [PATCH v6 4/7] vhost: fix NUMA reallocation with multiqueue
Hi Maxime, > -Original Message- > From: stable On Behalf Of Xia, Chenbo > Sent: Friday, June 25, 2021 10:56 AM > To: Maxime Coquelin ; dev@dpdk.org; > david.march...@redhat.com > Cc: sta...@dpdk.org > Subject: Re: [dpdk-stable] [PATCH v6 4/7] vhost: fix NUMA reallocation > with multiqueue > > Hi Maxime, > > > -Original Message- > > From: Maxime Coquelin > > Sent: Friday, June 18, 2021 10:04 PM > > To: dev@dpdk.org; david.march...@redhat.com; Xia, Chenbo > > > Cc: Maxime Coquelin ; sta...@dpdk.org > > Subject: [PATCH v6 4/7] vhost: fix NUMA reallocation with multiqueue > > > > Since the Vhost-user device initialization has been reworked, > > enabling the application to start using the device as soon as > > the first queue pair is ready, NUMA reallocation no more > > happened on queue pairs other than the first one since > > numa_realloc() was returning early if the device was running. > > > > This patch fixes this issue by only preventing the device > > metadata to be allocated if the device is running. For the > > virtqueues, a vring state change notification is sent to > > notify the application of its disablement. Since the callback > > is supposed to be blocking, it is safe to reallocate it > > afterwards. > > > > Fixes: d0fcc38f5fa4 ("vhost: improve device readiness notifications") > > Cc: sta...@dpdk.org > > > > Signed-off-by: Maxime Coquelin > > --- > > lib/vhost/vhost_user.c | 13 ++--- > > 1 file changed, 10 insertions(+), 3 deletions(-) > > > > diff --git a/lib/vhost/vhost_user.c b/lib/vhost/vhost_user.c > > index 82adf80fe5..51b96a0716 100644 > > --- a/lib/vhost/vhost_user.c > > +++ b/lib/vhost/vhost_user.c > > @@ -488,12 +488,16 @@ numa_realloc(struct virtio_net *dev, int index) > > struct batch_copy_elem *new_batch_copy_elems; > > int ret; > > > > - if (dev->flags & VIRTIO_DEV_RUNNING) > > - return dev; > > - > > old_dev = dev; > > vq = old_vq = dev->virtqueue[index]; > > > > + /* > > +* If VQ is ready, it is too late to reallocate, it certainly > already > > +* happened anyway on VHOST_USER_SET_VRING_ADRR. > > +*/ > > + if (vq->ready) > > + return dev; > > + > > ret = get_mempolicy(&newnode, NULL, 0, old_vq->desc, > > MPOL_F_NODE | MPOL_F_ADDR); > > > > @@ -558,6 +562,9 @@ numa_realloc(struct virtio_net *dev, int index) > > rte_free(old_vq); > > } > > > > + if (dev->flags & VIRTIO_DEV_RUNNING) > > + goto out; > > + > > Since we don't realloc when vq is ready, there is no case that vq not > ready but > device still running, right? Sorry, I forgot DEV_RUNNING now only requires 1 qpair ready now ☹ Ignore above comments.. Thanks, Chenbo > > Thanks, > Chenbo > > > /* check if we need to reallocate dev */ > > ret = get_mempolicy(&oldnode, NULL, 0, old_dev, > > MPOL_F_NODE | MPOL_F_ADDR); > > -- > > 2.31.1
[dpdk-dev] [PATCH v6 1/2] devargs: add common key definition
Adds common devargs key definition for "bus", "class" and "driver". Acked-by: Thomas Monjalon Signed-off-by: Xueming Li --- drivers/common/mlx5/mlx5_common.h | 2 -- drivers/common/mlx5/mlx5_common_pci.c | 2 +- drivers/common/sfc_efx/sfc_efx.c| 7 +++ drivers/common/sfc_efx/sfc_efx.h| 2 -- drivers/net/bonding/rte_eth_bond_args.c | 2 +- drivers/net/i40e/i40e_ethdev_vf.c | 5 ++--- drivers/net/iavf/iavf_ethdev.c | 5 ++--- drivers/net/mlx5/mlx5.c | 4 ++-- drivers/net/sfc/sfc_kvargs.c| 2 +- drivers/vdpa/mlx5/mlx5_vdpa.c | 2 +- lib/eal/common/eal_common_devargs.c | 12 ++-- lib/eal/include/rte_devargs.h | 24 12 files changed, 43 insertions(+), 26 deletions(-) diff --git a/drivers/common/mlx5/mlx5_common.h b/drivers/common/mlx5/mlx5_common.h index 1fbefe0fa6..306f2f1ab7 100644 --- a/drivers/common/mlx5/mlx5_common.h +++ b/drivers/common/mlx5/mlx5_common.h @@ -208,8 +208,6 @@ __rte_internal int mlx5_get_ifname_sysfs(const char *ibdev_path, char *ifname); -#define MLX5_CLASS_ARG_NAME "class" - enum mlx5_class { MLX5_CLASS_INVALID, MLX5_CLASS_NET = RTE_BIT64(0), diff --git a/drivers/common/mlx5/mlx5_common_pci.c b/drivers/common/mlx5/mlx5_common_pci.c index 3f16cd21cf..34747c4e07 100644 --- a/drivers/common/mlx5/mlx5_common_pci.c +++ b/drivers/common/mlx5/mlx5_common_pci.c @@ -118,7 +118,7 @@ bus_cmdline_options_handler(__rte_unused const char *key, static int parse_class_options(const struct rte_devargs *devargs) { - const char *key = MLX5_CLASS_ARG_NAME; + const char *key = RTE_DEVARGS_KEY_CLASS; struct rte_kvargs *kvlist; int ret = 0; diff --git a/drivers/common/sfc_efx/sfc_efx.c b/drivers/common/sfc_efx/sfc_efx.c index 0b78933d9f..2dc5545760 100644 --- a/drivers/common/sfc_efx/sfc_efx.c +++ b/drivers/common/sfc_efx/sfc_efx.c @@ -42,7 +42,6 @@ enum sfc_efx_dev_class sfc_efx_dev_class_get(struct rte_devargs *devargs) { struct rte_kvargs *kvargs; - const char *key = SFC_EFX_KVARG_DEV_CLASS; enum sfc_efx_dev_class dev_class = SFC_EFX_DEV_CLASS_NET; if (devargs == NULL) @@ -52,9 +51,9 @@ sfc_efx_dev_class_get(struct rte_devargs *devargs) if (kvargs == NULL) return dev_class; - if (rte_kvargs_count(kvargs, key) != 0) { - rte_kvargs_process(kvargs, key, sfc_efx_kvarg_dev_class_handler, - &dev_class); + if (rte_kvargs_count(kvargs, RTE_DEVARGS_KEY_CLASS) != 0) { + rte_kvargs_process(kvargs, RTE_DEVARGS_KEY_CLASS, + sfc_efx_kvarg_dev_class_handler, &dev_class); } rte_kvargs_free(kvargs); diff --git a/drivers/common/sfc_efx/sfc_efx.h b/drivers/common/sfc_efx/sfc_efx.h index 6b6164cb1f..c16eca60f3 100644 --- a/drivers/common/sfc_efx/sfc_efx.h +++ b/drivers/common/sfc_efx/sfc_efx.h @@ -19,8 +19,6 @@ extern "C" { #endif -#define SFC_EFX_KVARG_DEV_CLASS"class" - enum sfc_efx_dev_class { SFC_EFX_DEV_CLASS_INVALID = 0, SFC_EFX_DEV_CLASS_NET, diff --git a/drivers/net/bonding/rte_eth_bond_args.c b/drivers/net/bonding/rte_eth_bond_args.c index 764b1b8c8e..5406e1c934 100644 --- a/drivers/net/bonding/rte_eth_bond_args.c +++ b/drivers/net/bonding/rte_eth_bond_args.c @@ -18,7 +18,7 @@ const char *pmd_bond_init_valid_arguments[] = { PMD_BOND_SOCKET_ID_KVARG, PMD_BOND_MAC_ADDR_KVARG, PMD_BOND_AGG_MODE_KVARG, - "driver", + RTE_DEVARGS_KEY_DRIVER, NULL }; diff --git a/drivers/net/i40e/i40e_ethdev_vf.c b/drivers/net/i40e/i40e_ethdev_vf.c index 385ebedcd3..0cfe13b7b2 100644 --- a/drivers/net/i40e/i40e_ethdev_vf.c +++ b/drivers/net/i40e/i40e_ethdev_vf.c @@ -1660,7 +1660,6 @@ static int i40evf_driver_selected(struct rte_devargs *devargs) { struct rte_kvargs *kvlist; - const char *key = "driver"; int ret = 0; if (devargs == NULL) 
@@ -1670,13 +1669,13 @@ i40evf_driver_selected(struct rte_devargs *devargs) if (kvlist == NULL) return 0; - if (!rte_kvargs_count(kvlist, key)) + if (!rte_kvargs_count(kvlist, RTE_DEVARGS_KEY_DRIVER)) goto exit; /* i40evf driver selected when there's a key-value pair: * driver=i40evf */ - if (rte_kvargs_process(kvlist, key, + if (rte_kvargs_process(kvlist, RTE_DEVARGS_KEY_DRIVER, i40evf_check_driver_handler, NULL) < 0) goto exit; diff --git a/drivers/net/iavf/iavf_ethdev.c b/drivers/net/iavf/iavf_ethdev.c index 5290588b17..472538181e 100644 --- a/drivers/net/iavf/iavf_ethdev.c +++ b/drivers/net/iavf/iavf_ethdev.c @@ -2448,7 +2448,6 @@ static int iavf_drv_i40evf_selected(struct rte_devargs *devargs, uint16_t device_id) { struct rte_kvargs *kvlist; - const char *key =
[dpdk-dev] [PATCH v6 2/2] bus/auxiliary: introduce auxiliary bus
Auxiliary bus [1] provides a way to split function into child-devices representing sub-domains of functionality. Each auxiliary device represents a part of its parent functionality. Auxiliary device is identified by unique device name, sysfs path: /sys/bus/auxiliary/devices/ Devargs legacy syntax ofauxiliary device: -a auxiliary:[,args...] Devargs generic syntax of auxiliary device: -a bus=auxiliary,name=,,/class=,,/driver=,, [1] kernel auxiliary bus document: https://www.kernel.org/doc/html/latest/driver-api/auxiliary_bus.html Signed-off-by: Xueming Li Cc: Wang Haiyue Cc: Thomas Monjalon Cc: Kinsella Ray --- MAINTAINERS | 5 + doc/guides/rel_notes/release_21_08.rst| 6 + drivers/bus/auxiliary/auxiliary_common.c | 411 ++ drivers/bus/auxiliary/auxiliary_params.c | 59 drivers/bus/auxiliary/linux/auxiliary.c | 141 drivers/bus/auxiliary/meson.build | 16 + drivers/bus/auxiliary/private.h | 74 drivers/bus/auxiliary/rte_bus_auxiliary.h | 201 +++ drivers/bus/auxiliary/version.map | 7 + drivers/bus/meson.build | 1 + 10 files changed, 921 insertions(+) create mode 100644 drivers/bus/auxiliary/auxiliary_common.c create mode 100644 drivers/bus/auxiliary/auxiliary_params.c create mode 100644 drivers/bus/auxiliary/linux/auxiliary.c create mode 100644 drivers/bus/auxiliary/meson.build create mode 100644 drivers/bus/auxiliary/private.h create mode 100644 drivers/bus/auxiliary/rte_bus_auxiliary.h create mode 100644 drivers/bus/auxiliary/version.map diff --git a/MAINTAINERS b/MAINTAINERS index 5877a16971..eaf691ca6a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -525,6 +525,11 @@ F: doc/guides/mempool/octeontx2.rst Bus Drivers --- +Auxiliary bus driver +M: Parav Pandit +M: Xueming Li +F: drivers/bus/auxiliary/ + Intel FPGA bus M: Rosen Xu F: drivers/bus/ifpga/ diff --git a/doc/guides/rel_notes/release_21_08.rst b/doc/guides/rel_notes/release_21_08.rst index a6ecfdf3ce..e7ef4c8a05 100644 --- a/doc/guides/rel_notes/release_21_08.rst +++ b/doc/guides/rel_notes/release_21_08.rst @@ -55,6 +55,12 @@ New Features Also, make sure to start the actual text at the margin. === +* **Added auxiliary bus support.** + + Auxiliary bus provides a way to split function into child-devices + representing sub-domains of functionality. Each auxiliary device + represents a part of its parent functionality. + Removed Items - diff --git a/drivers/bus/auxiliary/auxiliary_common.c b/drivers/bus/auxiliary/auxiliary_common.c new file mode 100644 index 00..8a75306da5 --- /dev/null +++ b/drivers/bus/auxiliary/auxiliary_common.c @@ -0,0 +1,411 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright (c) 2021 NVIDIA Corporation & Affiliates + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "private.h" +#include "rte_bus_auxiliary.h" + +static struct rte_devargs * +auxiliary_devargs_lookup(const char *name) +{ + struct rte_devargs *devargs; + + RTE_EAL_DEVARGS_FOREACH(RTE_BUS_AUXILIARY_NAME, devargs) { + if (strcmp(devargs->name, name) == 0) + return devargs; + } + return NULL; +} + +/* + * Test whether the auxiliary device exist + * + * Stub for OS not supporting auxiliary bus. + */ +__rte_weak bool +auxiliary_dev_exists(const char *name) +{ + RTE_SET_USED(name); + return false; +} + +/* + * Scan the devices in the auxiliary bus. + * + * Stub for OS not supporting auxiliary bus. + */ +__rte_weak int +auxiliary_scan(void) +{ + return 0; +} + +/* + * Update a device's devargs being scanned. 
+ * + * @param aux_dev + * AUXILIARY device. + */ +void +auxiliary_on_scan(struct rte_auxiliary_device *aux_dev) +{ + aux_dev->device.devargs = auxiliary_devargs_lookup(aux_dev->name); +} + +/* + * Match the auxiliary driver and device using driver function. + */ +bool +auxiliary_match(const struct rte_auxiliary_driver *aux_drv, + const struct rte_auxiliary_device *aux_dev) +{ + if (aux_drv->match == NULL) + return false; + return aux_drv->match(aux_dev->name); +} + +/* + * Call the probe() function of the driver. + */ +static int +rte_auxiliary_probe_one_driver(struct rte_auxiliary_driver *drv, + struct rte_auxiliary_device *dev) +{ + enum rte_iova_mode iova_mode; + int ret; + + if ((drv == NULL) || (dev == NULL)) + return -EINVAL; + + /* Check if driver supports it. */ + if (!auxiliary_match(drv, dev)) + /* Match of device and driver failed */ + return 1; + + /* No initialization when marked as bl
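To round out the picture, a hedged sketch of how a PMD would attach to this bus, based on the match/probe/remove callbacks and registration macro declared in rte_bus_auxiliary.h of this series (the "demo" names are purely illustrative):

#include <errno.h>
#include <stdbool.h>
#include <string.h>
#include <rte_common.h>
#include <rte_bus_auxiliary.h>

/* Sketch only: claim auxiliary devices whose name starts with "demo.". */
static bool
demo_match(const char *name)
{
	return strncmp(name, "demo.", 5) == 0;
}

static int
demo_probe(struct rte_auxiliary_driver *drv __rte_unused,
		struct rte_auxiliary_device *dev)
{
	/* dev->name is the entry under /sys/bus/auxiliary/devices. */
	return dev == NULL ? -EINVAL : 0;
}

static int
demo_remove(struct rte_auxiliary_device *dev __rte_unused)
{
	return 0;
}

static struct rte_auxiliary_driver demo_drv = {
	.match = demo_match,
	.probe = demo_probe,
	.remove = demo_remove,
};

RTE_PMD_REGISTER_AUXILIARY(demo, demo_drv);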
Re: [dpdk-dev] [PATCH v1 4/7] power: remove thread safety from PMD power API's
On 23-Jun-21 10:52 AM, Ananyev, Konstantin wrote: On 22-Jun-21 10:13 AM, Ananyev, Konstantin wrote: Currently, we expect that only one callback can be active at any given moment, for a particular queue configuration, which is relatively easy to implement in a thread-safe way. However, we're about to add support for multiple queues per lcore, which will greatly increase the possibility of various race conditions. We could have used something like an RCU for this use case, but absent of a pressing need for thread safety we'll go the easy way and just mandate that the API's are to be called when all affected ports are stopped, and document this limitation. This greatly simplifies the `rte_power_monitor`-related code. I think you need to update RN too with that. Yep, will fix. Another thing - do you really need the whole port stopped? From what I understand - you work on queues, so it is enough for you that related RX queue is stopped. So, to make things a bit more robust, in pmgmt_queue_enable/disable you can call rte_eth_rx_queue_info_get() and check queue state. We work on queues, but the data is per-lcore not per-queue, and it is potentially used by multiple queues, so checking one specific queue is not going to be enough. We could check all queues that were registered so far with the power library, maybe that'll work better? Yep, that's what I mean: on queue_enable() check is that queue stopped or not. If not, return -EBUSY/EAGAIN or so/ Sorry if I wasn't clear at first time. I think it's still better that all queues are stopped, rather than trying to work around the inherently racy implementation. So while i'll add the queue stopped checks, i'll still remove all of the thread safety stuff from here. -- Thanks, Anatoly
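The queue-state check being discussed maps to a small helper on top of standard ethdev calls; a sketch of what the enable path could verify (the queue_state field was added to rte_eth_rxq_info in DPDK 21.05):

#include <rte_ethdev.h>

/* Sketch only: 1 if the Rx queue is stopped, 0 if running, -1 on
 * query failure -- the power library would refuse to (un)register
 * callbacks unless every affected queue reports STOPPED. */
static int
rxq_is_stopped(uint16_t port_id, uint16_t queue_id)
{
	struct rte_eth_rxq_info qinfo;

	if (rte_eth_rx_queue_info_get(port_id, queue_id, &qinfo) != 0)
		return -1;

	return qinfo.queue_state == RTE_ETH_QUEUE_STATE_STOPPED;
}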
Re: [dpdk-dev] [PATCH v5 2/2] bus/auxiliary: introduce auxiliary bus
25/06/2021 05:26, Xueming(Steven) Li: > From: Thomas Monjalon > > 23/06/2021 02:03, Xueming Li: > > > +static int > > > +auxiliary_parse(const char *name, void *addr) { > > > + struct rte_auxiliary_driver *drv = NULL; > > > + const char **out = addr; > > > + > > > + /* Allow dummy name to prevent bus scan. */ > > > + if (strlen(name) == 0) > > > + return 0; > > > > Which syntax is it? > > Allow empty device name "auxiliary:" to bypass the entire auxiliary bus scan. Ah OK. A suggestion for the comment: /* Skip auxiliary bus scan if name is empty. */
Re: [dpdk-dev] [RFC PATCH] ethdev: clarify flow action PORT ID semantics
On 6/2/2021 1:46 PM, Ilya Maximets wrote: > On 6/1/21 4:28 PM, Ivan Malov wrote: >> Hi Ilya, >> >> Thank you for reviewing the proposal at such short notice. I'm afraid that >> prior discussions overlook the simple fact that the whole problem is not >> limited to just VF representors. Action PORT_ID is also used with respect to >> the admin PF's ethdev, which "represents itself" (and by no means does it >> represent the underlying physical/network port). In this case, one cannot >> state that the application treats it as a physical port, just like one >> states that the application perceives representors as VFs themselves.
> > I don't think that it was overlooked. If a device is in switchdev mode then > there is a PF representor and VF representors. An application typically works > only with representors in this case, so it doesn't make much sense to have a > representor and the upstream port attached to the same application at the > same time. Configuration that is applied by the application to the representor > (PF or VF, it doesn't matter) applies to the corresponding upstream port > (actual PF or VF) by default.
> > Exactly the same thing here with the PORT_ID action. You have a packet and an action > to send it to the port, but it's not specified if HW needs to send it to > the representor or the upstream port (again, VF or PF, it doesn't matter). > Since there is no extra information, HW should send it to the upstream > port by default. The same as configuration applies by default to the > upstream port.
> > Let's look at some workflow examples:
>
>           DPDK Application
>              |        |
>              |        |
>     +--PF-rep------VF-rep---+
>     |                       |
>     |    NIC (switchdev)    |
>     |                       |
>     +---PF-----------VF-----+
>         |            |
>         |            |
>     External      VM or whatever
>     Network
>
> a. Workflow for "DPDK Application" to set MAC to VF: > > 1. "DPDK Application" calls rte_set_etheraddr("VF-rep", new_mac); > 2. DPDK sets MAC for "VF".
> b. Workflow for "DPDK Application" to set MAC to PF: > > 1. "DPDK Application" calls rte_set_etheraddr("PF-rep", new_mac); > 2. DPDK sets MAC for "PF".
> c. Workflow for "DPDK Application" to send packet to the external network: > > 1. "DPDK Application" calls rte_eth_tx_burst("PF-rep", packet); > 2. NIC receives the packet from "PF-rep" and sends it to "PF". > 3. The packet egresses to the external network from "PF".
> d. Workflow for "DPDK Application" to send packet to the "VM or whatever": > > 1. "DPDK Application" calls rte_eth_tx_burst("VF-rep", packet); > 2. NIC receives the packet from "VF-rep" and sends it to "VF". > 3. "VM or whatever" receives the packet from "VF".
> > In the two workflows above there is no rte_flow processing in step 2, i.e., > NIC does not perform any lookups/matches/actions, because it's not possible > to configure actions for packets received from "PF-rep" or > "VF-rep" as these ports don't own a port id, and all the configuration > and rte_flow actions are translated and applied to the devices that these > ports represent ("PF" and "VF") and not to the representors themselves ("PF-rep" > or "VF-rep").
> > e. Workflow for the packet received on PF and PORT_ID action: > > 1. "DPDK Application" configures rte_flow for all packets from "PF-rep" >to execute PORT_ID "VF-rep". > 2. NIC receives packet on "PF". > 3. NIC executes 'PORT_ID "VF-rep"' action by sending packet to "VF". > 4. "VM or whatever" receives the packet from "VF".
> > f. Workflow for the packet received on VF and PORT_ID action: > > 1. "DPDK Application" configures rte_flow for all packets from "VF-rep" >to execute 'PORT_ID "PF-rep"'. > 2. NIC receives packet on "VF". > 3. 
NIC executes 'PORT_ID "PF-rep"' action by sending packet to "PF". > 4. Packet egresses from the "PF" to the external network. > > Above is what, IMHO, the logic should look like and this matches with > the overall switchdev design in kernel. > Hi Ilya, Thanks for clearly explaining the use case, this was useful (at least for me). But I am still not clear on the other use case, where the PORT_ID action for 'VF-rep' sends packets to 'VF-rep' itself (instead of to the VF). I remember Ilya mentioned both 'VF-rep' & 'VF' can be attached to an application for debug purposes, but no real-life usage was mentioned, unless I missed it. And if the representor datapath works independently, instead of being a pipe/wire to the represented port, won't it be a virtual partition of the port, instead of a representor of the port? > I understand that this logic could seem flipped-over from the HW point > of view, but it's perfectly logical from the user's perspective, because > user should not care if the application works with representors or > some real devices. If application configures that all packets from port > A should be sent to port B, user will expect that these packets will > egress from port B once received from port A. That will be highly inco
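To make workflow (e) above concrete, a hedged sketch of the rte_flow rule such an application could create; the port numbers, the transfer attribute, and the helper name are illustrative assumptions, not taken from this thread:

#include <rte_flow.h>

/* Steer all traffic seen on pf_rep_port to the port identified by
 * vf_rep_port via the PORT_ID action (workflow (e) above). */
static struct rte_flow *
steer_all_to_port(uint16_t pf_rep_port, uint16_t vf_rep_port,
		  struct rte_flow_error *err)
{
	struct rte_flow_attr attr = { .ingress = 1, .transfer = 1 };
	struct rte_flow_item pattern[] = {
		{ .type = RTE_FLOW_ITEM_TYPE_ETH }, /* match all Ethernet frames */
		{ .type = RTE_FLOW_ITEM_TYPE_END },
	};
	struct rte_flow_action_port_id dst = { .id = vf_rep_port };
	struct rte_flow_action actions[] = {
		{ .type = RTE_FLOW_ACTION_TYPE_PORT_ID, .conf = &dst },
		{ .type = RTE_FLOW_ACTION_TYPE_END },
	};

	return rte_flow_create(pf_rep_port, &attr, pattern, actions, err);
}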
Re: [dpdk-dev] Experimental symbols in kni lib
Hi Ferruh, all, Let's please discuss another approach to setting KNI link status before making this API stable: http://patches.dpdk.org/project/dpdk/patch/20190925093623.18419-1-iryz...@nfware.com/ I explained the problem with the current implementation there. More than that, using the ioctl approach makes it possible to also set speed and duplex and use them to implement the get_link_ksettings callback. I can send patches for both features. Igor On Thu, Jun 24, 2021 at 4:54 PM Kinsella, Ray wrote: > Sounds more than reasonable, +1 from me. > > Ray K > > On 24/06/2021 14:24, Ferruh Yigit wrote: > > On 6/24/2021 11:42 AM, Kinsella, Ray wrote: > >> Hi Ferruh, > >> > >> The following kni experimental symbols are present in both the v21.05 and v19.11 releases. These symbols should be considered for promotion to stable as part of the v22 ABI in DPDK 21.11, as they have been experimental for >= 2yrs at this point. > >> > >> * rte_kni_update_link > >> > >> Ray K > > > > Hi Ray, > > > > Thanks for the follow-up. > > > > I just checked the API and am planning a small behavior update to it. > > If the update is accepted, I suggest keeping the API experimental for 21.08 too, > > but it can mature in v21.11. > > > > Thanks, > > ferruh > >
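For context, a minimal usage sketch of the experimental symbol under discussion, assuming a KNI handle obtained earlier from rte_kni_alloc(); the helper name and error handling are illustrative:

#include <rte_ethdev.h>
#include <rte_kni.h>

/* Propagate the current ethdev link state to the kernel KNI interface. */
static void
kni_report_link(struct rte_kni *kni, uint16_t port_id)
{
	struct rte_eth_link link;

	if (rte_eth_link_get_nowait(port_id, &link) == 0)
		rte_kni_update_link(kni, link.link_status == ETH_LINK_UP);
}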
[dpdk-dev] [PATCH v2 0/7] Enhancements for PMD power management
This patchset introduces several changes related to PMD power management: - Changed monitoring intrinsics to use callbacks as a comparison function, based on previous patchset [1] but incorporating feedback [2] - this hopefully will make it possible to add support for .get_monitor_addr in virtio - Add a new intrinsic to monitor multiple addresses, based on RTM instruction set and the TPAUSE instruction - Add support for PMD power management on multiple queues, as well as all accompanying infrastructure and example apps changes v2: - Changed check inversion to callbacks - Addressed feedback from Konstantin - Added doc updates where necessary [1] http://patches.dpdk.org/project/dpdk/list/?series=16930&state=* [2] http://patches.dpdk.org/project/dpdk/patch/819ef1ace187365a615d3383e54579e3d9fb216e.1620747068.git.anatoly.bura...@intel.com/#133274 Anatoly Burakov (7): power_intrinsics: use callbacks for comparison net/af_xdp: add power monitor support eal: add power monitor for multiple events power: remove thread safety from PMD power API's power: support callbacks for multiple Rx queues power: support monitoring multiple Rx queues l3fwd-power: support multiqueue in PMD pmgmt modes doc/guides/prog_guide/power_man.rst | 83 ++- doc/guides/rel_notes/release_21_08.rst| 11 + drivers/event/dlb2/dlb2.c | 16 +- drivers/net/af_xdp/rte_eth_af_xdp.c | 33 + drivers/net/i40e/i40e_rxtx.c | 19 +- drivers/net/iavf/iavf_rxtx.c | 19 +- drivers/net/ice/ice_rxtx.c| 19 +- drivers/net/ixgbe/ixgbe_rxtx.c| 19 +- drivers/net/mlx5/mlx5_rx.c| 16 +- examples/l3fwd-power/main.c | 39 +- lib/eal/arm/rte_power_intrinsics.c| 11 + lib/eal/include/generic/rte_cpuflags.h| 2 + .../include/generic/rte_power_intrinsics.h| 64 +- lib/eal/ppc/rte_power_intrinsics.c| 11 + lib/eal/version.map | 3 + lib/eal/x86/rte_cpuflags.c| 2 + lib/eal/x86/rte_power_intrinsics.c| 78 ++- lib/power/meson.build | 3 + lib/power/rte_power_pmd_mgmt.c| 574 +- lib/power/rte_power_pmd_mgmt.h| 40 ++ lib/power/version.map | 3 + 21 files changed, 841 insertions(+), 224 deletions(-) -- 2.25.1
[dpdk-dev] [PATCH v2 1/7] power_intrinsics: use callbacks for comparison
Previously, the semantics of power monitor were such that we were checking current value against the expected value, and if they matched, then the sleep was aborted. This is somewhat inflexible, because it only allowed us to check for a specific value. This commit replaces the comparison with a user callback mechanism, so that any PMD (or other code) using `rte_power_monitor()` can define their own comparison semantics and decision making on how to detect the need to abort the entering of power optimized state. Existing implementations are adjusted to follow the new semantics. Suggested-by: Konstantin Ananyev Signed-off-by: Anatoly Burakov --- Notes: v2: - Use callback mechanism for more flexibility - Address feedback from Konstantin doc/guides/rel_notes/release_21_08.rst| 1 + drivers/event/dlb2/dlb2.c | 16 -- drivers/net/i40e/i40e_rxtx.c | 19 drivers/net/iavf/iavf_rxtx.c | 19 drivers/net/ice/ice_rxtx.c| 19 drivers/net/ixgbe/ixgbe_rxtx.c| 19 drivers/net/mlx5/mlx5_rx.c| 16 -- .../include/generic/rte_power_intrinsics.h| 29 ++- lib/eal/x86/rte_power_intrinsics.c| 9 ++ 9 files changed, 106 insertions(+), 41 deletions(-) diff --git a/doc/guides/rel_notes/release_21_08.rst b/doc/guides/rel_notes/release_21_08.rst index a6ecfdf3ce..c84ac280f5 100644 --- a/doc/guides/rel_notes/release_21_08.rst +++ b/doc/guides/rel_notes/release_21_08.rst @@ -84,6 +84,7 @@ API Changes Also, make sure to start the actual text at the margin. === +* eal: the ``rte_power_intrinsics`` API changed to use a callback mechanism. ABI Changes --- diff --git a/drivers/event/dlb2/dlb2.c b/drivers/event/dlb2/dlb2.c index eca183753f..14dfac257c 100644 --- a/drivers/event/dlb2/dlb2.c +++ b/drivers/event/dlb2/dlb2.c @@ -3154,6 +3154,15 @@ dlb2_port_credits_inc(struct dlb2_port *qm_port, int num) } } +#define CLB_MASK_IDX 0 +#define CLB_VAL_IDX 1 +static int +dlb2_monitor_callback(const uint64_t val, const uint64_t opaque[4]) +{ + /* abort if the value matches */ + return (val & opaque[CLB_MASK_IDX]) == opaque[CLB_VAL_IDX] ? -1 : 0; +} + static inline int dlb2_dequeue_wait(struct dlb2_eventdev *dlb2, struct dlb2_eventdev_port *ev_port, @@ -3194,8 +3203,11 @@ dlb2_dequeue_wait(struct dlb2_eventdev *dlb2, expected_value = 0; pmc.addr = monitor_addr; - pmc.val = expected_value; - pmc.mask = qe_mask.raw_qe[1]; + /* store expected value and comparison mask in opaque data */ + pmc.opaque[CLB_VAL_IDX] = expected_value; + pmc.opaque[CLB_MASK_IDX] = qe_mask.raw_qe[1]; + /* set up callback */ + pmc.fn = dlb2_monitor_callback; pmc.size = sizeof(uint64_t); rte_power_monitor(&pmc, timeout + start_ticks); diff --git a/drivers/net/i40e/i40e_rxtx.c b/drivers/net/i40e/i40e_rxtx.c index 6c58decece..45f3fbf4ec 100644 --- a/drivers/net/i40e/i40e_rxtx.c +++ b/drivers/net/i40e/i40e_rxtx.c @@ -81,6 +81,17 @@ #define I40E_TX_OFFLOAD_SIMPLE_NOTSUP_MASK \ (PKT_TX_OFFLOAD_MASK ^ I40E_TX_OFFLOAD_SIMPLE_SUP_MASK) +static int +i40e_monitor_callback(const uint64_t value, const uint64_t arg[4] __rte_unused) +{ + const uint64_t m = rte_cpu_to_le_64(1 << I40E_RX_DESC_STATUS_DD_SHIFT); + /* +* we expect the DD bit to be set to 1 if this descriptor was already +* written to. +*/ + return (value & m) == m ? -1 : 0; +} + int i40e_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond *pmc) { @@ -93,12 +104,8 @@ i40e_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond *pmc) /* watch for changes in status bit */ pmc->addr = &rxdp->wb.qword1.status_error_len; - /* -* we expect the DD bit to be set to 1 if this descriptor was already -* written to. 
-*/ - pmc->val = rte_cpu_to_le_64(1 << I40E_RX_DESC_STATUS_DD_SHIFT); - pmc->mask = rte_cpu_to_le_64(1 << I40E_RX_DESC_STATUS_DD_SHIFT); + /* comparison callback */ + pmc->fn = i40e_monitor_callback; /* registers are 64-bit */ pmc->size = sizeof(uint64_t); diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c index 0361af0d85..6e12ecce07 100644 --- a/drivers/net/iavf/iavf_rxtx.c +++ b/drivers/net/iavf/iavf_rxtx.c @@ -57,6 +57,17 @@ iavf_proto_xtr_type_to_rxdid(uint8_t flex_type) rxdid_map[flex_type] : IAVF_RXDID_COMMS_OVS_1; } +static int +iavf_monitor_callback(const uint64_t value, const uint64_t arg[4] __rte_unused) +{ + const uint64_t m = rte_cpu_to_le_64(1 << IAVF_RX_DESC_STAT
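The same callback shape repeats across the drivers touched by this patch. As a stand-alone illustration (names are hypothetical; the logic follows the dlb2/i40e hunks above), a callback that aborts the sleep once a monitored 32-bit counter moves away from a snapshot stored in the opaque data:

#include <stdint.h>

#define SNAPSHOT_IDX 0

/* Return -1 (abort sleep) when the live value no longer matches the
 * snapshot taken before arming the monitor; 0 keeps sleeping. */
static int
counter_moved_callback(const uint64_t value, const uint64_t opaque[4])
{
	return (value & UINT32_MAX) == opaque[SNAPSHOT_IDX] ? 0 : -1;
}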
[dpdk-dev] [PATCH v2 2/7] net/af_xdp: add power monitor support
Implement support for .get_monitor_addr in AF_XDP driver. Signed-off-by: Anatoly Burakov --- Notes: v2: - Rewrite using the callback mechanism drivers/net/af_xdp/rte_eth_af_xdp.c | 33 + 1 file changed, 33 insertions(+) diff --git a/drivers/net/af_xdp/rte_eth_af_xdp.c b/drivers/net/af_xdp/rte_eth_af_xdp.c index eb5660a3dc..8b9c89c3e8 100644 --- a/drivers/net/af_xdp/rte_eth_af_xdp.c +++ b/drivers/net/af_xdp/rte_eth_af_xdp.c @@ -37,6 +37,7 @@ #include #include #include +#include #include "compat.h" @@ -788,6 +789,37 @@ eth_dev_configure(struct rte_eth_dev *dev) return 0; } +#define CLB_VAL_IDX 0 +static int +eth_monitor_callback(const uint64_t value, const uint64_t opaque[4]) +{ + const uint64_t v = opaque[CLB_VAL_IDX]; + const uint64_t m = (uint32_t)~0; + + /* if the value has changed, abort entering power optimized state */ + return (value & m) == v ? 0 : -1; +} + +static int +eth_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond *pmc) +{ + struct pkt_rx_queue *rxq = rx_queue; + unsigned int *prod = rxq->rx.producer; + const uint32_t cur_val = rxq->rx.cached_prod; /* use cached value */ + + /* watch for changes in producer ring */ + pmc->addr = (void*)prod; + + /* store current value */ + pmc->opaque[CLB_VAL_IDX] = cur_val; + pmc->fn = eth_monitor_callback; + + /* AF_XDP producer ring index is 32-bit */ + pmc->size = sizeof(uint32_t); + + return 0; +} + static int eth_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info) { @@ -1448,6 +1480,7 @@ static const struct eth_dev_ops ops = { .link_update = eth_link_update, .stats_get = eth_stats_get, .stats_reset = eth_stats_reset, + .get_monitor_addr = eth_get_monitor_addr }; /** parse busy_budget argument */ -- 2.25.1
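On the consuming side, a hedged sketch of how a poll loop could use the condition that eth_get_monitor_addr() fills in; this mirrors what the power library does with it, and the helper name is illustrative:

#include <stdint.h>
#include <rte_ethdev.h>
#include <rte_power_intrinsics.h>

/* Sleep until the PMD-provided monitor condition fires (no TSC deadline). */
static void
sleep_until_traffic(uint16_t port_id, uint16_t queue_id)
{
	struct rte_power_monitor_cond pmc;

	if (rte_eth_get_monitor_addr(port_id, queue_id, &pmc) == 0)
		rte_power_monitor(&pmc, UINT64_MAX);
}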
[dpdk-dev] [PATCH v2 3/7] eal: add power monitor for multiple events
Use RTM and WAITPKG instructions to perform a wait-for-writes similar to what UMWAIT does, but without the limitation of having to listen for just one event. This works because the optimized power state used by the TPAUSE instruction will cause a wake up on RTM transaction abort, so if we add the addresses we're interested in to the read-set, any write to those addresses will wake us up. Signed-off-by: Konstantin Ananyev Signed-off-by: Anatoly Burakov --- Notes: v2: - Adapt to callback mechanism doc/guides/rel_notes/release_21_08.rst| 2 + lib/eal/arm/rte_power_intrinsics.c| 11 +++ lib/eal/include/generic/rte_cpuflags.h| 2 + .../include/generic/rte_power_intrinsics.h| 35 ++ lib/eal/ppc/rte_power_intrinsics.c| 11 +++ lib/eal/version.map | 3 + lib/eal/x86/rte_cpuflags.c| 2 + lib/eal/x86/rte_power_intrinsics.c| 69 +++ 8 files changed, 135 insertions(+) diff --git a/doc/guides/rel_notes/release_21_08.rst b/doc/guides/rel_notes/release_21_08.rst index c84ac280f5..9d1cfac395 100644 --- a/doc/guides/rel_notes/release_21_08.rst +++ b/doc/guides/rel_notes/release_21_08.rst @@ -55,6 +55,8 @@ New Features Also, make sure to start the actual text at the margin. === +* eal: added ``rte_power_monitor_multi`` to support waiting for multiple events. + Removed Items - diff --git a/lib/eal/arm/rte_power_intrinsics.c b/lib/eal/arm/rte_power_intrinsics.c index e83f04072a..78f55b7203 100644 --- a/lib/eal/arm/rte_power_intrinsics.c +++ b/lib/eal/arm/rte_power_intrinsics.c @@ -38,3 +38,14 @@ rte_power_monitor_wakeup(const unsigned int lcore_id) return -ENOTSUP; } + +int +rte_power_monitor_multi(const struct rte_power_monitor_cond pmc[], + const uint32_t num, const uint64_t tsc_timestamp) +{ + RTE_SET_USED(pmc); + RTE_SET_USED(num); + RTE_SET_USED(tsc_timestamp); + + return -ENOTSUP; +} diff --git a/lib/eal/include/generic/rte_cpuflags.h b/lib/eal/include/generic/rte_cpuflags.h index 28a5aecde8..d35551e931 100644 --- a/lib/eal/include/generic/rte_cpuflags.h +++ b/lib/eal/include/generic/rte_cpuflags.h @@ -24,6 +24,8 @@ struct rte_cpu_intrinsics { /**< indicates support for rte_power_monitor function */ uint32_t power_pause : 1; /**< indicates support for rte_power_pause function */ + uint32_t power_monitor_multi : 1; + /**< indicates support for rte_power_monitor_multi function */ }; /** diff --git a/lib/eal/include/generic/rte_power_intrinsics.h b/lib/eal/include/generic/rte_power_intrinsics.h index 046667ade6..877fb282cb 100644 --- a/lib/eal/include/generic/rte_power_intrinsics.h +++ b/lib/eal/include/generic/rte_power_intrinsics.h @@ -124,4 +124,39 @@ int rte_power_monitor_wakeup(const unsigned int lcore_id); __rte_experimental int rte_power_pause(const uint64_t tsc_timestamp); +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Monitor a set of addresses for changes. This will cause the CPU to enter an + * architecture-defined optimized power state until either one of the specified + * memory addresses is written to, a certain TSC timestamp is reached, or other + * reasons cause the CPU to wake up. + * + * Additionally, `expected` 64-bit values and 64-bit masks are provided. If + * mask is non-zero, the current value pointed to by the `p` pointer will be + * checked against the expected value, and if they do not match, the entering of + * optimized power state may be aborted. + * + * @warning It is responsibility of the user to check if this function is + * supported at runtime using `rte_cpu_get_intrinsics_support()` API call. 
+ * Failing to do so may result in an illegal CPU instruction error. + * + * @param pmc + * An array of monitoring condition structures. + * @param num + * Length of the `pmc` array. + * @param tsc_timestamp + * Maximum TSC timestamp to wait for. Note that the wait behavior is + * architecture-dependent. + * + * @return + * 0 on success + * -EINVAL on invalid parameters + * -ENOTSUP if unsupported + */ +__rte_experimental +int rte_power_monitor_multi(const struct rte_power_monitor_cond pmc[], + const uint32_t num, const uint64_t tsc_timestamp); + #endif /* _RTE_POWER_INTRINSIC_H_ */ diff --git a/lib/eal/ppc/rte_power_intrinsics.c b/lib/eal/ppc/rte_power_intrinsics.c index 7fc9586da7..f00b58ade5 100644 --- a/lib/eal/ppc/rte_power_intrinsics.c +++ b/lib/eal/ppc/rte_power_intrinsics.c @@ -38,3 +38,14 @@ rte_power_monitor_wakeup(const unsigned int lcore_id) return -ENOTSUP; } + +int +rte_power_monitor_multi(const struct rte_power_monitor_cond pmc[], + const uint32_t num, const uint64_t tsc_timestamp) +{ + RTE_SET_USED(pmc); + RTE_SET_USED(num); + RTE_SET_USED(tsc_times
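A hedged usage sketch of the new call, assuming the pmc array was filled beforehand (e.g. with rte_eth_get_monitor_addr() per queue) and that support was verified via rte_cpu_get_intrinsics_support(); the fallback policy is illustrative:

#include <errno.h>
#include <stdint.h>
#include <rte_pause.h>
#include <rte_power_intrinsics.h>

/* Wait on all queues' conditions at once; fall back to a plain pause
 * when multi-monitor is not available. */
static void
sleep_on_all(struct rte_power_monitor_cond *pmc, uint32_t n_queues)
{
	if (rte_power_monitor_multi(pmc, n_queues, UINT64_MAX) == -ENOTSUP)
		rte_pause();
}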
[dpdk-dev] [PATCH v2 4/7] power: remove thread safety from PMD power API's
Currently, we expect that only one callback can be active at any given moment, for a particular queue configuration, which is relatively easy to implement in a thread-safe way. However, we're about to add support for multiple queues per lcore, which will greatly increase the possibility of various race conditions. We could have used something like an RCU for this use case, but absent of a pressing need for thread safety we'll go the easy way and just mandate that the API's are to be called when all affected ports are stopped, and document this limitation. This greatly simplifies the `rte_power_monitor`-related code. Signed-off-by: Anatoly Burakov --- Notes: v2: - Add check for stopped queue - Clarified doc message - Added release notes doc/guides/rel_notes/release_21_08.rst | 5 + lib/power/meson.build | 3 + lib/power/rte_power_pmd_mgmt.c | 133 ++--- lib/power/rte_power_pmd_mgmt.h | 6 ++ 4 files changed, 67 insertions(+), 80 deletions(-) diff --git a/doc/guides/rel_notes/release_21_08.rst b/doc/guides/rel_notes/release_21_08.rst index 9d1cfac395..f015c509fc 100644 --- a/doc/guides/rel_notes/release_21_08.rst +++ b/doc/guides/rel_notes/release_21_08.rst @@ -88,6 +88,11 @@ API Changes * eal: the ``rte_power_intrinsics`` API changed to use a callback mechanism. +* rte_power: The experimental PMD power management API is no longer considered + to be thread safe; all Rx queues affected by the API will now need to be + stopped before making any changes to the power management scheme. + + ABI Changes --- diff --git a/lib/power/meson.build b/lib/power/meson.build index c1097d32f1..4f6a242364 100644 --- a/lib/power/meson.build +++ b/lib/power/meson.build @@ -21,4 +21,7 @@ headers = files( 'rte_power_pmd_mgmt.h', 'rte_power_guest_channel.h', ) +if cc.has_argument('-Wno-cast-qual') +cflags += '-Wno-cast-qual' +endif deps += ['timer', 'ethdev'] diff --git a/lib/power/rte_power_pmd_mgmt.c b/lib/power/rte_power_pmd_mgmt.c index db03cbf420..9b95cf1794 100644 --- a/lib/power/rte_power_pmd_mgmt.c +++ b/lib/power/rte_power_pmd_mgmt.c @@ -40,8 +40,6 @@ struct pmd_queue_cfg { /**< Callback mode for this queue */ const struct rte_eth_rxtx_callback *cur_cb; /**< Callback instance */ - volatile bool umwait_in_progress; - /**< are we currently sleeping? */ uint64_t empty_poll_stats; /**< Number of empty polls */ } __rte_cache_aligned; @@ -92,30 +90,11 @@ clb_umwait(uint16_t port_id, uint16_t qidx, struct rte_mbuf **pkts __rte_unused, struct rte_power_monitor_cond pmc; uint16_t ret; - /* -* we might get a cancellation request while being -* inside the callback, in which case the wakeup -* wouldn't work because it would've arrived too early. -* -* to get around this, we notify the other thread that -* we're sleeping, so that it can spin until we're done. -* unsolicited wakeups are perfectly safe. 
-*/ - q_conf->umwait_in_progress = true; - - rte_atomic_thread_fence(__ATOMIC_SEQ_CST); - - /* check if we need to cancel sleep */ - if (q_conf->pwr_mgmt_state == PMD_MGMT_ENABLED) { - /* use monitoring condition to sleep */ - ret = rte_eth_get_monitor_addr(port_id, qidx, - &pmc); - if (ret == 0) - rte_power_monitor(&pmc, UINT64_MAX); - } - q_conf->umwait_in_progress = false; - - rte_atomic_thread_fence(__ATOMIC_SEQ_CST); + /* use monitoring condition to sleep */ + ret = rte_eth_get_monitor_addr(port_id, qidx, + &pmc); + if (ret == 0) + rte_power_monitor(&pmc, UINT64_MAX); } } else q_conf->empty_poll_stats = 0; @@ -177,12 +156,24 @@ clb_scale_freq(uint16_t port_id, uint16_t qidx, return nb_rx; } +static int +queue_stopped(const uint16_t port_id, const uint16_t queue_id) +{ + struct rte_eth_rxq_info qinfo; + + if (rte_eth_rx_queue_info_get(port_id, queue_id, &qinfo) < 0) + return -1; + + return qinfo.queue_state == RTE_ETH_QUEUE_STATE_STOPPED; +} + int rte_power_ethdev_pmgmt_queue_enable(unsigned int lcore_id, uint16_t port_id, uint16_t queue_id, enum rte_power_pmd_mgmt_type mode) { struct pmd_queue_cfg *queue_cfg; struct rte_eth_dev_
[dpdk-dev] [PATCH v2 6/7] power: support monitoring multiple Rx queues
Use the new multi-monitor intrinsic to allow monitoring multiple ethdev Rx queues while entering the energy efficient power state. The multi version will be used unconditionally if supported, and the UMWAIT one will only be used when multi-monitor is not supported by the hardware. Signed-off-by: Anatoly Burakov --- doc/guides/prog_guide/power_man.rst | 9 ++-- lib/power/rte_power_pmd_mgmt.c | 76 - 2 files changed, 80 insertions(+), 5 deletions(-) diff --git a/doc/guides/prog_guide/power_man.rst b/doc/guides/prog_guide/power_man.rst index 38f876466a..defb61bdc4 100644 --- a/doc/guides/prog_guide/power_man.rst +++ b/doc/guides/prog_guide/power_man.rst @@ -221,13 +221,16 @@ power saving whenever empty poll count reaches a certain number. The "monitor" mode is only supported in the following configurations and scenarios: * If ``rte_cpu_get_intrinsics_support()`` function indicates that + ``rte_power_monitor_multi()`` function is supported by the platform, then + monitoring multiple Ethernet Rx queues for traffic will be supported. + +* If ``rte_cpu_get_intrinsics_support()`` function indicates that only ``rte_power_monitor()`` is supported by the platform, then monitoring will be limited to a mapping of 1 core 1 queue (thus, each Rx queue will have to be monitored from a different lcore). -* If ``rte_cpu_get_intrinsics_support()`` function indicates that the - ``rte_power_monitor()`` function is not supported, then monitor mode will not - be supported. +* If ``rte_cpu_get_intrinsics_support()`` function indicates that neither of the + two monitoring functions are supported, then monitor mode will not be supported. * Not all Ethernet devices support monitoring, even if the underlying platform may support the necessary CPU instructions. Support for monitoring is diff --git a/lib/power/rte_power_pmd_mgmt.c b/lib/power/rte_power_pmd_mgmt.c index 7762cd39b8..aab2d4f1ee 100644 --- a/lib/power/rte_power_pmd_mgmt.c +++ b/lib/power/rte_power_pmd_mgmt.c @@ -155,6 +155,24 @@ queue_list_remove(struct pmd_core_cfg *cfg, const union queue *q) return 0; } +static inline int +get_monitor_addresses(struct pmd_core_cfg *cfg, + struct rte_power_monitor_cond *pmc) +{ + const struct queue_list_entry *qle; + size_t i = 0; + int ret; + + TAILQ_FOREACH(qle, &cfg->head, next) { + struct rte_power_monitor_cond *cur = &pmc[i]; + const union queue *q = &qle->queue; + ret = rte_eth_get_monitor_addr(q->portid, q->qid, cur); + if (ret < 0) + return ret; + } + return 0; +} + static void calc_tsc(void) { @@ -183,6 +201,48 @@ calc_tsc(void) } } +static uint16_t +clb_multiwait(uint16_t port_id, uint16_t qidx, + struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx, + uint16_t max_pkts __rte_unused, void *addr __rte_unused) +{ + const unsigned int lcore = rte_lcore_id(); + const union queue q = {.portid = port_id, .qid = qidx}; + const bool empty = nb_rx == 0; + struct pmd_core_cfg *q_conf; + + q_conf = &lcore_cfg[lcore]; + + /* early exit */ + if (likely(!empty)) { + q_conf->empty_poll_stats = 0; + } else { + /* do we care about this particular queue? */ + if (!queue_is_power_save(q_conf, &q)) + return nb_rx; + + /* +* we can increment unconditionally here because if there were +* non-empty polls in other queues assigned to this core, we +* dropped the counter to zero anyway. 
+*/ + q_conf->empty_poll_stats++; + if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX)) { + struct rte_power_monitor_cond pmc[RTE_MAX_ETHPORTS]; + uint16_t ret; + + /* gather all monitoring conditions */ + ret = get_monitor_addresses(q_conf, pmc); + + if (ret == 0) + rte_power_monitor_multi(pmc, + q_conf->n_queues, UINT64_MAX); + } + } + + return nb_rx; +} + static uint16_t clb_umwait(uint16_t port_id, uint16_t qidx, struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx, uint16_t max_pkts __rte_unused, @@ -348,14 +408,19 @@ static int check_monitor(struct pmd_core_cfg *cfg, const union queue *qdata) { struct rte_power_monitor_cond dummy; + bool multimonitor_supported; /* check if rte_power_monitor is supported */ if (!global_data.intrinsics_support.power_monitor) { RTE_LOG(DEBUG, POWER, "Monitoring intrinsics are not supported\n"); return -ENOTSUP; } + /* check if multi-monitor is supported */ + multimonito
[dpdk-dev] [PATCH v2 5/7] power: support callbacks for multiple Rx queues
Currently, there is a hard limitation on the PMD power management support that only allows it to support a single queue per lcore. This is not ideal as most DPDK use cases will poll multiple queues per core. The PMD power management mechanism relies on ethdev Rx callbacks, so it is very difficult to implement such support because callbacks are effectively stateless and have no visibility into what the other ethdev devices are doing. This places limitations on what we can do within the framework of Rx callbacks, but the basics of this implementation are as follows: - Replace per-queue structures with per-lcore ones, so that any device polled from the same lcore can share data - Any queue that is going to be polled from a specific lcore has to be added to the list of cores to poll, so that the callback is aware of other queues being polled by the same lcore - Both the empty poll counter and the actual power saving mechanism is shared between all queues polled on a particular lcore, and is only activated when a special designated "power saving" queue is polled. To put it another way, we have no idea which queue the user will poll in what order, so we rely on them telling us that queue X is the last one in the polling loop, so any power management should happen there. - A new API is added to mark a specific Rx queue as "power saving". Failing to call this API will result in no power management, however when having only one queue per core it is obvious which queue is the "power saving" one, so things will still work without this new API for use cases that were previously working without it. - The limitation on UMWAIT-based polling is not removed because UMWAIT is incapable of monitoring more than one address. Signed-off-by: Anatoly Burakov --- Notes: v2: - Use a TAILQ for queues instead of a static array - Address feedback from Konstantin - Add additional checks for stopped queues doc/guides/prog_guide/power_man.rst| 80 -- doc/guides/rel_notes/release_21_08.rst | 3 + lib/power/rte_power_pmd_mgmt.c | 381 - lib/power/rte_power_pmd_mgmt.h | 34 +++ lib/power/version.map | 3 + 5 files changed, 407 insertions(+), 94 deletions(-) diff --git a/doc/guides/prog_guide/power_man.rst b/doc/guides/prog_guide/power_man.rst index c70ae128ac..38f876466a 100644 --- a/doc/guides/prog_guide/power_man.rst +++ b/doc/guides/prog_guide/power_man.rst @@ -198,34 +198,48 @@ Ethernet PMD Power Management API Abstract -Existing power management mechanisms require developers -to change application design or change code to make use of it. -The PMD power management API provides a convenient alternative -by utilizing Ethernet PMD RX callbacks, -and triggering power saving whenever empty poll count reaches a certain number. - -Monitor - This power saving scheme will put the CPU into optimized power state - and use the ``rte_power_monitor()`` function - to monitor the Ethernet PMD RX descriptor address, - and wake the CPU up whenever there's new traffic. - -Pause - This power saving scheme will avoid busy polling - by either entering power-optimized sleep state - with ``rte_power_pause()`` function, - or, if it's not available, use ``rte_pause()``. - -Frequency scaling - This power saving scheme will use ``librte_power`` library - functionality to scale the core frequency up/down - depending on traffic volume. - -.. note:: - - Currently, this power management API is limited to mandatory mapping - of 1 queue to 1 core (multiple queues are supported, - but they must be polled from different cores). 
+Existing power management mechanisms require developers to change application +design or change code to make use of it. The PMD power management API provides a +convenient alternative by utilizing Ethernet PMD RX callbacks, and triggering +power saving whenever empty poll count reaches a certain number. + +* Monitor + This power saving scheme will put the CPU into optimized power state and + monitor the Ethernet PMD RX descriptor address, waking the CPU up whenever + there's new traffic. Support for this scheme may not be available on all + platforms, and further limitations may apply (see below). + +* Pause + This power saving scheme will avoid busy polling by either entering + power-optimized sleep state with ``rte_power_pause()`` function, or, if it's + not supported by the underlying platform, use ``rte_pause()``. + +* Frequency scaling + This power saving scheme will use ``librte_power`` library functionality to + scale the core frequency up/down depending on traffic volume. + +The "monitor" mode is only supported in the following configurations and scenarios: + +* If ``rte_cpu_get_intrinsics_support()`` function indicates that + ``rte_power_monitor()`` is supported by the platform, then monitoring will be + limited to a mapping of 1 core 1 queue (thus, each Rx
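Putting the APIs proposed in this series together, a sketch of the intended call sequence for several queues polled by one lcore (parameters and helper name are illustrative):

#include <rte_power_pmd_mgmt.h>

/* Enable PMD power management on queues 0..nb_q-1 of one port, then
 * mark the last-polled queue as the designated "power save" queue. */
static int
enable_pmgmt(unsigned int lcore, uint16_t port, uint16_t nb_q)
{
	uint16_t q;
	int ret;

	for (q = 0; q < nb_q; q++) {
		ret = rte_power_ethdev_pmgmt_queue_enable(lcore, port, q,
				RTE_POWER_MGMT_TYPE_MONITOR);
		if (ret < 0)
			return ret;
	}
	return rte_power_ethdev_pmgmt_queue_set_power_save(lcore, port,
			nb_q - 1);
}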
[dpdk-dev] [PATCH v2 7/7] l3fwd-power: support multiqueue in PMD pmgmt modes
Currently, l3fwd-power enforces the limitation of having one queue per lcore. This is no longer necessary, so remove the limitation, and always mark the last queue in qconf as the power save queue. Signed-off-by: Anatoly Burakov --- examples/l3fwd-power/main.c | 39 +++-- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/examples/l3fwd-power/main.c b/examples/l3fwd-power/main.c index f8dfed1634..3057c06936 100644 --- a/examples/l3fwd-power/main.c +++ b/examples/l3fwd-power/main.c @@ -2498,6 +2498,27 @@ mode_to_str(enum appmode mode) } } +static void +pmd_pmgmt_set_up(unsigned int lcore, uint16_t portid, uint16_t qid, bool last) +{ + int ret; + + ret = rte_power_ethdev_pmgmt_queue_enable(lcore, portid, + qid, pmgmt_type); + if (ret < 0) + rte_exit(EXIT_FAILURE, + "rte_power_ethdev_pmgmt_queue_enable: err=%d, port=%d\n", + ret, portid); + + if (!last) + return; + ret = rte_power_ethdev_pmgmt_queue_set_power_save(lcore, portid, qid); + if (ret < 0) + rte_exit(EXIT_FAILURE, + "rte_power_ethdev_pmgmt_queue_set_power_save: err=%d, port=%d\n", + ret, portid); +} + int main(int argc, char **argv) { @@ -2723,12 +2744,6 @@ main(int argc, char **argv) printf("\nInitializing rx queues on lcore %u ... ", lcore_id ); fflush(stdout); - /* PMD power management mode can only do 1 queue per core */ - if (app_mode == APP_MODE_PMD_MGMT && qconf->n_rx_queue > 1) { - rte_exit(EXIT_FAILURE, - "In PMD power management mode, only one queue per lcore is allowed\n"); - } - /* init RX queues */ for(queue = 0; queue < qconf->n_rx_queue; ++queue) { struct rte_eth_rxconf rxq_conf; @@ -2767,15 +2782,9 @@ main(int argc, char **argv) "Fail to add ptype cb\n"); } - if (app_mode == APP_MODE_PMD_MGMT) { - ret = rte_power_ethdev_pmgmt_queue_enable( - lcore_id, portid, queueid, - pmgmt_type); - if (ret < 0) - rte_exit(EXIT_FAILURE, - "rte_power_ethdev_pmgmt_queue_enable: err=%d, port=%d\n", - ret, portid); - } + if (app_mode == APP_MODE_PMD_MGMT) + pmd_pmgmt_set_up(lcore_id, portid, queueid, + queue == (qconf->n_rx_queue - 1)); } } -- 2.25.1
Re: [dpdk-dev] [PATCH v4 00/62] Marvell CNXK Ethdev Driver
On Wed, Jun 23, 2021 at 10:17 AM Nithin Dabilpuram wrote: > > This patchset adds support for Marvell CN106XX SoC based on 'common/cnxk' > driver. In future, CN9K a.k.a octeontx2 will also be supported by same > driver when code is ready and 'net/octeontx2' will be deprecated. Series applied to dpdk-next-net-mrvl/for-next-net. Thanks. Change the state to "Awaiting Upstream" for the main tree. > > Harman Kalra (1): > common/cnxk: allocate lmt region in userspace > > Jerin Jacob (7): > common/cnxk: fix batch alloc completion poll logic > net/cnxk: add Rx burst for cn9k > net/cnxk: add Rx vector version for cn9k > net/cnxk: add Tx burst for cn9k > net/cnxk: add Rx burst for cn10k > net/cnxk: add Rx vector version for cn10k > net/cnxk: add Tx burst for cn10k > > Kiran Kumar K (2): > net/cnxk: add support to configure npc > net/cnxk: support initial version of rte flow > > Nithin Dabilpuram (18): > common/cnxk: change model API to not use camel case > net/cnxk: add build infra and common probe > net/cnxk: add platform specific probe and remove > net/cnxk: add common devargs parsing function > net/cnxk: support common dev infos get > net/cnxk: add device configuration operation > net/cnxk: support link status update > net/cnxk: add Rx queue setup and release > net/cnxk: add Tx queue setup and release > net/cnxk: support packet type > net/cnxk: support queue start and stop > net/cnxk: add Rx multi-segmented version for cn9k > net/cnxk: add Tx multi-segment version for cn9k > net/cnxk: add Tx vector version for cn9k > net/cnxk: add Rx multi-segment version for cn10k > net/cnxk: add Tx multi-segment version for cn10k > net/cnxk: add Tx vector version for cn10k > net/cnxk: add device start and stop operations > > Satha Rao (8): > common/cnxk: add support to lock NIX RQ contexts > common/cnxk: add provision to enable RED on RQ > net/cnxk: add port/queue stats > net/cnxk: add xstats apis > net/cnxk: add rxq/txq info get operations > net/cnxk: add ethdev firmware version get > net/cnxk: add get register operation > net/cnxk: added RETA and RSS hash operations > > Satheesh Paul (6): > common/cnxk: add support to dump flow entries > common/cnxk: support for mark and flag flow actions > common/cnxk: support for VLAN push and pop flow actions > net/cnxk: add flow ops get operation > net/cnxk: support for RSS in rte flow > net/cnxk: support marking and VLAN tagging > > Sunil Kumar Kori (20): > net/cnxk: add MAC address set ops > net/cnxk: add MTU set device operation > net/cnxk: add promiscuous mode enable and disable > net/cnxk: support DMAC filter > net/cnxk: add all multicast enable/disable ethops > net/cnxk: add Rx/Tx burst mode get ops > net/cnxk: add flow ctrl set/get ops > net/cnxk: add link up/down operations > net/cnxk: add EEPROM module info get operations > net/cnxk: add Rx queue interrupt enable/disable ops > net/cnxk: add validation API for mempool ops > net/cnxk: add device close and reset operations > net/cnxk: add pending Tx mbuf cleanup operation > net/cnxk: register callback to get PTP status > net/cnxk: support base PTP timesync > net/cnxk: add timesync enable/disable operations > net/cnxk: add Rx/Tx timestamp read operations > net/cnxk: add time read/write/adjust operations > net/cnxk: add read clock operation > net/cnxk: support multicast filter > > -- > > v4: > - Fixed build issue with gcc 4.8 > - Shortened subject lines of few commits > - Removed camel case for model API > - Updated rte_flow features in cnxk_vec.ini and cnxk_vf.ini > - Added CC stable to "fix batch alloc.." 
patch > - Squashed cn98xx flow create related common patch to > VLAN push and pop flow actions patch. > - Changed INTERNAL to DPDK_21 in version.map > > v3: > - Updated release notes > - Removed RTE_ETH_DEV_AUTOFILL_QUEUE_XSTATS flag and add support for queue > stats in xstats > - Fixed issue with LSO format indices > - Removed mbox sync changes patch from this series > - Fixed documentation issues > - Removed repetitive code in fast path SIMD > - Optimize cn10k LMTST logic > - Make rte_flow_create implementation specific > to handle VLAN Stripping and MARK actions/offloads > - Use rte_atomic_thread_fence() instead of rte_rmb() > - Handle other comments from Jerin. > - Merged rte flow dump API patch to flow ops get patch > - Added marking and vlan tagging support. > - Fixed some checkpatch and git check log issues. > > v2: > - Fixed issue with flow validate and flow create for 98xx > - Fixed issue batch alloc logic > - Fix lmtline allocation to be cached > - Sync Inline IPSec Rx mbox with kernel > - Add support for mark and flag flow actions > - Add reta key and hash update ops > - Added PTP and multicast filter support > > MAINTAINERS |5 +- > doc/guides/nics/cnxk.rst| 232 + > doc/guides/nics/features/cnxk.ini | 90 ++ > doc/guides/n
[dpdk-dev] [PATCH] maintainers: update for ARM v8
From: Jerin Jacob Resigning my maintainership for the ARM v8 architecture. Signed-off-by: Jerin Jacob --- Resigning due to not getting enough quality time to review arm64 architecture patches. Unlike those days when the arm64 architecture effort started with Cavium HW, the arm64 architecture is now quite mature and on par with x86 support, and there is enough contribution from Arm. I will put my best effort into reviewing the arm64 architecture patches as time permits. --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 5877a16971..7fee557343 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -268,7 +268,6 @@ F: lib/eal/arm/ X: lib/eal/arm/include/*_64.h ARM v8 -M: Jerin Jacob M: Ruifeng Wang F: config/arm/ F: doc/guides/linux_gsg/cross_build_dpdk_for_arm64.rst -- 2.32.0
Re: [dpdk-dev] [PATCH] net/octeontx2: support Inline IPsec without MBUF_FAST_FREE offload
On Wed, May 12, 2021 at 2:46 PM Tejasree Kondoj wrote: > > Adding support for Inline IPsec without DEV_TX_OFFLOAD_MBUF_FAST_FREE. > > Signed-off-by: Tejasree Kondoj Reviewed-by: Jerin Jacob Updated the git comments and Applied to dpdk-next-net-mrvl/for-next-net. Thanks > --- > drivers/net/octeontx2/otx2_ethdev_sec_tx.h | 5 +++-- > 1 file changed, 3 insertions(+), 2 deletions(-) > > diff --git a/drivers/net/octeontx2/otx2_ethdev_sec_tx.h > b/drivers/net/octeontx2/otx2_ethdev_sec_tx.h > index c8eae3d628..623a2a841e 100644 > --- a/drivers/net/octeontx2/otx2_ethdev_sec_tx.h > +++ b/drivers/net/octeontx2/otx2_ethdev_sec_tx.h > @@ -59,8 +59,7 @@ otx2_sec_event_tx(uint64_t base, struct rte_event *ev, > struct rte_mbuf *m, > sa = &sess->out_sa; > > RTE_ASSERT(sess->cpt_lmtline != NULL); > - RTE_ASSERT(!(offload_flags & (NIX_TX_OFFLOAD_MBUF_NOFF_F | > - NIX_TX_OFFLOAD_VLAN_QINQ_F))); > + RTE_ASSERT(!(offload_flags & NIX_TX_OFFLOAD_VLAN_QINQ_F)); > > dlen = rte_pktmbuf_pkt_len(m) + sizeof(*hdr) - RTE_ETHER_HDR_LEN; > rlen = otx2_ipsec_fp_out_rlen_get(sess, dlen - sizeof(*hdr)); > @@ -135,6 +134,8 @@ otx2_sec_event_tx(uint64_t base, struct rte_event *ev, > struct rte_mbuf *m, > sd->nix_hdr.w0.sizem1 = 1; > sd->nix_hdr.w0.total = rte_pktmbuf_data_len(m); > sd->nix_hdr.w0.aura = npa_lf_aura_handle_to_aura(m->pool->pool_id); > + if (offload_flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) > + sd->nix_hdr.w0.df = otx2_nix_prefree_seg(m); > > sd->nix_sg.u = 0; > sd->nix_sg.subdc = NIX_SUBDC_SG; > -- > 2.27.0 >
Re: [dpdk-dev] [PATCH v1 4/7] power: remove thread safety from PMD power API's
> >> > >> On 22-Jun-21 10:13 AM, Ananyev, Konstantin wrote: > >>> > Currently, we expect that only one callback can be active at any given > moment, for a particular queue configuration, which is relatively easy > to implement in a thread-safe way. However, we're about to add support > for multiple queues per lcore, which will greatly increase the > possibility of various race conditions. > > We could have used something like an RCU for this use case, but absent > of a pressing need for thread safety we'll go the easy way and just > mandate that the API's are to be called when all affected ports are > stopped, and document this limitation. This greatly simplifies the > `rte_power_monitor`-related code. > >>> > >>> I think you need to update RN too with that. > >> > >> Yep, will fix. > >> > >>> Another thing - do you really need the whole port stopped? > >>> From what I understand - you work on queues, so it is enough for you > >>> that the related RX queue is stopped. > >>> So, to make things a bit more robust, in pmgmt_queue_enable/disable > >>> you can call rte_eth_rx_queue_info_get() and check the queue state. > >> > >> We work on queues, but the data is per-lcore not per-queue, and it is > >> potentially used by multiple queues, so checking one specific queue is > >> not going to be enough. We could check all queues that were registered > >> so far with the power library, maybe that'll work better? > > > > Yep, that's what I mean: on queue_enable(), check whether that queue is stopped or > > not. > > If not, return -EBUSY/-EAGAIN or so. > > Sorry if I wasn't clear the first time. > > I think it's still better that all queues are stopped, rather than > trying to work around the inherently racy implementation. So while I'll > add the queue-stopped checks, I'll still remove all of the thread safety > stuff from here. That's fine by me; all I asked for here was an extra check to make sure the queue is really stopped.
Re: [dpdk-dev] [PATCH 3/4] net: introduce functions to verify L4 checksums
On 6/8/2021 1:39 PM, Andrew Rybchenko wrote: > On 6/8/21 3:29 PM, Olivier Matz wrote: >> Hi Ferruh, Andrew, >> >> On Tue, Jun 08, 2021 at 01:23:33PM +0300, Andrew Rybchenko wrote: >>> On 4/30/21 6:42 PM, Ferruh Yigit wrote: On 4/27/2021 2:57 PM, Olivier Matz wrote: > Since commit d5df2ae0428a ("net: fix unneeded replacement of TCP > checksum 0"), the functions rte_ipv4_udptcp_cksum() and > rte_ipv6_udptcp_cksum() can return either 0x0000 or 0xffff when used to > verify a packet containing a valid checksum. > > Since these functions should be used to calculate the checksum to set in > a packet, introduce 2 new helpers for checksum verification. They return > 0 if the checksum is valid in the packet. > > Use this new helper in net/tap driver. > > Signed-off-by: Olivier Matz > --- > drivers/net/tap/rte_eth_tap.c | 7 +- > lib/net/rte_ip.h | 124 +++--- > 2 files changed, 104 insertions(+), 27 deletions(-) > > diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c > index 71282e8065..b14d5a1d55 100644 > --- a/drivers/net/tap/rte_eth_tap.c > +++ b/drivers/net/tap/rte_eth_tap.c > @@ -365,11 +365,12 @@ tap_verify_csum(struct rte_mbuf *mbuf) > return; > } > } > - cksum = rte_ipv4_udptcp_cksum(l3_hdr, l4_hdr); > + cksum_ok = !rte_ipv4_udptcp_cksum_verify(l3_hdr, > + l4_hdr); > } else { /* l3 == RTE_PTYPE_L3_IPV6, checked above */ > - cksum = rte_ipv6_udptcp_cksum(l3_hdr, l4_hdr); > + cksum_ok = !rte_ipv6_udptcp_cksum_verify(l3_hdr, > + l4_hdr); > } > - cksum_ok = (cksum == 0) || (cksum == 0xffff); > mbuf->ol_flags |= cksum_ok ? > PKT_RX_L4_CKSUM_GOOD : PKT_RX_L4_CKSUM_BAD; > } > diff --git a/lib/net/rte_ip.h b/lib/net/rte_ip.h > index 8c189009b0..ef84bcc5bf 100644 > --- a/lib/net/rte_ip.h > +++ b/lib/net/rte_ip.h > @@ -344,20 +344,10 @@ rte_ipv4_phdr_cksum(const struct rte_ipv4_hdr *ipv4_hdr, uint64_t ol_flags) > } > > /** > - * Process the IPv4 UDP or TCP checksum. > - * > - * The IP and layer 4 checksum must be set to 0 in the packet by > - * the caller. > - * > - * @param ipv4_hdr > - * The pointer to the contiguous IPv4 header. > - * @param l4_hdr > - * The pointer to the beginning of the L4 header. > - * @return > - * The complemented checksum to set in the IP packet. > + * @internal Calculate the non-complemented IPv4 L4 checksum > */ > static inline uint16_t > -rte_ipv4_udptcp_cksum(const struct rte_ipv4_hdr *ipv4_hdr, const void *l4_hdr) > +__rte_ipv4_udptcp_cksum(const struct rte_ipv4_hdr *ipv4_hdr, const void *l4_hdr) > { > uint32_t cksum; > uint32_t l3_len, l4_len; > @@ -374,16 +364,62 @@ rte_ipv4_udptcp_cksum(const struct rte_ipv4_hdr *ipv4_hdr, const void *l4_hdr) > cksum += rte_ipv4_phdr_cksum(ipv4_hdr, 0); > > cksum = ((cksum & 0xffff0000) >> 16) + (cksum & 0xffff); > - cksum = (~cksum) & 0xffff; > + > + return (uint16_t)cksum; > +} > + > +/** > + * Process the IPv4 UDP or TCP checksum. > + * > + * The IP and layer 4 checksum must be set to 0 in the packet by > + * the caller. > + * > + * @param ipv4_hdr > + * The pointer to the contiguous IPv4 header. > + * @param l4_hdr > + * The pointer to the beginning of the L4 header. > + * @return > + * The complemented checksum to set in the IP packet. 
> + */ > +static inline uint16_t > +rte_ipv4_udptcp_cksum(const struct rte_ipv4_hdr *ipv4_hdr, const void > *l4_hdr) > +{ > + uint16_t cksum = __rte_ipv4_udptcp_cksum(ipv4_hdr, l4_hdr); > + > + cksum = ~cksum; > + > /* > - * Per RFC 768:If the computed checksum is zero for UDP, > + * Per RFC 768: If the computed checksum is zero for UDP, >* it is transmitted as all ones >* (the equivalent in one's complement arithmetic). >*/ > if (cksum == 0 && ipv4_hdr->next_proto_id == IPPROTO_UDP) > cksum = 0xffff; > > - return (uint16_t)cksum; > + return cksum; > +} > + > +/** > + * Validate the IPv4 UDP or TCP checksum. > + * > + * @param ipv4_hdr > + * The pointer to the contiguous IPv4 header. > + * @param l4_hdr > + * The pointer to the beginning of the L4 header. > + * @return > + * Return 0 if the checksum is correct, else -1. > + */ > +__rte_experimental > +static inline int >
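A short usage sketch of the new verification helper on already-located contiguous headers (the wrapper name is illustrative; being experimental, the API requires ALLOW_EXPERIMENTAL_API to be defined):

#include <rte_ip.h>

/* Return non-zero when the L4 checksum embedded in the packet is valid. */
static int
ipv4_l4_cksum_ok(const struct rte_ipv4_hdr *ip4, const void *l4_hdr)
{
	return rte_ipv4_udptcp_cksum_verify(ip4, l4_hdr) == 0;
}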
[dpdk-dev] [PATCH 1/2] net/bnxt: add support for runtime queue setup
Add support for runtime Rx and Tx queue setup. This will allow Rx/Tx queue setup after the interface is started. Signed-off-by: Ajit Khaparde --- drivers/net/bnxt/bnxt_ethdev.c | 2 ++ drivers/net/bnxt/bnxt_hwrm.c | 46 -- drivers/net/bnxt/bnxt_hwrm.h | 3 ++ drivers/net/bnxt/bnxt_ring.c | 51 ++ drivers/net/bnxt/bnxt_rxq.c| 12 ++-- drivers/net/bnxt/bnxt_txq.c| 2 ++ drivers/net/bnxt/bnxt_txr.c| 6 7 files changed, 105 insertions(+), 17 deletions(-) diff --git a/drivers/net/bnxt/bnxt_ethdev.c b/drivers/net/bnxt/bnxt_ethdev.c index 4d51a209f9..495c6cd21e 100644 --- a/drivers/net/bnxt/bnxt_ethdev.c +++ b/drivers/net/bnxt/bnxt_ethdev.c @@ -987,6 +987,8 @@ static int bnxt_dev_info_get_op(struct rte_eth_dev *eth_dev, dev_info->flow_type_rss_offloads = BNXT_ETH_RSS_SUPPORT; dev_info->speed_capa = bnxt_get_speed_capabilities(bp); + dev_info->dev_capa = RTE_ETH_DEV_CAPA_RUNTIME_RX_QUEUE_SETUP | +RTE_ETH_DEV_CAPA_RUNTIME_TX_QUEUE_SETUP; dev_info->default_rxconf = (struct rte_eth_rxconf) { .rx_thresh = { diff --git a/drivers/net/bnxt/bnxt_hwrm.c b/drivers/net/bnxt/bnxt_hwrm.c index 6c4f83ee3b..1a4968abe6 100644 --- a/drivers/net/bnxt/bnxt_hwrm.c +++ b/drivers/net/bnxt/bnxt_hwrm.c @@ -1917,7 +1917,7 @@ int bnxt_hwrm_stat_clear(struct bnxt *bp, struct bnxt_cp_ring_info *cpr) return rc; } -static int bnxt_hwrm_stat_ctx_alloc(struct bnxt *bp, struct bnxt_cp_ring_info *cpr) +int bnxt_hwrm_stat_ctx_alloc(struct bnxt *bp, struct bnxt_cp_ring_info *cpr) { int rc; struct hwrm_stat_ctx_alloc_input req = {.req_type = 0 }; @@ -2637,10 +2637,11 @@ int bnxt_alloc_all_hwrm_stat_ctxs(struct bnxt *bp) cpr = rxq->cp_ring; } - rc = bnxt_hwrm_stat_ctx_alloc(bp, cpr); - - if (rc) - return rc; + if (cpr->hw_stats_ctx_id == HWRM_NA_SIGNATURE) { + rc = bnxt_hwrm_stat_ctx_alloc(bp, cpr); + if (rc) + return rc; + } } return rc; } @@ -2720,6 +2721,12 @@ void bnxt_free_hwrm_rx_ring(struct bnxt *bp, int queue_index) bp->grp_info[queue_index].ag_fw_ring_id = INVALID_HW_RING_ID; } + + if (cpr->hw_stats_ctx_id != HWRM_NA_SIGNATURE) { + bnxt_hwrm_stat_ctx_free(bp, cpr); + cpr->hw_stats_ctx_id = HWRM_NA_SIGNATURE; + } + if (cpr->cp_ring_struct->fw_ring_id != INVALID_HW_RING_ID) bnxt_free_cp_ring(bp, cpr); @@ -5093,7 +5100,6 @@ static int bnxt_vnic_rss_configure_p5(struct bnxt *bp, struct bnxt_vnic_info *vnic) { struct hwrm_vnic_rss_cfg_output *resp = bp->hwrm_cmd_resp_addr; - uint8_t *rx_queue_state = bp->eth_dev->data->rx_queue_state; struct hwrm_vnic_rss_cfg_input req = {.req_type = 0 }; struct bnxt_rx_queue **rxqs = bp->rx_queues; uint16_t *ring_tbl = vnic->rss_table; @@ -5127,8 +5133,7 @@ bnxt_vnic_rss_configure_p5(struct bnxt *bp, struct bnxt_vnic_info *vnic) /* Find next active ring. 
*/ for (cnt = 0; cnt < max_rings; cnt++) { - if (rx_queue_state[k] != - RTE_ETH_QUEUE_STATE_STOPPED) + if (rxqs[k]->rx_started) break; if (++k == max_rings) k = 0; @@ -6194,3 +6199,28 @@ int bnxt_hwrm_read_sfp_module_eeprom_info(struct bnxt *bp, uint16_t i2c_addr, return rc; } + +void bnxt_free_hwrm_tx_ring(struct bnxt *bp, int queue_index) +{ + struct bnxt_tx_queue *txq = bp->tx_queues[queue_index]; + struct bnxt_tx_ring_info *txr = txq->tx_ring; + struct bnxt_ring *ring = txr->tx_ring_struct; + struct bnxt_cp_ring_info *cpr = txq->cp_ring; + + if (ring->fw_ring_id != INVALID_HW_RING_ID) { + bnxt_hwrm_ring_free(bp, ring, + HWRM_RING_FREE_INPUT_RING_TYPE_TX, + cpr->cp_ring_struct->fw_ring_id); + ring->fw_ring_id = INVALID_HW_RING_ID; + } + + if (cpr->hw_stats_ctx_id != HWRM_NA_SIGNATURE) { + bnxt_hwrm_stat_ctx_free(bp, cpr); + cpr->hw_stats_ctx_id = HWRM_NA_SIGNATURE; + } + + if (cpr->cp_ring_struct->fw_ring_id != INVALID_HW_RING_ID) { + bnxt_free_cp_ring(bp, cpr); + cpr->cp_ring_struct->fw_ring_id = INVALID_HW_RING_ID; + } +} diff --git a/drivers/net/bnxt/bnxt_hwrm.h b/drivers/net/bnxt/bnxt_hwrm.h index 057f7f94d0..ec3414f0c6 100644 --- a/drivers/net/bnxt/bnxt_hwrm.h +++ b/drivers/
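From the application side, the capability advertised above enables sequences like the following hedged sketch (illustrative names, default Rx configuration):

#include <errno.h>
#include <rte_ethdev.h>

/* Set up and start an Rx queue on an already-started port, but only if
 * the PMD advertises runtime Rx queue setup. */
static int
late_rxq_setup(uint16_t port_id, uint16_t qid, uint16_t nb_desc,
	       struct rte_mempool *mp)
{
	struct rte_eth_dev_info info;
	int ret;

	ret = rte_eth_dev_info_get(port_id, &info);
	if (ret != 0)
		return ret;

	if (!(info.dev_capa & RTE_ETH_DEV_CAPA_RUNTIME_RX_QUEUE_SETUP))
		return -ENOTSUP;

	ret = rte_eth_rx_queue_setup(port_id, qid, nb_desc,
			rte_eth_dev_socket_id(port_id), NULL, mp);
	if (ret != 0)
		return ret;

	return rte_eth_dev_rx_queue_start(port_id, qid);
}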
[dpdk-dev] [PATCH 0/2] net/bnxt: add runtime queue setup support
This patchset adds runtime queue setup support. Ajit Khaparde (2): net/bnxt: add support for runtime queue setup net/bnxt: fix ring alloc and free logic drivers/net/bnxt/bnxt_ethdev.c | 2 + drivers/net/bnxt/bnxt_hwrm.c | 181 + drivers/net/bnxt/bnxt_hwrm.h | 3 + drivers/net/bnxt/bnxt_ring.c | 81 ++- drivers/net/bnxt/bnxt_rxq.c| 12 +-- drivers/net/bnxt/bnxt_txq.c| 2 + drivers/net/bnxt/bnxt_txr.c| 6 ++ 7 files changed, 164 insertions(+), 123 deletions(-) -- 2.21.1 (Apple Git-122.3)
[dpdk-dev] [PATCH 2/2] net/bnxt: fix ring alloc and free logic
Fix handling of ring alloc and free logic to fix check for invalid ring and context IDs. This also avoids code duplication. Fixes: 6133f207970c ("net/bnxt: add Rx queue create/destroy") Fixes: 51c87ebafc7d ("net/bnxt: add Tx queue create/destroy") Signed-off-by: Ajit Khaparde Reviewed-by: Somnath Kotur --- drivers/net/bnxt/bnxt_hwrm.c | 173 +++ drivers/net/bnxt/bnxt_ring.c | 30 +- 2 files changed, 78 insertions(+), 125 deletions(-) diff --git a/drivers/net/bnxt/bnxt_hwrm.c b/drivers/net/bnxt/bnxt_hwrm.c index 1a4968abe6..4593991af8 100644 --- a/drivers/net/bnxt/bnxt_hwrm.c +++ b/drivers/net/bnxt/bnxt_hwrm.c @@ -1810,6 +1810,9 @@ int bnxt_hwrm_ring_free(struct bnxt *bp, struct hwrm_ring_free_input req = {.req_type = 0 }; struct hwrm_ring_free_output *resp = bp->hwrm_cmd_resp_addr; + if (ring->fw_ring_id == INVALID_HW_RING_ID) + return -EINVAL; + HWRM_PREP(&req, HWRM_RING_FREE, BNXT_USE_CHIMP_MB); req.ring_type = ring_type; @@ -1817,6 +1820,7 @@ int bnxt_hwrm_ring_free(struct bnxt *bp, req.cmpl_ring = rte_cpu_to_le_16(cp_ring_id); rc = bnxt_hwrm_send_message(bp, &req, sizeof(req), BNXT_USE_CHIMP_MB); + ring->fw_ring_id = INVALID_HW_RING_ID; if (rc || resp->error_code) { if (rc == 0 && resp->error_code) @@ -1902,7 +1906,7 @@ int bnxt_hwrm_stat_clear(struct bnxt *bp, struct bnxt_cp_ring_info *cpr) struct hwrm_stat_ctx_clr_stats_input req = {.req_type = 0 }; struct hwrm_stat_ctx_clr_stats_output *resp = bp->hwrm_cmd_resp_addr; - if (cpr->hw_stats_ctx_id == (uint32_t)HWRM_NA_SIGNATURE) + if (cpr->hw_stats_ctx_id == HWRM_NA_SIGNATURE) return rc; HWRM_PREP(&req, HWRM_STAT_CTX_CLR_STATS, BNXT_USE_CHIMP_MB); @@ -1923,6 +1927,9 @@ int bnxt_hwrm_stat_ctx_alloc(struct bnxt *bp, struct bnxt_cp_ring_info *cpr) struct hwrm_stat_ctx_alloc_input req = {.req_type = 0 }; struct hwrm_stat_ctx_alloc_output *resp = bp->hwrm_cmd_resp_addr; + if (cpr->hw_stats_ctx_id != HWRM_NA_SIGNATURE) + return 0; + HWRM_PREP(&req, HWRM_STAT_CTX_ALLOC, BNXT_USE_CHIMP_MB); req.update_period_ms = rte_cpu_to_le_32(0); @@ -1946,6 +1953,9 @@ static int bnxt_hwrm_stat_ctx_free(struct bnxt *bp, struct bnxt_cp_ring_info *cp struct hwrm_stat_ctx_free_input req = {.req_type = 0 }; struct hwrm_stat_ctx_free_output *resp = bp->hwrm_cmd_resp_addr; + if (cpr->hw_stats_ctx_id == HWRM_NA_SIGNATURE) + return 0; + HWRM_PREP(&req, HWRM_STAT_CTX_FREE, BNXT_USE_CHIMP_MB); req.stat_ctx_id = rte_cpu_to_le_32(cpr->hw_stats_ctx_id); @@ -1955,6 +1965,8 @@ static int bnxt_hwrm_stat_ctx_free(struct bnxt *bp, struct bnxt_cp_ring_info *cp HWRM_CHECK_RESULT(); HWRM_UNLOCK(); + cpr->hw_stats_ctx_id = HWRM_NA_SIGNATURE; + return rc; } @@ -2600,49 +2612,54 @@ bnxt_free_all_hwrm_stat_ctxs(struct bnxt *bp) unsigned int i; struct bnxt_cp_ring_info *cpr; - for (i = 0; i < bp->rx_cp_nr_rings + bp->tx_cp_nr_rings; i++) { + for (i = 0; i < bp->rx_cp_nr_rings; i++) { - if (i >= bp->rx_cp_nr_rings) { - cpr = bp->tx_queues[i - bp->rx_cp_nr_rings]->cp_ring; - } else { - cpr = bp->rx_queues[i]->cp_ring; - if (BNXT_HAS_RING_GRPS(bp)) - bp->grp_info[i].fw_stats_ctx = -1; - } - if (cpr->hw_stats_ctx_id != HWRM_NA_SIGNATURE) { - rc = bnxt_hwrm_stat_ctx_free(bp, cpr); - cpr->hw_stats_ctx_id = HWRM_NA_SIGNATURE; - if (rc) - return rc; - } + cpr = bp->rx_queues[i]->cp_ring; + if (BNXT_HAS_RING_GRPS(bp)) + bp->grp_info[i].fw_stats_ctx = -1; + rc = bnxt_hwrm_stat_ctx_free(bp, cpr); + if (rc) + return rc; + } + + for (i = 0; i < bp->tx_cp_nr_rings; i++) { + cpr = bp->tx_queues[i]->cp_ring; + rc = bnxt_hwrm_stat_ctx_free(bp, cpr); + if (rc) + return rc; } + return 0; } int 
bnxt_alloc_all_hwrm_stat_ctxs(struct bnxt *bp) { + struct bnxt_cp_ring_info *cpr; unsigned int i; int rc = 0; - for (i = 0; i < bp->rx_cp_nr_rings + bp->tx_cp_nr_rings; i++) { - struct bnxt_tx_queue *txq; - struct bnxt_rx_queue *rxq; - struct bnxt_cp_ring_info *cpr; + for (i = 0; i < bp->rx_cp_nr_rings; i++) { + struct bnxt_rx_queue *rxq = bp->rx_queues[i]; - if (i >= bp->rx_cp_nr_rings) { - txq = bp->tx_queues[i - bp->rx_cp_nr_rings]; - cpr = txq->c
[dpdk-dev] dmadev discussion summary
Hi, all

I analyzed the current DPDK DMA drivers and drew up this summary in conjunction with the previous discussion; it will serve as a basis for the V2 implementation. Feedback is welcome, thanks.

dpaa2_qdma:
  [probe]: mainly obtains the number of hardware queues.
  [dev_configure]: has the following parameters:
      max_hw_queues_per_core: max number of HW-queues managed by a single core
      max_vqs: max number of virt-queues
      fle_queue_pool_cnt: the size of the FLE pool
  [queue_setup]: sets up one virt-queue, with the following parameters:
      lcore_id:
      flags: some control params, e.g. sg-list, long-format desc, exclusive HW-queue...
      rbp: some misc fields which impact the descriptor
      Note: this API returns the index of the virt-queue that was successfully set up.
  [enqueue_bufs]: data-plane API, the key fields:
      vq_id: the index of the virt-queue
      job: the pointer to the job array
      nb_jobs: the number of jobs
      Note: one job has src/dest/len/flag/cnxt/status/vq_id/use_elem fields; the flag field indicates whether src/dst are PHY addrs.
  [dequeue_bufs]: gets the completed jobs' pointers
  [key point]:
      |virt-queue|   |virt-queue|
             \           /
              \         /
               \       /
      | HW-queue |   | HW-queue |
              \           /
               \         /
                \       /
                core/rawdev
  1) In the probe stage, the driver reports how many HW-queues can be used.
  2) The user can specify the maximum number of HW-queues managed by a single core in the dev_configure stage.
  3) The user can create one virt-queue with the queue_setup API; a virt-queue has one of two types: a) exclusive HW-queue, b) shared HW-queue (as pictured above), selected by the corresponding bit of the flags field.
  4) In this mode, queue management is simplified. The user does not need to pick a HW-queue and create the virt-queue on it; all that is needed is to say on which core the virt-queue should be created.
  5) Virt-queues can have different capabilities, e.g. virt-queue-0 supports the scatter-gather format while virt-queue-1 does not; this is controlled by the flags and rbp fields in the queue_setup stage.
  6) The data-plane API uses definitions similar to rte_mbuf and rte_eth_rx/tx_burst().
  PS: I still don't understand how sg-list enqueue/dequeue works, and how the user is supposed to use RTE_QDMA_VQ_NO_RESPONSE.

Overall, I think it's a flexible design with good scalability. In particular, the queue resource pool architecture simplifies user invocations, although the 'core' concept is introduced a bit abruptly.

octeontx2_dma:
  [dev_configure]: has one parameter:
      chunk_pool: it is strange that this is not managed internally by the driver, but passed in through the API.
  [enqueue_bufs]: has three important parameters:
      context: this is what Jerin referred to as the 'channel'; it can hold the completion ring of the jobs.
      buffers: holds the pointer array of dpi_dma_buf_ptr_s
      count: how many dpi_dma_buf_ptr_s
      Note: one dpi_dma_buf_ptr_s may have many src and dst pairs (it's a scatter-gather list), and has one completed_ptr (when the HW completes, it will write one value to this ptr); currently the completed_ptr points to this struct:
          struct dpi_dma_req_compl_s {
              uint64_t cdata;  /* driver inits this; HW updates it with the result */
              void (*compl_cb)(void *dev, void *arg);
              void *cb_data;
          };
  [dequeue_bufs]: has two important parameters:
      context: the driver will scan its completion ring to get the completion info.
      buffers: holds the pointer array of completed_ptr.
  [key point]:
      -----------    -----------
      | channel |    | channel |
      -----------    -----------
             \           /
              \         /
               \       /
            ------------
            | HW-queue |
            ------------
                  |
               --------
               |rawdev|
               --------
  1) The user can create one channel by initializing a context (dpi_dma_queue_ctx_s); this interface is not standardized and needs to be implemented by users.
  2) Different channels can support different transmissions, e.g. one for inner m2m, and another for inbound copy.
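To illustrate the completed_ptr mechanism described above: the driver seeds cdata with a "pending" marker, the hardware overwrites it with a result code, and the dequeue path detects the change and fires the callback. Here is a small self-contained sketch; the CDATA_PENDING value, the field semantics and the callback wiring are my reading of the description, not octeontx2's actual definitions:

    #include <stdint.h>
    #include <stdio.h>

    struct dpi_dma_req_compl_s {
            volatile uint64_t cdata;                 /* driver inits, HW overwrites with result */
            void (*compl_cb)(void *dev, void *arg);  /* invoked once completion is observed */
            void *cb_data;
    };

    #define CDATA_PENDING 0xFFULL  /* assumed "not yet completed" marker */

    static void on_done(void *dev, void *arg)
    {
            (void)dev;
            printf("job %s completed\n", (const char *)arg);
    }

    int main(void)
    {
            struct dpi_dma_req_compl_s req = {
                    .cdata = CDATA_PENDING,
                    .compl_cb = on_done,
                    .cb_data = "copy-0",
            };

            req.cdata = 0;  /* emulate the HW DMA-ing the result code into cdata */

            /* dequeue_bufs-style scan: anything no longer pending is done */
            if (req.cdata != CDATA_PENDING)
                    req.compl_cb(NULL, req.cb_data);
            return 0;
    }

The appeal of this design is that completion state travels with each request rather than living in a shared table, so the dequeue side only has to scan its completion ring and chase the pointers it finds there.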
Overall, I think the 'channel' is similar to the 'virt-queue' of dpaa2_qdma. The difference is that dpaa2_qdma supports multiple hardware queues. The 'channel' has the following characteristics:
  1) A channel is an operable unit at the user level. The user can create a channel for each transfer type, for e