[PATCH v2] common/idpf: refactor single queue Tx function

2023-09-04 Thread Simei Su
This patch replaces flex Tx descriptor with base Tx descriptor to align
with kernel driver practice.

Signed-off-by: Simei Su 
---
v2:
* Refine commit title and commit log.
* Remove redundant definition.
* Modify base mode context TSO descriptor.

 drivers/common/idpf/idpf_common_rxtx.c| 76 +--
 drivers/common/idpf/idpf_common_rxtx.h|  2 +-
 drivers/common/idpf/idpf_common_rxtx_avx512.c | 37 +
 drivers/net/idpf/idpf_rxtx.c  |  2 +-
 4 files changed, 73 insertions(+), 44 deletions(-)

diff --git a/drivers/common/idpf/idpf_common_rxtx.c 
b/drivers/common/idpf/idpf_common_rxtx.c
index fc87e3e243..01a8685ea3 100644
--- a/drivers/common/idpf/idpf_common_rxtx.c
+++ b/drivers/common/idpf/idpf_common_rxtx.c
@@ -276,14 +276,14 @@ idpf_qc_single_tx_queue_reset(struct idpf_tx_queue *txq)
}
 
txe = txq->sw_ring;
-   size = sizeof(struct idpf_flex_tx_desc) * txq->nb_tx_desc;
+   size = sizeof(struct idpf_base_tx_desc) * txq->nb_tx_desc;
for (i = 0; i < size; i++)
((volatile char *)txq->tx_ring)[i] = 0;
 
prev = (uint16_t)(txq->nb_tx_desc - 1);
for (i = 0; i < txq->nb_tx_desc; i++) {
-   txq->tx_ring[i].qw1.cmd_dtype =
-   rte_cpu_to_le_16(IDPF_TX_DESC_DTYPE_DESC_DONE);
+   txq->tx_ring[i].qw1 =
+   rte_cpu_to_le_64(IDPF_TX_DESC_DTYPE_DESC_DONE);
txe[i].mbuf =  NULL;
txe[i].last_id = i;
txe[prev].next_id = i;
@@ -823,6 +823,37 @@ idpf_calc_context_desc(uint64_t flags)
return 0;
 }
 
+/* set TSO context descriptor for single queue
+ */
+static inline void
+idpf_set_singleq_tso_ctx(struct rte_mbuf *mbuf,
+   union idpf_tx_offload tx_offload,
+   volatile struct idpf_base_tx_ctx_desc *ctx_desc)
+{
+   uint32_t tso_len;
+   uint8_t hdr_len;
+   uint64_t qw1;
+
+   if (tx_offload.l4_len == 0) {
+   TX_LOG(DEBUG, "L4 length set to 0");
+   return;
+   }
+
+   hdr_len = tx_offload.l2_len +
+   tx_offload.l3_len +
+   tx_offload.l4_len;
+   tso_len = mbuf->pkt_len - hdr_len;
+   qw1 = (uint64_t)IDPF_TX_DESC_DTYPE_CTX;
+
+   qw1 |= IDPF_TX_CTX_DESC_TSO << IDPF_TXD_CTX_QW1_CMD_S;
+   qw1 |= ((uint64_t)tso_len << IDPF_TXD_CTX_QW1_TSO_LEN_S) &
+   IDPF_TXD_CTX_QW1_TSO_LEN_M;
+   qw1 |= ((uint64_t)mbuf->tso_segsz << IDPF_TXD_CTX_QW1_MSS_S) &
+   IDPF_TXD_CTX_QW1_MSS_M;
+
+   ctx_desc->qw1 = rte_cpu_to_le_64(qw1);
+}
+
 /* set TSO context descriptor
  */
 static inline void
@@ -1307,17 +1338,16 @@ idpf_xmit_cleanup(struct idpf_tx_queue *txq)
uint16_t nb_tx_to_clean;
uint16_t i;
 
-   volatile struct idpf_flex_tx_desc *txd = txq->tx_ring;
+   volatile struct idpf_base_tx_desc *txd = txq->tx_ring;
 
desc_to_clean_to = (uint16_t)(last_desc_cleaned + txq->rs_thresh);
if (desc_to_clean_to >= nb_tx_desc)
desc_to_clean_to = (uint16_t)(desc_to_clean_to - nb_tx_desc);
 
desc_to_clean_to = sw_ring[desc_to_clean_to].last_id;
-   /* In the writeback Tx descriptor, the only significant fields are the 4-bit DTYPE */
-   if ((txd[desc_to_clean_to].qw1.cmd_dtype &
-rte_cpu_to_le_16(IDPF_TXD_QW1_DTYPE_M)) !=
-   rte_cpu_to_le_16(IDPF_TX_DESC_DTYPE_DESC_DONE)) {
+   if ((txd[desc_to_clean_to].qw1 &
+rte_cpu_to_le_64(IDPF_TXD_QW1_DTYPE_M)) !=
+   rte_cpu_to_le_64(IDPF_TX_DESC_DTYPE_DESC_DONE)) {
TX_LOG(DEBUG, "TX descriptor %4u is not done "
   "(port=%d queue=%d)", desc_to_clean_to,
   txq->port_id, txq->queue_id);
@@ -1331,10 +1361,7 @@ idpf_xmit_cleanup(struct idpf_tx_queue *txq)
nb_tx_to_clean = (uint16_t)(desc_to_clean_to -
last_desc_cleaned);
 
-   txd[desc_to_clean_to].qw1.cmd_dtype = 0;
-   txd[desc_to_clean_to].qw1.buf_size = 0;
-   for (i = 0; i < RTE_DIM(txd[desc_to_clean_to].qw1.flex.raw); i++)
-   txd[desc_to_clean_to].qw1.flex.raw[i] = 0;
+   txd[desc_to_clean_to].qw1 = 0;
 
txq->last_desc_cleaned = desc_to_clean_to;
txq->nb_free = (uint16_t)(txq->nb_free + nb_tx_to_clean);
@@ -1347,8 +1374,8 @@ uint16_t
 idpf_dp_singleq_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
  uint16_t nb_pkts)
 {
-   volatile struct idpf_flex_tx_desc *txd;
-   volatile struct idpf_flex_tx_desc *txr;
+   volatile struct idpf_base_tx_desc *txd;
+   volatile struct idpf_base_tx_desc *txr;
union idpf_tx_offload tx_offload = {0};
struct idpf_tx_entry *txe, *txn;
struct idpf_tx_entry *sw_ring;
@@ -1356,6 +1383,7 @@ idpf_dp_singleq_xmit_pkts(void *tx_queue, struct rte_mbuf 
**tx_pkts,
struct rte_mbuf *tx_pkt;
struct rte

Re: [PATCH v2 1/5] ethdev: support setting and querying RSS algorithm

2023-09-04 Thread Jie Hai

Hi, Thomas

Thanks for your review.

On 2023/8/30 19:46, Thomas Monjalon wrote:

Hello,

Thanks for bringing a new capability.

26/08/2023 09:46, Jie Hai:

Currently, rte_eth_rss_conf supports configuring and querying
RSS hash functions, the RSS key and its length, but not the RSS
hash algorithm.

The structure ``rte_eth_rss_conf`` is extended by adding a new
field "func". This represents the RSS algorithms to apply. The
following API is affected:
- rte_eth_dev_configure
- rte_eth_dev_rss_hash_update
- rte_eth_dev_rss_hash_conf_get


So far, the RSS algorithm was used only in flow RSS API.


Fixed in V3; these APIs will be affected.

--- a/doc/guides/rel_notes/release_23_11.rst
+++ b/doc/guides/rel_notes/release_23_11.rst
@@ -123,6 +123,8 @@ ABI Changes
 Also, make sure to start the actual text at the margin.
 ===
  
+   * ethdev: Added "func" field to ``rte_eth_rss_conf`` structure for RSS hash

+ algorithm.


As written above, it should start at the margin.


Fixed in V3.

--- a/lib/ethdev/rte_ethdev.h
+++ b/lib/ethdev/rte_ethdev.h
+#include "rte_flow.h"


It is strange to include rte_flow.h here.
It would be better to move the enum.


Fixed in V3; the definitions are moved to rte_ethdev.h.

+ * The *func* field of the *rss_conf* structure indicates the hash algorithm
+ * applied by the RSS hashing. Passing RTE_ETH_HASH_FUNCTION_DEFAULT allows
+ * the PMD to use its best-effort algorithm rather than a specific one.
   */


I don't like commenting a field on top of the structure.
By the way, the first sentence does not look helpful.
RTE_ETH_HASH_FUNCTION_DEFAULT may be explained in the enum.


Other fields above the structure 'rte_eth_rss_conf' have comments.
If the new field 'func' has no comment, it may be misleading,
unless all the comments above are removed. I'm not sure whether to
delete them or not.

  struct rte_eth_rss_conf {
uint8_t *rss_key;/**< If not NULL, 40-byte hash key. */
uint8_t rss_key_len; /**< hash key length in bytes. */
uint64_t rss_hf; /**< Hash functions to apply - see below. */
+   enum rte_eth_hash_function func;/**< Hash algorithm to apply. */


You can drop "to apply" words.

Fixed in V3.


How the algorithms support combinations in rss_hf?


The rss_hf defines the input of the RSS hash algorithms.
For example, with rss_hf = RTE_ETH_RSS_NONFRAG_IPV4_TCP | RTE_ETH_RSS_IPV4,
these two kinds of packets can be dispatched to different queues
according to their tuple field values.
For ipv4-tcp packets, src-ip, dst-ip, src-port and dst-port are used as
parameters of the RSS hash algorithm to compute the hash value.

For ipv4 packets, src-ip and dst-ip are used.

If rss_hf = RTE_ETH_RSS_NONFRAG_IPV4_TCP | RTE_ETH_RSS_L4_SRC_ONLY, for
ipv4-tcp packets only src-port is used as the parameter of the RSS hash
algorithm to compute the hash value.
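
As a hypothetical illustration (application-side sketch, not part of this
patchset; it assumes the new "func" field from this series):

    #include <rte_ethdev.h>

    /* Configure the hash input (rss_hf) and the hash algorithm (func). */
    static int
    configure_rss(uint16_t port_id)
    {
            struct rte_eth_rss_conf rss_conf = {
                    .rss_key = NULL, /* keep the current key */
                    .rss_hf = RTE_ETH_RSS_NONFRAG_IPV4_TCP | RTE_ETH_RSS_IPV4,
                    .func = RTE_ETH_HASH_FUNCTION_TOEPLITZ,
            };

            /* ipv4-tcp packets hash over src/dst IP and src/dst port;
             * other ipv4 packets hash over src/dst IP only.
             */
            return rte_eth_dev_rss_hash_update(port_id, &rss_conf);
    }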






Thanks,
Jie Hai


[PATCH v1] net/memif: fix segfault with large burst size

2023-09-04 Thread Joyce Kong
There will be a segfault when the Rx burst size is greater than
MAX_PKT_BURST of memif. Fix the issue by correcting the
wrong mbuf index in eth_memif_rx, which resulted in accessing an
invalid memory address. A sketch of the failure mode follows.
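
The failure mode can be sketched stand-alone (hypothetical illustration,
not the driver code itself):

    #include <stdio.h>

    #define MAX_PKT_BURST 32

    int
    main(void)
    {
            int mbufs[MAX_PKT_BURST];   /* refilled once per burst */
            unsigned int n_rx_pkts = 0; /* cumulative over all bursts */
            unsigned int burst, rx_pkts;

            for (burst = 0; burst < 2; burst++) {
                    for (rx_pkts = 0; rx_pkts < MAX_PKT_BURST; rx_pkts++) {
                            /* correct: index within the current burst */
                            mbufs[rx_pkts] = 1;
                            /* buggy: mbufs[n_rx_pkts] runs past the array
                             * once n_rx_pkts >= MAX_PKT_BURST
                             */
                            n_rx_pkts++;
                    }
            }
            printf("%u packets\n", n_rx_pkts);
            return 0;
    }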

Bugzilla ID: 1273
Fixes: aa17df860891 ("net/memif: add a Rx fast path")
Cc: sta...@dpdk.org

Signed-off-by: Joyce Kong 
Reviewed-by: Feifei Wang 
Reviewed-by: Ruifeng Wang 
---
 drivers/net/memif/rte_eth_memif.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/memif/rte_eth_memif.c 
b/drivers/net/memif/rte_eth_memif.c
index 6a8ff5b4eb..f595656af5 100644
--- a/drivers/net/memif/rte_eth_memif.c
+++ b/drivers/net/memif/rte_eth_memif.c
@@ -356,7 +356,7 @@ eth_memif_rx(void *queue, struct rte_mbuf **bufs, uint16_t 
nb_pkts)
rx_pkts = 0;
pkts = nb_pkts < MAX_PKT_BURST ? nb_pkts : MAX_PKT_BURST;
while (n_slots && rx_pkts < pkts) {
-   mbuf_head = mbufs[n_rx_pkts];
+   mbuf_head = mbufs[rx_pkts];
mbuf = mbuf_head;
 
 next_slot1:
-- 
2.25.1



[PATCH 3/5] app/proc-info: fix never show RSS info

2023-09-04 Thread Jie Hai
The show-port command should show RSS info (rss_key, len and rss_hf).
However, the information is shown only when rss_conf.rss_key is not
NULL. Since no memory is allocated for rss_conf.rss_key, rss_key
will always be NULL and the RSS info will never be shown. This patch
allocates memory for rss_conf.rss_key and makes it possible to show
RSS info.

Fixes: 8a37f37fc243 ("app/procinfo: add --show-port")
Cc: sta...@dpdk.org

Signed-off-by: Jie Hai 
Signed-off-by: Dongdong Liu 
---
 app/proc-info/main.c | 25 -
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/app/proc-info/main.c b/app/proc-info/main.c
index 88cee0ca487b..f6b77a705dce 100644
--- a/app/proc-info/main.c
+++ b/app/proc-info/main.c
@@ -1013,6 +1013,7 @@ show_port(void)
struct rte_eth_fc_conf fc_conf;
struct rte_ether_addr mac;
struct rte_eth_dev_owner owner;
+   uint8_t *rss_key;
 
/* Skip if port is not in mask */
if ((enabled_port_mask & (1ul << i)) == 0)
@@ -1171,19 +1172,25 @@ show_port(void)
printf("\n");
}
 
+   rss_key = malloc(dev_info.hash_key_size * sizeof(uint8_t));
+   if (rss_key == NULL)
+   return;
+
+   rss_conf.rss_key = rss_key;
+   rss_conf.rss_key_len = dev_info.hash_key_size;
ret = rte_eth_dev_rss_hash_conf_get(i, &rss_conf);
if (ret == 0) {
-   if (rss_conf.rss_key) {
-   printf("  - RSS\n");
-   printf("\t  -- RSS len %u key (hex):",
-   rss_conf.rss_key_len);
-   for (k = 0; k < rss_conf.rss_key_len; k++)
-   printf(" %x", rss_conf.rss_key[k]);
-   printf("\t  -- hf 0x%"PRIx64"\n",
-   rss_conf.rss_hf);
-   }
+   printf("  - RSS\n");
+   printf("\t  -- RSS len %u key (hex):",
+   rss_conf.rss_key_len);
+   for (k = 0; k < rss_conf.rss_key_len; k++)
+   printf(" %x", rss_conf.rss_key[k]);
+   printf("\t  -- hf 0x%"PRIx64"\n",
+   rss_conf.rss_hf);
}
 
+   free(rss_key);
+
 #ifdef RTE_LIB_SECURITY
show_security_context(i, true);
 #endif
-- 
2.33.0



[PATCH 5/5] app/proc-info: support querying RSS hash algorithm

2023-09-04 Thread Jie Hai
Display the RSS hash algorithm with the show-port command as below.
  - RSS info
  -- hash algorithm : toeplitz

Signed-off-by: Jie Hai 
Signed-off-by: Dongdong Liu 
---
 app/proc-info/main.c | 19 +++
 1 file changed, 19 insertions(+)

diff --git a/app/proc-info/main.c b/app/proc-info/main.c
index 6d2d77fea6ba..02b418a4c661 100644
--- a/app/proc-info/main.c
+++ b/app/proc-info/main.c
@@ -996,6 +996,23 @@ show_offloads(uint64_t offloads,
}
 }
 
+static const char *
+rss_func_to_str(enum rte_eth_hash_function func)
+{
+   switch (func) {
+   case RTE_ETH_HASH_FUNCTION_SIMPLE_XOR:
+   return "simple_xor";
+   case RTE_ETH_HASH_FUNCTION_TOEPLITZ:
+   return "toeplitz";
+   case RTE_ETH_HASH_FUNCTION_SYMMETRIC_TOEPLITZ:
+   return "symmetric_toeplitz";
+   case RTE_ETH_HASH_FUNCTION_DEFAULT:
+   return "default";
+   default:
+   return "unknown";
+   }
+}
+
 static void
 show_port(void)
 {
@@ -1188,6 +1205,8 @@ show_port(void)
printf("%02x", rss_conf.rss_key[k]);
printf("\n\t  -- hf : 0x%"PRIx64"\n",
rss_conf.rss_hf);
+   printf("\t  -- hash algorithm : %s\n",
+   rss_func_to_str(rss_conf.func));
}
 
free(rss_key);
-- 
2.33.0



[PATCH 2/5] net/hns3: support setting and querying RSS hash function

2023-09-04 Thread Jie Hai
From: Huisong Li 

Support setting and querying RSS hash function by ethdev ops.

Signed-off-by: Huisong Li 
Signed-off-by: Dongdong Liu 
---
 drivers/net/hns3/hns3_rss.c | 47 +
 1 file changed, 27 insertions(+), 20 deletions(-)

diff --git a/drivers/net/hns3/hns3_rss.c b/drivers/net/hns3/hns3_rss.c
index 6126512bd780..c8346d43d15c 100644
--- a/drivers/net/hns3/hns3_rss.c
+++ b/drivers/net/hns3/hns3_rss.c
@@ -646,14 +646,14 @@ hns3_dev_rss_hash_update(struct rte_eth_dev *dev,
if (ret)
goto set_tuple_fail;
 
-   if (key) {
-   ret = hns3_rss_set_algo_key(hw, hw->rss_info.hash_algo,
-   key, hw->rss_key_size);
-   if (ret)
-   goto set_algo_key_fail;
-   /* Update the shadow RSS key with user specified */
+   ret = hns3_update_rss_algo_key(hw, rss_conf->func, key, key_len);
+   if (ret != 0)
+   goto set_algo_key_fail;
+
+   if (rss_conf->func != RTE_ETH_HASH_FUNCTION_DEFAULT)
+   hw->rss_info.hash_algo = hns3_hash_func_map[rss_conf->func];
+   if (key != NULL)
memcpy(hw->rss_info.key, key, hw->rss_key_size);
-   }
hw->rss_info.rss_hf = rss_hf;
rte_spinlock_unlock(&hw->lock);
 
@@ -769,7 +769,13 @@ int
 hns3_dev_rss_hash_conf_get(struct rte_eth_dev *dev,
   struct rte_eth_rss_conf *rss_conf)
 {
+   const uint8_t hash_func_map[] = {
+   [HNS3_RSS_HASH_ALGO_TOEPLITZ] = RTE_ETH_HASH_FUNCTION_TOEPLITZ,
+   [HNS3_RSS_HASH_ALGO_SIMPLE] = RTE_ETH_HASH_FUNCTION_SIMPLE_XOR,
+   [HNS3_RSS_HASH_ALGO_SYMMETRIC_TOEP] = RTE_ETH_HASH_FUNCTION_SYMMETRIC_TOEPLITZ,
+   };
struct hns3_adapter *hns = dev->data->dev_private;
+   uint8_t rss_key[HNS3_RSS_KEY_SIZE_MAX] = {0};
struct hns3_hw *hw = &hns->hw;
uint8_t hash_algo;
int ret;
@@ -777,26 +783,27 @@ hns3_dev_rss_hash_conf_get(struct rte_eth_dev *dev,
rte_spinlock_lock(&hw->lock);
ret = hns3_rss_hash_get_rss_hf(hw, &rss_conf->rss_hf);
if (ret != 0) {
+   rte_spinlock_unlock(&hw->lock);
hns3_err(hw, "obtain hash tuples failed, ret = %d", ret);
-   goto out;
+   return ret;
+   }
+
+   ret = hns3_rss_get_algo_key(hw, &hash_algo, rss_key, hw->rss_key_size);
+   if (ret != 0) {
+   rte_spinlock_unlock(&hw->lock);
+   hns3_err(hw, "obtain hash algo and key failed, ret = %d", ret);
+   return ret;
}
+   rte_spinlock_unlock(&hw->lock);
 
-   /* Get the RSS Key required by the user */
+   /* Get the RSS Key if user required. */
if (rss_conf->rss_key && rss_conf->rss_key_len >= hw->rss_key_size) {
-   ret = hns3_rss_get_algo_key(hw, &hash_algo, rss_conf->rss_key,
-   hw->rss_key_size);
-   if (ret != 0) {
-   hns3_err(hw, "obtain hash algo and key failed, ret = %d",
-ret);
-   goto out;
-   }
+   memcpy(rss_conf->rss_key, rss_key, hw->rss_key_size);
rss_conf->rss_key_len = hw->rss_key_size;
}
+   rss_conf->func = hash_func_map[hash_algo];
 
-out:
-   rte_spinlock_unlock(&hw->lock);
-
-   return ret;
+   return 0;
 }
 
 /*
-- 
2.33.0



[PATCH 4/5] app/proc-info: adjust the display format of RSS info

2023-09-04 Thread Jie Hai
This patch splits the length and value of the RSS key into two parts,
removes spaces between the RSS key bytes, and adds a line break between
the RSS key and the RSS hf.

Before the adjustment, RSS info is shown as:
  - RSS
  -- RSS len 40 key (hex): 6d 5a 56 da 25 5b e c2 41 67 \
 25 3d 43 a3 8f b0 d0 ca 2b cb ae 7b 30 b4 77 cb 2d \
 a3 80 30 f2 c 6a 42 b7 3b be ac 1 fa -- hf 0x0
and after:
  - RSS info
  -- key len : 40
  -- key (hex) : 6d5a56da255b0ec24167253d43a38fb0d0c \
a2bcbae7b30b477cb2da38030f20c6a42b73bbeac01fa
  -- hf : 0x0

Fixes: 8a37f37fc243 ("app/procinfo: add --show-port")
Cc: sta...@dpdk.org

Signed-off-by: Jie Hai 
---
 app/proc-info/main.c | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/app/proc-info/main.c b/app/proc-info/main.c
index f6b77a705dce..6d2d77fea6ba 100644
--- a/app/proc-info/main.c
+++ b/app/proc-info/main.c
@@ -1180,12 +1180,13 @@ show_port(void)
rss_conf.rss_key_len = dev_info.hash_key_size;
ret = rte_eth_dev_rss_hash_conf_get(i, &rss_conf);
if (ret == 0) {
-   printf("  - RSS\n");
-   printf("\t  -- RSS len %u key (hex):",
+   printf("  - RSS info\n");
+   printf("\t  -- key len : %u\n",
rss_conf.rss_key_len);
+   printf("\t  -- key (hex) : ");
for (k = 0; k < rss_conf.rss_key_len; k++)
-   printf(" %x", rss_conf.rss_key[k]);
-   printf("\t  -- hf 0x%"PRIx64"\n",
+   printf("%02x", rss_conf.rss_key[k]);
+   printf("\n\t  -- hf : 0x%"PRIx64"\n",
rss_conf.rss_hf);
}
 
-- 
2.33.0



[PATCH 0/5] support setting and querying RSS algorithms

2023-09-04 Thread Jie Hai
This patchset is to support setting and querying RSS algorithms.

--
v3:
1. fix commit log for PATCH [1/5].
2. make the RSS ABI changes description start at the margin.
3. move the definition of enum rte_eth_hash_function to rte_ethdev.h.
4. fix some code comments.

v2:
1. return error if "func" is invalid.
2. modify the comments of the "func" field.
3. modify commit log of patch [3/5].
4. use malloc instead of rte_malloc.
5. adjust display format of RSS info.
6. remove the string display of rss_hf.
--

Huisong Li (1):
  net/hns3: support setting and querying RSS hash function

Jie Hai (4):
  ethdev: support setting and querying RSS algorithm
  app/proc-info: fix never show RSS info
  app/proc-info: adjust the display format of RSS info
  app/proc-info: support querying RSS hash algorithm

 app/proc-info/main.c   | 45 +++-
 doc/guides/rel_notes/release_23_11.rst |  2 ++
 drivers/net/hns3/hns3_rss.c| 47 +++---
 lib/ethdev/rte_ethdev.c| 17 ++
 lib/ethdev/rte_ethdev.h| 20 +++
 lib/ethdev/rte_flow.c  |  1 -
 lib/ethdev/rte_flow.h  | 18 ++
 7 files changed, 104 insertions(+), 46 deletions(-)

-- 
2.33.0



[PATCH 1/5] ethdev: support setting and querying RSS algorithm

2023-09-04 Thread Jie Hai
Currently, rte_eth_rss_conf supports configuring and querying
RSS hash functions, the RSS key and its length, but not the RSS
hash algorithm.

The structure ``rte_eth_rss_conf`` is extended by adding a new
field "func". This represents the RSS algorithms to apply. The
following API will be affected:
- rte_eth_dev_configure
- rte_eth_dev_rss_hash_update
- rte_eth_dev_rss_hash_conf_get

If the value of "func" used for configuration is invalid, report
an error and return. This applies to both
rte_eth_dev_rss_hash_update and rte_eth_dev_configure.

To check whether the drivers report the "func" field, it is set
to the default value before querying.
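
As a usage sketch (hypothetical application code, not part of the patch),
a caller can detect whether the driver filled in the new field:

    #include <stdio.h>
    #include <rte_ethdev.h>

    static void
    show_rss_algo(uint16_t port_id)
    {
            struct rte_eth_rss_conf rss_conf = { .rss_key = NULL };

            if (rte_eth_dev_rss_hash_conf_get(port_id, &rss_conf) != 0)
                    return;
            /* "func" is pre-set to RTE_ETH_HASH_FUNCTION_DEFAULT before the
             * driver callback runs, so DEFAULT here means "not reported".
             */
            if (rss_conf.func == RTE_ETH_HASH_FUNCTION_DEFAULT)
                    printf("driver did not report a hash algorithm\n");
    }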

Signed-off-by: Jie Hai 
Signed-off-by: Dongdong Liu 
---
 doc/guides/rel_notes/release_23_11.rst |  2 ++
 lib/ethdev/rte_ethdev.c| 17 +
 lib/ethdev/rte_ethdev.h| 20 
 lib/ethdev/rte_flow.c  |  1 -
 lib/ethdev/rte_flow.h  | 18 ++
 5 files changed, 41 insertions(+), 17 deletions(-)

diff --git a/doc/guides/rel_notes/release_23_11.rst 
b/doc/guides/rel_notes/release_23_11.rst
index 4411bb32c195..94239170139d 100644
--- a/doc/guides/rel_notes/release_23_11.rst
+++ b/doc/guides/rel_notes/release_23_11.rst
@@ -123,6 +123,8 @@ ABI Changes
Also, make sure to start the actual text at the margin.
===
 
+* ethdev: Added "func" field to ``rte_eth_rss_conf`` structure for RSS hash
+  algorithm.
 
 Known Issues
 
diff --git a/lib/ethdev/rte_ethdev.c b/lib/ethdev/rte_ethdev.c
index 0840d2b5942a..4cbcdb344cac 100644
--- a/lib/ethdev/rte_ethdev.c
+++ b/lib/ethdev/rte_ethdev.c
@@ -1445,6 +1445,14 @@ rte_eth_dev_configure(uint16_t port_id, uint16_t 
nb_rx_q, uint16_t nb_tx_q,
goto rollback;
}
 
+   if (dev_conf->rx_adv_conf.rss_conf.func >= RTE_ETH_HASH_FUNCTION_MAX) {
+   RTE_ETHDEV_LOG(ERR,
+   "Ethdev port_id=%u invalid rss hash function (%u)\n",
+   port_id, dev_conf->rx_adv_conf.rss_conf.func);
+   ret = -EINVAL;
+   goto rollback;
+   }
+
/* Check if Rx RSS distribution is disabled but RSS hash is enabled. */
if (((dev_conf->rxmode.mq_mode & RTE_ETH_MQ_RX_RSS_FLAG) == 0) &&
(dev_conf->rxmode.offloads & RTE_ETH_RX_OFFLOAD_RSS_HASH)) {
@@ -4630,6 +4638,13 @@ rte_eth_dev_rss_hash_update(uint16_t port_id,
return -ENOTSUP;
}
 
+   if (rss_conf->func >= RTE_ETH_HASH_FUNCTION_MAX) {
+   RTE_ETHDEV_LOG(ERR,
+   "Ethdev port_id=%u invalid rss hash function (%u)\n",
+   port_id, rss_conf->func);
+   return -EINVAL;
+   }
+
if (*dev->dev_ops->rss_hash_update == NULL)
return -ENOTSUP;
ret = eth_err(port_id, (*dev->dev_ops->rss_hash_update)(dev,
@@ -4657,6 +4672,8 @@ rte_eth_dev_rss_hash_conf_get(uint16_t port_id,
return -EINVAL;
}
 
+   rss_conf->func = RTE_ETH_HASH_FUNCTION_DEFAULT;
+
if (*dev->dev_ops->rss_hash_conf_get == NULL)
return -ENOTSUP;
ret = eth_err(port_id, (*dev->dev_ops->rss_hash_conf_get)(dev,
diff --git a/lib/ethdev/rte_ethdev.h b/lib/ethdev/rte_ethdev.h
index 04a2564f222a..1fe3c0a502ac 100644
--- a/lib/ethdev/rte_ethdev.h
+++ b/lib/ethdev/rte_ethdev.h
@@ -445,6 +445,22 @@ struct rte_vlan_filter_conf {
uint64_t ids[64];
 };
 
+/**
+ * Hash function types.
+ */
+enum rte_eth_hash_function {
+   RTE_ETH_HASH_FUNCTION_DEFAULT = 0,
+   RTE_ETH_HASH_FUNCTION_TOEPLITZ, /**< Toeplitz */
+   RTE_ETH_HASH_FUNCTION_SIMPLE_XOR, /**< Simple XOR */
+   /**
+* Symmetric Toeplitz: src, dst will be replaced by
+* xor(src, dst). For the case with src/dst only,
+* src or dst address will xor with zero pair.
+*/
+   RTE_ETH_HASH_FUNCTION_SYMMETRIC_TOEPLITZ,
+   RTE_ETH_HASH_FUNCTION_MAX,
+};
+
 /**
  * A structure used to configure the Receive Side Scaling (RSS) feature
  * of an Ethernet port.
@@ -461,11 +477,15 @@ struct rte_vlan_filter_conf {
  * The *rss_hf* field of the *rss_conf* structure indicates the different
  * types of IPv4/IPv6 packets to which the RSS hashing must be applied.
  * Supplying an *rss_hf* equal to zero disables the RSS feature.
+ *
+ * The *func* field of the *rss_conf* structure indicates the hash algorithm
+ * applied by the RSS hashing.
  */
 struct rte_eth_rss_conf {
uint8_t *rss_key;/**< If not NULL, 40-byte hash key. */
uint8_t rss_key_len; /**< hash key length in bytes. */
uint64_t rss_hf; /**< Hash functions to apply - see below. */
+   enum rte_eth_hash_function func;/**< Hash algorithm. */
 };
 
 /*
diff --git a/lib/ethdev/rte_flow.c b/lib/ethdev/rte_flow.c
index 271d854f7807..d3f2d466d841 100644
--- a/lib/ethdev/rte_flow.c
+++

Re: [PATCH v3 0/5] support setting and querying RSS algorithms

2023-09-04 Thread Jie Hai

Hi, ALL

Sorry, please drop this patchset.
Here is V3
https://inbox.dpdk.org/dev/20230904072851.7384-1-haij...@huawei.com/

Thanks,
Jie Hai

On 2023/9/4 14:10, Jie Hai wrote:

This patchset is to support setting and querying RSS algorithms.

--
v3:
1. fix commit log for PATCH [1/5].
2. make the RSS ABI changes description start at the margin.
3. move the definition of enum rte_eth_hash_function to rte_ethdev.h.
4. fix some code comments.

v2:
1. return error if "func" is invalid.
2. modify the comments of the "func" field.
3. modify commit log of patch [3/5].
4. use malloc instead of rte_malloc.
5. adjust display format of RSS info.
6. remove the string display of rss_hf.
--


Jie Hai (5):
   app/proc-info: fix never show RSS info
   app/proc-info: adjust the display format of RSS info
   app/proc-info: support querying RSS hash algorithm
   app/testpmd: add RSS hash algorithms display
   app/testpmd: add RSS hash algorithms setting

  app/proc-info/main.c   |  45 ---
  app/test-pmd/cmdline.c | 128 ++---
  app/test-pmd/config.c  |  62 +++-
  app/test-pmd/testpmd.h |   5 +-
  4 files changed, 194 insertions(+), 46 deletions(-)



Re: [PATCH v2 1/5] ethdev: support setting and querying RSS algorithm

2023-09-04 Thread Thomas Monjalon
04/09/2023 09:10, Jie Hai:
> On 2023/8/30 19:46, Thomas Monjalon wrote:
> > 26/08/2023 09:46, Jie Hai:
 >> + * The *func* field of the *rss_conf* structure indicates the hash 
 >> algorithm
> >> + * applied by the RSS hashing. Passing RTE_ETH_HASH_FUNCTION_DEFAULT 
> >> allows
> >> + * the PMD to use its best-effort algorithm rather than a specific one.
> >>*/
> > 
> > I don't like commenting a field on top of the structure.
> > By the way, the first sentence does not look helpful.
> > RTE_ETH_HASH_FUNCTION_DEFAULT may be explained in the enum.
> > 
> Other fields  above the structure 'rte_eth_rss_conf' have comments.
> If the new fields 'func' do not have comments, it may be misleading.
> Unless all the comments above are removed. I'm not sure whether to
> delete them or not.

You should explain RTE_ETH_HASH_FUNCTION_DEFAULT in its enum.
The rest of the explanation can be on the struct field.
I'm OK to have another patch moving old explanations from the struct
to the fields.

> >>   struct rte_eth_rss_conf {
> >>uint8_t *rss_key;/**< If not NULL, 40-byte hash key. */
> >>uint8_t rss_key_len; /**< hash key length in bytes. */
> >>uint64_t rss_hf; /**< Hash functions to apply - see below. */
> >> +  enum rte_eth_hash_function func;/**< Hash algorithm to apply. */
> > 
> > You can drop "to apply" words.
> Fixed in V3.
> > 
> > How the algorithms support combinations in rss_hf?
> > 
> The rss_hf defines the input of the RSS hash algorithms.
> For example, rss_hf = RTE_ETH_RSS_NONFRAG_IPV4_TCP | RTE_ETH_RSS_IPV4, 
> these two kinds of packets can be dispatched to different queues 
> according to their tuple field value.
> For ipv4-tcp packets, src-ip, dst-ip, src-port, dst-port are used as 
> parameters of RSS hash algorithms to compute hash value.
> For ipv4 packets src-ip, dst-ip are used.
> 
> If rss_hf = RTE_ETH_RSS_NONFRAG_IPV4_TCP | RTE_ETH_RSS_L4_SRC_ONLY, for
> ipv4-tcp packets, src-port is used as parameters of RSS hash algorithms 
> to compute hash value.

I know what is rss_hf.
My question is about the algorithms.
Do they all support any combination in rss_hf?




Re: DPDK community: RTE_FLOW support for P4-programmable devices

2023-09-04 Thread Ferruh Yigit
On 9/1/2023 10:52 AM, Jerin Jacob wrote:
> On Fri, Sep 1, 2023 at 12:27 PM Ori Kam  wrote:
>>
>>
>>
>>> -Original Message-
>>> From: Jerin Jacob 
>>> Sent: Friday, September 1, 2023 5:07 AM
>>>
>>> On Thu, Aug 31, 2023 at 4:02 PM Ori Kam  wrote:

 Hi
>>>
>>>
>>> 3. Everybody on the call agreed that the P4-programmable devices from
> Intel,
>>> AMD and others need to be fully supported by DPDK and that there are
> some
>>> gaps in RTE_FLOW to be fixed for supporting these devices.
>>
>> Personally, It makes sense to me to have normative DPDK API to send p4
>> runtime message to the
>> ethdev so that we have "vendor neutral + DPDK based" p4 runtime
>>> backend.
>>
>> I prefer to have specialized ethdev ops for this due to the following
>>> reasons.
>>
>> # If the ethdev has both real TCAM based HW(for existing rte_flow
>> patterns and actions) and SW agent to receive P4 runtime message etc.
>> Typically, it needs to take a different path in driver to talk. Assume, 
>> if you
>> have cascaded patterns/actions, One is targeted for TCAM and other for
>> SW agent for p4, one
>> need to have a serious amount of checking for dispatching. It complicates
>> the driver and forbid to have
>> driver optimization especially cases for templates etc. if user making
>> rules for both category of HW.
>>
>
> Indeed I am not against dedicated APIs for P4 runtime backend.
>
> But assuming there is a dedicated rte_flow item for P4, how it is
> different than dedicated API in above scenario?
> If driver detects P4 runtime specific rule, it can bail it out to SW 
> agent.
> Can you please elaborate the complexity it introduces?
>>>
>>> Assume, normal existing rte-flow programming include a bunch of
>>> register writes and
>>> p4 runtime backend is more of SW ring. If a template has both patterns
>>> and actions
>>> as cascaded, it will be difficult for driver to optimize the template.
>>>

I assumed the P4-specific item/action can take a completely separate path; do
you think existing rte_flow items/actions and P4 items/actions can mix in the
same rte_flow rule?


>>>
>
>> # All we need "char buffer//string" based communication ethdev <-> app.
>>
>
> Yes, and both a dedicated API or dedicated rte_flow item can provide
> medium for this communication.
>
> rte_flow one has flexibility & extensibility advantages, but maybe not
> as straightforward as an API.

 I think not using the rte_flow will also require duplication of all the 
 rule
>>> handling functions and table creations, for example aync rule create/destroy
>>> query ..
>>>
>>> Yes. That is a fair point. I am OK with exposing as rte_flow.
>>> As a driver implementation note, to get rid of the above problem,
>>> driver can choose to have pseudo ethdev devices for p4 if needed(if
>>> driver finds difficult to combine TCAM based on HW rules and p4
>>> runtime rule).
>>>
>>
>> What about the following concept:
>> The p4 compiler can generate the translation to known PMD objects in 
>> rte_flow,
>> then when a command is sent from the p4 agent to the offload using GRPC or 
>> any other way, the DPDK will convert from
>> p4 protocol to rte_flow commands (this should be very fast since the 
>> commands are known and the mapping is already
>> defined).
>>
>> To support the above idea we need to add two new functions to rte_flow (each 
>> function will be implemented in PMD level)
> 
> If the implemention of rte_flow_p4_runtime((p4 command based on the p4 spec))
> is defined in PMD level, The PMD driver to decide to use rte_flow or
> something else.
> I think it is good, this is actually going back to specialized API.
> BTW, rte_flow prefix is not needed in that case.
> 

+1 this is leaning to the dedicated API option.

> 
> 
>> Rte_flow_register_p4(void *p4_info, void *p4_blob)
>> {
>> Creates the static structures/objects
>> Internal register the p4 commands to PMD translation table.
>> }
>>
>> Rte_flow_p4_runtime(p4 command based on the p4 spec)
>> {
>> Based on the registered mapping, translate the command to rte_flow 
>> commands.
>> Rte_flow_xxx() calls

For the 'rte_flow_xxx()' calls here, my understanding is existing
rte_flow calls are not sufficient to meet the requirement to program the
P4 pipeline, that is why this patch is about defining the new flow item.

If these are not rte_flow calls but PMD-defined code, then it is not
much different from previously suggested dedicated API solution.


>> }
>>
>> As far as I see, the above code will also allow PMD to implement internal 
>> logic if needed, while from DPDK API,
>> we will only add two new functions.
>>
>>>




RE: [PATCH v11 2/4] net/i40e: implement mbufs recycle mode

2023-09-04 Thread Konstantin Ananyev


Hi Feifei,

> > > > > > Define specific function implementation for i40e driver.
> > > > > > Currently, mbufs recycle mode can support 128bit vector path and
> > > > > > avx2
> > > > path.
> > > > > > And can be enabled both in fast free and no fast free mode.
> > > > > >
> > > > > > Suggested-by: Honnappa Nagarahalli
> > > > > > 
> > > > > > Signed-off-by: Feifei Wang 
> > > > > > Reviewed-by: Ruifeng Wang 
> > > > > > Reviewed-by: Honnappa Nagarahalli
> > 
> > > > > > ---
> > > > > >  drivers/net/i40e/i40e_ethdev.c|   1 +
> > > > > >  drivers/net/i40e/i40e_ethdev.h|   2 +
> > > > > >  .../net/i40e/i40e_recycle_mbufs_vec_common.c  | 147
> > > > > > ++
> > > > > >  drivers/net/i40e/i40e_rxtx.c  |  32 
> > > > > >  drivers/net/i40e/i40e_rxtx.h  |   4 +
> > > > > >  drivers/net/i40e/meson.build  |   1 +
> > > > > >  6 files changed, 187 insertions(+)  create mode 100644
> > > > > > drivers/net/i40e/i40e_recycle_mbufs_vec_common.c
> > > > > >
> > > > > > diff --git a/drivers/net/i40e/i40e_ethdev.c
> > > > > > b/drivers/net/i40e/i40e_ethdev.c index 8271bbb394..50ba9aac94
> > > > > > 100644
> > > > > > --- a/drivers/net/i40e/i40e_ethdev.c
> > > > > > +++ b/drivers/net/i40e/i40e_ethdev.c
> > > > > > @@ -496,6 +496,7 @@ static const struct eth_dev_ops
> > > > > > i40e_eth_dev_ops
> > > > = {
> > > > > > .flow_ops_get = i40e_dev_flow_ops_get,
> > > > > > .rxq_info_get = i40e_rxq_info_get,
> > > > > > .txq_info_get = i40e_txq_info_get,
> > > > > > +   .recycle_rxq_info_get = i40e_recycle_rxq_info_get,
> > > > > > .rx_burst_mode_get= i40e_rx_burst_mode_get,
> > > > > > .tx_burst_mode_get= i40e_tx_burst_mode_get,
> > > > > > .timesync_enable  = i40e_timesync_enable,
> > > > > > diff --git a/drivers/net/i40e/i40e_ethdev.h
> > > > > > b/drivers/net/i40e/i40e_ethdev.h index 6f65d5e0ac..af758798e1
> > > > > > 100644
> > > > > > --- a/drivers/net/i40e/i40e_ethdev.h
> > > > > > +++ b/drivers/net/i40e/i40e_ethdev.h
> > > > > > @@ -1355,6 +1355,8 @@ void i40e_rxq_info_get(struct rte_eth_dev
> > > > > > *dev, uint16_t queue_id,
> > > > > > struct rte_eth_rxq_info *qinfo);  void
> > > > > > i40e_txq_info_get(struct rte_eth_dev *dev, uint16_t queue_id,
> > > > > > struct rte_eth_txq_info *qinfo);
> > > > > > +void i40e_recycle_rxq_info_get(struct rte_eth_dev *dev,
> > > > > > +uint16_t
> > > > queue_id,
> > > > > > +   struct rte_eth_recycle_rxq_info *recycle_rxq_info);
> > > > > >  int i40e_rx_burst_mode_get(struct rte_eth_dev *dev, uint16_t
> > queue_id,
> > > > > >struct rte_eth_burst_mode *mode);  int
> > > > > > i40e_tx_burst_mode_get(struct rte_eth_dev *dev, uint16_t
> > > > > > queue_id, diff -- git
> > > > > > a/drivers/net/i40e/i40e_recycle_mbufs_vec_common.c
> > > > > > b/drivers/net/i40e/i40e_recycle_mbufs_vec_common.c
> > > > > > new file mode 100644
> > > > > > index 00..5663ecccde
> > > > > > --- /dev/null
> > > > > > +++ b/drivers/net/i40e/i40e_recycle_mbufs_vec_common.c
> > > > > > @@ -0,0 +1,147 @@
> > > > > > +/* SPDX-License-Identifier: BSD-3-Clause
> > > > > > + * Copyright (c) 2023 Arm Limited.
> > > > > > + */
> > > > > > +
> > > > > > +#include 
> > > > > > +#include 
> > > > > > +
> > > > > > +#include "base/i40e_prototype.h"
> > > > > > +#include "base/i40e_type.h"
> > > > > > +#include "i40e_ethdev.h"
> > > > > > +#include "i40e_rxtx.h"
> > > > > > +
> > > > > > +#pragma GCC diagnostic ignored "-Wcast-qual"
> > > > > > +
> > > > > > +void
> > > > > > +i40e_recycle_rx_descriptors_refill_vec(void *rx_queue, uint16_t
> > > > > > +nb_mbufs) {
> > > > > > +   struct i40e_rx_queue *rxq = rx_queue;
> > > > > > +   struct i40e_rx_entry *rxep;
> > > > > > +   volatile union i40e_rx_desc *rxdp;
> > > > > > +   uint16_t rx_id;
> > > > > > +   uint64_t paddr;
> > > > > > +   uint64_t dma_addr;
> > > > > > +   uint16_t i;
> > > > > > +
> > > > > > +   rxdp = rxq->rx_ring + rxq->rxrearm_start;
> > > > > > +   rxep = &rxq->sw_ring[rxq->rxrearm_start];
> > > > > > +
> > > > > > +   for (i = 0; i < nb_mbufs; i++) {
> > > > > > +   /* Initialize rxdp descs. */
> > > > > > +   paddr = (rxep[i].mbuf)->buf_iova +
> > > > > > RTE_PKTMBUF_HEADROOM;
> > > > > > +   dma_addr = rte_cpu_to_le_64(paddr);
> > > > > > +   /* flush desc with pa dma_addr */
> > > > > > +   rxdp[i].read.hdr_addr = 0;
> > > > > > +   rxdp[i].read.pkt_addr = dma_addr;
> > > > > > +   }
> > > > > > +
> > > > > > +   /* Update the descriptor initializer index */
> > > > > > +   rxq->rxrearm_start += nb_mbufs;
> > > > > > +   rx_id = rxq->rxrearm_start - 1;
> > > > > > +
> > > > > > +   if (unlikely(rxq->rxrearm_start >= rxq->nb_rx_desc)) {
> > > > > > +   rxq->rxrearm_start = 0;
> > > > > > +   rx_id = rxq->nb_rx_desc - 1;
> > > > > > +   

Re: [PATCH v2 3/4] net/iavf: fix restart of Rx queue on reconfigure

2023-09-04 Thread Bruce Richardson
On Mon, Sep 04, 2023 at 02:30:32AM +0100, Zhang, Qi Z wrote:
> 
> 
> > -Original Message-
> > From: Zhang, Qi Z
> > Sent: Monday, September 4, 2023 9:15 AM
> > To: Bruce Richardson ; dev@dpdk.org
> > Cc: Richardson, Bruce ; Wu, Jingjing
> > ; sta...@dpdk.org
> > Subject: RE: [PATCH v2 3/4] net/iavf: fix restart of Rx queue on reconfigure
> > 
> > 
> > 
> > > -Original Message-
> > > From: Bruce Richardson 
> > > Sent: Thursday, August 31, 2023 8:34 PM
> > > To: dev@dpdk.org
> > > Cc: Richardson, Bruce ; Wu, Jingjing
> > > ; sta...@dpdk.org
> > > Subject: [PATCH v2 3/4] net/iavf: fix restart of Rx queue on
> > > reconfigure
> > >
> > > After reconfiguring an RX queue the mbuf_initializer value was not
> > > being correctly set. Fix this by calling the appropriate function if
> > > vector processing is enabled. This mirrors the behaviour of the i40e 
> > > driver.
> > >
> > > Fixes: 319c421f3890 ("net/avf: enable SSE Rx Tx")
> > > Cc: jingjing...@intel.com
> > > Cc: sta...@dpdk.org
> > >
> > > Signed-off-by: Bruce Richardson 
> > > ---
> > >  drivers/net/iavf/iavf_rxtx.c | 7 +++
> > >  1 file changed, 7 insertions(+)
> > >
> > > diff --git a/drivers/net/iavf/iavf_rxtx.c
> > > b/drivers/net/iavf/iavf_rxtx.c index
> > > f7df4665d1..797cdda4b2 100644
> > > --- a/drivers/net/iavf/iavf_rxtx.c
> > > +++ b/drivers/net/iavf/iavf_rxtx.c
> > > @@ -755,6 +755,13 @@ iavf_dev_rx_queue_setup(struct rte_eth_dev *dev,
> > > uint16_t queue_idx,
> > >   if (check_rx_vec_allow(rxq) == false)
> > >   ad->rx_vec_allowed = false;
> > >
> > > +#if defined RTE_ARCH_X86 || defined RTE_ARCH_ARM
> > > + /* check vector conflict */
> > > + if (ad->rx_vec_allowed && iavf_rxq_vec_setup(rxq)) {
> > > + PMD_DRV_LOG(ERR, "Failed vector rx setup.");
> > > + return -EINVAL;
> > > + }
> > > +#endif
> > 
> > Bruce:
> > 
> > May I know more details about how to reproduce this issue?
> > As the iavf PMD does not support
> > RTE_ETH_DEV_CAPA_RUNTIME_RX_QUEUE_SETUP (i40e does)
> 
> OK, not sure if the patch 4/4 answered my question 😊
> 
> should I squash patch 3, 4 into one? , for my understanding patch 3 doesn't 
> appear to be a bug fix unless we announce 
> RTE_ETH_DEV_CAPA_RUNTIME_RX_QUEUE_SETUP.
> 
You may have a point. I was experimenting with queue reconfiguration which
is where I hit this issue.
However, even without queue reconfig support, the device still needs to
support queue-stop followed by queue-start, I think, and there may still be
an issue there - I'll have to check.

/Bruce


[PATCH v2] eal: fix memory initialization deadlock

2023-09-04 Thread Artemy Kovalyov
The issue arose due to the change in the DPDK read-write lock
implementation. That change added a new flag, RTE_RWLOCK_WAIT, designed
to prevent new read locks while a write lock is in the queue. However,
this change has led to a scenario where a recursive read lock, where a
lock is acquired twice by the same execution thread, can initiate a
sequence of events resulting in a deadlock:

Process 1 takes the first read lock.
Process 2 attempts to take a write lock, triggering RTE_RWLOCK_WAIT due
to the presence of a read lock. This makes process 2 enter a wait loop
until the read lock is released.
Process 1 tries to take a second read lock. However, since a write lock
is waiting (due to RTE_RWLOCK_WAIT), it also enters a wait loop until
the write lock is acquired and then released.

Both processes end up in a blocked state, unable to proceed, resulting
in a deadlock scenario.
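
A minimal sketch of that interleaving with the DPDK rwlock API (hypothetical
illustration, not code from this patch):

    #include <rte_rwlock.h>

    static rte_rwlock_t lock = RTE_RWLOCK_INITIALIZER;

    static void
    thread1(void)
    {
            rte_rwlock_read_lock(&lock);  /* step 1: first read lock */
            /* ... thread2() starts its write lock here (step 2) ... */
            rte_rwlock_read_lock(&lock);  /* step 3: blocks on RTE_RWLOCK_WAIT;
                                           * neither thread can proceed
                                           */
    }

    static void
    thread2(void)
    {
            rte_rwlock_write_lock(&lock); /* step 2: sets RTE_RWLOCK_WAIT and
                                           * spins until all readers leave
                                           */
    }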

Following these changes, the RW-lock no longer supports
recursion, implying that a single thread shouldn't obtain a read lock if
it already possesses one. The problem arises during initialization: the
rte_eal_init() function acquires the memory_hotplug_lock, and later on,
the sequence of calls rte_eal_memory_init() -> eal_memalloc_init() ->
rte_memseg_list_walk() acquires it again without releasing it. This
scenario introduces the risk of a potential deadlock when concurrent
write locks are applied to the same memory_hotplug_lock. To address
this, we replaced rte_memseg_list_walk() with
rte_memseg_list_walk_thread_unsafe().

Bugzilla ID: 1277
Fixes: 832cecc03d77 ("rwlock: prevent readers from starving writers")
Cc: sta...@dpdk.org

Signed-off-by: Artemy Kovalyov 
---
v2:
changed walk to thread-unsafe version in eal_dynmem_hugepage_init() 32-bit flow
---
 lib/eal/common/eal_common_dynmem.c   | 5 -
 lib/eal/include/generic/rte_rwlock.h | 4 
 lib/eal/linux/eal_memalloc.c | 7 +--
 3 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/lib/eal/common/eal_common_dynmem.c 
b/lib/eal/common/eal_common_dynmem.c
index bdbbe233a0..0d5da40096 100644
--- a/lib/eal/common/eal_common_dynmem.c
+++ b/lib/eal/common/eal_common_dynmem.c
@@ -251,7 +251,10 @@ eal_dynmem_hugepage_init(void)
 */
memset(&dummy, 0, sizeof(dummy));
dummy.hugepage_sz = hpi->hugepage_sz;
-   if (rte_memseg_list_walk(hugepage_count_walk, &dummy) < 0)
+   /*  memory_hotplug_lock is taken in rte_eal_init(), so it's
+*  safe to call thread-unsafe version.
+*/
+   if (rte_memseg_list_walk_thread_unsafe(hugepage_count_walk, &dummy) < 0)
return -1;
 
for (i = 0; i < RTE_DIM(dummy.num_pages); i++) {
diff --git a/lib/eal/include/generic/rte_rwlock.h 
b/lib/eal/include/generic/rte_rwlock.h
index 9e083bbc61..c98fc7d083 100644
--- a/lib/eal/include/generic/rte_rwlock.h
+++ b/lib/eal/include/generic/rte_rwlock.h
@@ -80,6 +80,10 @@ rte_rwlock_init(rte_rwlock_t *rwl)
 /**
  * Take a read lock. Loop until the lock is held.
  *
+ * @note The RW lock isn't recursive, so calling this function on the same
+ * lock twice without releasing it could potentially result in a deadlock
+ * scenario when a write lock is involved.
+ *
  * @param rwl
  *   A pointer to a rwlock structure.
  */
diff --git a/lib/eal/linux/eal_memalloc.c b/lib/eal/linux/eal_memalloc.c
index f8b1588cae..3705b41f5f 100644
--- a/lib/eal/linux/eal_memalloc.c
+++ b/lib/eal/linux/eal_memalloc.c
@@ -1740,7 +1740,10 @@ eal_memalloc_init(void)
eal_get_internal_configuration();
 
if (rte_eal_process_type() == RTE_PROC_SECONDARY)
-   if (rte_memseg_list_walk(secondary_msl_create_walk, NULL) < 0)
+   /*  memory_hotplug_lock is taken in rte_eal_init(), so it's
+*  safe to call thread-unsafe version.
+*/
+   if (rte_memseg_list_walk_thread_unsafe(secondary_msl_create_walk, NULL) < 0)
return -1;
if (rte_eal_process_type() == RTE_PROC_PRIMARY &&
internal_conf->in_memory) {
@@ -1778,7 +1781,7 @@ eal_memalloc_init(void)
}
 
/* initialize all of the fd lists */
-   if (rte_memseg_list_walk(fd_list_create_walk, NULL))
+   if (rte_memseg_list_walk_thread_unsafe(fd_list_create_walk, NULL))
return -1;
return 0;
 }
-- 
2.25.1



[PATCH] clarify purpose of empty cache lines

2023-09-04 Thread Morten Brørup
This patch introduces the generic RTE_CACHE_GUARD macro into the EAL, and
replaces vaguely described empty cache lines in the rte_ring structure
with this macro.

Although the implementation of the rte_ring structure assumes that the
hardware speculatively prefetches 1 cache line, this number can be changed
at build time by modifying RTE_CACHE_GUARD_LINES in rte_config.h.

The background and the RFC was discussed in this thread:
http://inbox.dpdk.org/dev/98cbd80474fa8b44bf855df32c47dc35d87...@smartserver.smartshare.dk/
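
A usage sketch (hypothetical struct, not from this patch) for data written
by different lcores:

    #include <stdint.h>
    #include <rte_common.h>
    #include <rte_lcore.h>

    struct my_lcore_stats {
            uint64_t packets;
            uint64_t bytes;
            RTE_CACHE_GUARD; /* keep the prefetcher off the next entry */
    } __rte_cache_aligned;

    static struct my_lcore_stats stats[RTE_MAX_LCORE];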

Signed-off-by: Morten Brørup 
---
 config/rte_config.h  |  1 +
 lib/eal/include/rte_common.h | 13 +
 lib/ring/rte_ring_core.h |  6 +++---
 3 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/config/rte_config.h b/config/rte_config.h
index 400e44e3cf..cfdf787724 100644
--- a/config/rte_config.h
+++ b/config/rte_config.h
@@ -37,6 +37,7 @@
 #define RTE_MAX_TAILQ 32
 #define RTE_LOG_DP_LEVEL RTE_LOG_INFO
 #define RTE_MAX_VFIO_CONTAINERS 64
+#define RTE_CACHE_GUARD_LINES 1
 
 /* bsd module defines */
 #define RTE_CONTIGMEM_MAX_NUM_BUFS 64
diff --git a/lib/eal/include/rte_common.h b/lib/eal/include/rte_common.h
index 771c70f2c8..daf1866a32 100644
--- a/lib/eal/include/rte_common.h
+++ b/lib/eal/include/rte_common.h
@@ -527,6 +527,19 @@ rte_is_aligned(const void * const __rte_restrict ptr, 
const unsigned int align)
 /** Force minimum cache line alignment. */
 #define __rte_cache_min_aligned __rte_aligned(RTE_CACHE_LINE_MIN_SIZE)
 
+#define _RTE_CACHE_GUARD_HELPER2(unique) \
+   char cache_guard_ ## unique[RTE_CACHE_LINE_SIZE * RTE_CACHE_GUARD_LINES] \
+   __rte_cache_aligned
+#define _RTE_CACHE_GUARD_HELPER1(unique) _RTE_CACHE_GUARD_HELPER2(unique)
+/**
+ * Empty cache lines, to guard against false sharing-like effects
+ * on systems with a next-N-lines hardware prefetcher.
+ *
+ * Use as spacing between data accessed by different lcores,
+ * to prevent cache thrashing on hardware with speculative prefetching.
+ */
+#define RTE_CACHE_GUARD _RTE_CACHE_GUARD_HELPER1(__COUNTER__)
+
 /*** PA/IOVA type definitions /
 
 /** Physical address */
diff --git a/lib/ring/rte_ring_core.h b/lib/ring/rte_ring_core.h
index d1e59bf9ad..327fdcf28f 100644
--- a/lib/ring/rte_ring_core.h
+++ b/lib/ring/rte_ring_core.h
@@ -126,7 +126,7 @@ struct rte_ring {
uint32_t mask;   /**< Mask (size-1) of ring. */
uint32_t capacity;   /**< Usable size of ring */
 
-   char pad0 __rte_cache_aligned; /**< empty cache line */
+   RTE_CACHE_GUARD;
 
/** Ring producer status. */
union {
@@ -135,7 +135,7 @@ struct rte_ring {
struct rte_ring_rts_headtail rts_prod;
}  __rte_cache_aligned;
 
-   char pad1 __rte_cache_aligned; /**< empty cache line */
+   RTE_CACHE_GUARD;
 
/** Ring consumer status. */
union {
@@ -144,7 +144,7 @@ struct rte_ring {
struct rte_ring_rts_headtail rts_cons;
}  __rte_cache_aligned;
 
-   char pad2 __rte_cache_aligned; /**< empty cache line */
+   RTE_CACHE_GUARD;
 };
 
 #define RING_F_SP_ENQ 0x0001 /**< The default enqueue is "single-producer". */
-- 
2.17.1



[PATCH] mempool: add cache guard to per-lcore debug statistics

2023-09-04 Thread Morten Brørup
The per-lcore debug statistics, if enabled, are frequently written by
their individual lcores, so add a cache guard to prevent CPU cache
thrashing.

Depends-on: series-29415 ("clarify purpose of empty cache lines")

Signed-off-by: Morten Brørup 
---
 lib/mempool/rte_mempool.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
index a05b25d5b9..f70bf36080 100644
--- a/lib/mempool/rte_mempool.h
+++ b/lib/mempool/rte_mempool.h
@@ -78,6 +78,7 @@ struct rte_mempool_debug_stats {
	uint64_t get_fail_objs;/**< Objects that failed to be allocated. */
	uint64_t get_success_blks; /**< Successful allocation number of contiguous blocks. */
	uint64_t get_fail_blks;/**< Failed allocation number of contiguous blocks. */
+   RTE_CACHE_GUARD;
 } __rte_cache_aligned;
 #endif
 
-- 
2.17.1



Re: [PATCH] clarify purpose of empty cache lines

2023-09-04 Thread Bruce Richardson
On Mon, Sep 04, 2023 at 10:43:49AM +0200, Morten Brørup wrote:
> This patch introduces the generic RTE_CACHE_GUARD macro into the EAL, and
> replaces vaguely described empty cache lines in the rte_ring structure
> with this macro.
> 
> Although the implementation of the rte_ring structure assumes that the
> hardware speculatively prefetches 1 cache line, this number can be changed
> at build time by modifying RTE_CACHE_GUARD_LINES in rte_config.h.
> 
> The background and the RFC was discussed in this thread:
> http://inbox.dpdk.org/dev/98cbd80474fa8b44bf855df32c47dc35d87...@smartserver.smartshare.dk/
> 
> Signed-off-by: Morten Brørup 
> ---
Seems fine to me.

Acked-by: Bruce Richardson 


RE: [PATCH v11 2/4] net/i40e: implement mbufs recycle mode

2023-09-04 Thread Feifei Wang



> -Original Message-
> From: Konstantin Ananyev 
> Sent: Monday, September 4, 2023 3:50 PM
> To: Feifei Wang ; Konstantin Ananyev
> 
> Cc: dev@dpdk.org; nd ; Honnappa Nagarahalli
> ; Ruifeng Wang
> ; Yuying Zhang ; Beilei
> Xing ; nd ; nd ; nd
> 
> Subject: RE: [PATCH v11 2/4] net/i40e: implement mbufs recycle mode
> 
> 
> Hi Feifei,
> 
> > > > > > > Define specific function implementation for i40e driver.
> > > > > > > Currently, mbufs recycle mode can support 128bit vector path
> > > > > > > and
> > > > > > > avx2
> > > > > path.
> > > > > > > And can be enabled both in fast free and no fast free mode.
> > > > > > >
> > > > > > > Suggested-by: Honnappa Nagarahalli
> > > > > > > 
> > > > > > > Signed-off-by: Feifei Wang 
> > > > > > > Reviewed-by: Ruifeng Wang 
> > > > > > > Reviewed-by: Honnappa Nagarahalli
> > > 
> > > > > > > ---
> > > > > > >  drivers/net/i40e/i40e_ethdev.c|   1 +
> > > > > > >  drivers/net/i40e/i40e_ethdev.h|   2 +
> > > > > > >  .../net/i40e/i40e_recycle_mbufs_vec_common.c  | 147
> > > > > > > ++
> > > > > > >  drivers/net/i40e/i40e_rxtx.c  |  32 
> > > > > > >  drivers/net/i40e/i40e_rxtx.h  |   4 +
> > > > > > >  drivers/net/i40e/meson.build  |   1 +
> > > > > > >  6 files changed, 187 insertions(+)  create mode 100644
> > > > > > > drivers/net/i40e/i40e_recycle_mbufs_vec_common.c
> > > > > > >
> > > > > > > diff --git a/drivers/net/i40e/i40e_ethdev.c
> > > > > > > b/drivers/net/i40e/i40e_ethdev.c index
> > > > > > > 8271bbb394..50ba9aac94
> > > > > > > 100644
> > > > > > > --- a/drivers/net/i40e/i40e_ethdev.c
> > > > > > > +++ b/drivers/net/i40e/i40e_ethdev.c
> > > > > > > @@ -496,6 +496,7 @@ static const struct eth_dev_ops
> > > > > > > i40e_eth_dev_ops
> > > > > = {
> > > > > > >   .flow_ops_get = i40e_dev_flow_ops_get,
> > > > > > >   .rxq_info_get = i40e_rxq_info_get,
> > > > > > >   .txq_info_get = i40e_txq_info_get,
> > > > > > > + .recycle_rxq_info_get = i40e_recycle_rxq_info_get,
> > > > > > >   .rx_burst_mode_get= i40e_rx_burst_mode_get,
> > > > > > >   .tx_burst_mode_get= i40e_tx_burst_mode_get,
> > > > > > >   .timesync_enable  = i40e_timesync_enable,
> > > > > > > diff --git a/drivers/net/i40e/i40e_ethdev.h
> > > > > > > b/drivers/net/i40e/i40e_ethdev.h index
> > > > > > > 6f65d5e0ac..af758798e1
> > > > > > > 100644
> > > > > > > --- a/drivers/net/i40e/i40e_ethdev.h
> > > > > > > +++ b/drivers/net/i40e/i40e_ethdev.h
> > > > > > > @@ -1355,6 +1355,8 @@ void i40e_rxq_info_get(struct
> > > > > > > rte_eth_dev *dev, uint16_t queue_id,
> > > > > > >   struct rte_eth_rxq_info *qinfo);  void
> > > > > > > i40e_txq_info_get(struct rte_eth_dev *dev, uint16_t queue_id,
> > > > > > >   struct rte_eth_txq_info *qinfo);
> > > > > > > +void i40e_recycle_rxq_info_get(struct rte_eth_dev *dev,
> > > > > > > +uint16_t
> > > > > queue_id,
> > > > > > > + struct rte_eth_recycle_rxq_info *recycle_rxq_info);
> > > > > > >  int i40e_rx_burst_mode_get(struct rte_eth_dev *dev,
> > > > > > > uint16_t
> > > queue_id,
> > > > > > >  struct rte_eth_burst_mode *mode);  int
> > > > > > > i40e_tx_burst_mode_get(struct rte_eth_dev *dev, uint16_t
> > > > > > > queue_id, diff -- git
> > > > > > > a/drivers/net/i40e/i40e_recycle_mbufs_vec_common.c
> > > > > > > b/drivers/net/i40e/i40e_recycle_mbufs_vec_common.c
> > > > > > > new file mode 100644
> > > > > > > index 00..5663ecccde
> > > > > > > --- /dev/null
> > > > > > > +++ b/drivers/net/i40e/i40e_recycle_mbufs_vec_common.c
> > > > > > > @@ -0,0 +1,147 @@
> > > > > > > +/* SPDX-License-Identifier: BSD-3-Clause
> > > > > > > + * Copyright (c) 2023 Arm Limited.
> > > > > > > + */
> > > > > > > +
> > > > > > > +#include 
> > > > > > > +#include 
> > > > > > > +
> > > > > > > +#include "base/i40e_prototype.h"
> > > > > > > +#include "base/i40e_type.h"
> > > > > > > +#include "i40e_ethdev.h"
> > > > > > > +#include "i40e_rxtx.h"
> > > > > > > +
> > > > > > > +#pragma GCC diagnostic ignored "-Wcast-qual"
> > > > > > > +
> > > > > > > +void
> > > > > > > +i40e_recycle_rx_descriptors_refill_vec(void *rx_queue,
> > > > > > > +uint16_t
> > > > > > > +nb_mbufs) {
> > > > > > > + struct i40e_rx_queue *rxq = rx_queue;
> > > > > > > + struct i40e_rx_entry *rxep;
> > > > > > > + volatile union i40e_rx_desc *rxdp;
> > > > > > > + uint16_t rx_id;
> > > > > > > + uint64_t paddr;
> > > > > > > + uint64_t dma_addr;
> > > > > > > + uint16_t i;
> > > > > > > +
> > > > > > > + rxdp = rxq->rx_ring + rxq->rxrearm_start;
> > > > > > > + rxep = &rxq->sw_ring[rxq->rxrearm_start];
> > > > > > > +
> > > > > > > + for (i = 0; i < nb_mbufs; i++) {
> > > > > > > + /* Initialize rxdp descs. */
> > > > > > > + paddr = (rxep[i].mbuf)->buf_iova +
> > > > > > > RTE_PKTMBUF_HEADROOM;
> > > > > > > + dma_addr = rte_cpu_to_le_64(paddr);
> > > 

[PATCH] eal: add cache guard to per-lcore PRNG state

2023-09-04 Thread Morten Brørup
The per-lcore random state is frequently updated by their individual
lcores, so add a cache guard to prevent CPU cache thrashing.

Depends-on: series-29415 ("clarify purpose of empty cache lines")

Signed-off-by: Morten Brørup 
---
 lib/eal/common/rte_random.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/eal/common/rte_random.c b/lib/eal/common/rte_random.c
index 565f2401ce..3df0c7004a 100644
--- a/lib/eal/common/rte_random.c
+++ b/lib/eal/common/rte_random.c
@@ -18,6 +18,7 @@ struct rte_rand_state {
uint64_t z3;
uint64_t z4;
uint64_t z5;
+   RTE_CACHE_GUARD;
 } __rte_cache_aligned;
 
 /* One instance each for every lcore id-equipped thread, and one
-- 
2.17.1



ring name size

2023-09-04 Thread Thomas Monjalon
Hello,

When creating a ring, we need to know the maximum length of the name.
It seems this value is not documented.
And there is no constant defined.
There is only RTE_MEMZONE_NAMESIZE.

Should we document the maximum length as 31 and add a constant?
#define RTE_RING_NAME_SIZE RTE_MEMZONE_NAMESIZE




Re: 22.11.3 patches review and test

2023-09-04 Thread Kevin Traynor

On 01/09/2023 04:23, Zeng, ZhichaoX wrote:

-Original Message-
From: Kevin Traynor 
Sent: Thursday, August 31, 2023 8:18 PM
To: Xu, HailinX ; Xueming Li ;
sta...@dpdk.org; Wu, Jingjing ; Xing, Beilei
; Xu, Ke1 ; Zeng, ZhichaoX
; Zhang, Qi Z 
Cc: xuemi...@nvdia.com; dev@dpdk.org; Stokes, Ian ;
Mcnamara, John ; Luca Boccassi
; Xu, Qian Q ; Thomas Monjalon
; Peng, Yuan ; Chen,
Zhaoyan 
Subject: Re: 22.11.3 patches review and test

On 30/08/2023 17:25, Kevin Traynor wrote:

On 29/08/2023 09:22, Xu, HailinX wrote:

-Original Message-
From: Xueming Li 
Sent: Thursday, August 17, 2023 2:14 PM
To: sta...@dpdk.org
Cc: xuemi...@nvdia.com; dev@dpdk.org; Abhishek Marathe
; Ali Alnubani ;
Walker, Benjamin ; David Christensen
; Hemant Agrawal

;

Stokes, Ian ; Jerin Jacob
; Mcnamara, John ;
Ju-Hyoung Lee ; Kevin Traynor
; Luca Boccassi ; Pei Zhang
; Xu, Qian Q ; Raslan
Darawsheh ; Thomas Monjalon
; Yanghang Liu ; Peng,
Yuan ; Chen, Zhaoyan



Subject: 22.11.3 patches review and test

Hi all,

Here is a list of patches targeted for stable release 22.11.3.

The planned date for the final release is 31th August.

Please help with testing and validation of your use cases and report
any issues/results with reply-all to this mail. For the final
release the fixes and reported validations will be added to the release

notes.


A release candidate tarball can be found at:

   https://dpdk.org/browse/dpdk-stable/tag/?id=v22.11.3-rc1

These patches are located at branch 22.11 of dpdk-stable repo:
   https://dpdk.org/browse/dpdk-stable/

Thanks.


We are conducting DPDK testing and have found two issues.

1. The startup speed of testpmd is significantly slower on SUSE OS.
 The fix for this issue has been merged into main, but it has not been
backported to 22.11.3.

 Fix patch commit id on DPDK main:
7e7b6762eac292e78c77ad37ec0973c0c944b845

2. The SCTP tunnel packet of iavf cannot be forwarded in avx512 mode


Need to clarify this sentence. It looks like it is not a functional bug where
avx512 mode is selected and then an SCTP tunnel packet cannot be sent.
Instead, it is a possible performance issue that avx512 mode will not be
selected where it might have been due to unneeded additions
(RTE_ETH_TX_OFFLOAD_*_TNL_TSO) to IAVF_TX_NO_VECTOR_FLAGS.

@IAVF maintainers - please confirm my analysis is correct ?

In that case, as it is a possible performance issue in a specific case for a 
single
driver I think it is non-critical for 21.11 and we can just revert the patch on 
the
branch and wait for 21.11.6 release in December.


Hi Kevin,

Since the LTS version of the IAVF driver does not support avx512 checksum 
offload,
the scalar path should be selected, but this patch makes it incorrectly select 
the
avx512 path, and the SCTP tunnel packets can't be forwarded properly.



ok, let's take a look at the patch and usage.

diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index 8d4a77271a..605ea3f824 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -32,4 +32,8 @@
RTE_ETH_TX_OFFLOAD_MULTI_SEGS |  \
RTE_ETH_TX_OFFLOAD_TCP_TSO | \
+   RTE_ETH_TX_OFFLOAD_VXLAN_TNL_TSO |   \
+   RTE_ETH_TX_OFFLOAD_GRE_TNL_TSO | \
+   RTE_ETH_TX_OFFLOAD_IPIP_TNL_TSO |\
+   RTE_ETH_TX_OFFLOAD_GENEVE_TNL_TSO |  \
RTE_ETH_TX_OFFLOAD_SECURITY)



So we now have:
#define IAVF_TX_NO_VECTOR_FLAGS (\
RTE_ETH_TX_OFFLOAD_VLAN_INSERT | \
RTE_ETH_TX_OFFLOAD_QINQ_INSERT | \
RTE_ETH_TX_OFFLOAD_MULTI_SEGS |  \
RTE_ETH_TX_OFFLOAD_TCP_TSO | \
RTE_ETH_TX_OFFLOAD_VXLAN_TNL_TSO |   \
RTE_ETH_TX_OFFLOAD_GRE_TNL_TSO | \
RTE_ETH_TX_OFFLOAD_IPIP_TNL_TSO |\
RTE_ETH_TX_OFFLOAD_GENEVE_TNL_TSO |  \
RTE_ETH_TX_OFFLOAD_SECURITY)



static inline int
iavf_tx_vec_queue_default(struct iavf_tx_queue *txq)
{
if (!txq)
return -1;

if (txq->rs_thresh < IAVF_VPMD_TX_MAX_BURST ||
txq->rs_thresh > IAVF_VPMD_TX_MAX_FREE_BUF)
return -1;

if (txq->offloads & IAVF_TX_NO_VECTOR_FLAGS)
return -1;
^^^ Adding the extra bits to IAVF_TX_NO_VECTOR_FLAGS gives 
*more* reasons for why this statement might not be true, so returning -1 
indicating that vector cannot be used for tx queue





static inline bool
check_tx_vec_allow(struct iavf_tx_queue *txq)
{
if (!(txq->offloads & IAVF_TX_NO_VECTOR_FLAGS) &&

^^^ Adding the extra bits to IAVF_TX_NO_VECTOR_FLAGS gives 
*more* reason for this statement will be false and then false returned 
indicating that vector cannot be used.


txq->rs_thresh >= IAVF_VPMD_TX_MAX_BURST &&

RE: ring name size

2023-09-04 Thread Morten Brørup
> From: Thomas Monjalon [mailto:tho...@monjalon.net]
> Sent: Monday, 4 September 2023 11.31
> 
> Hello,
> 
> When creating a ring, we need to know the maximum length of the name.
> It seems this value is not documented.
> And there is no constant defined.

It is defined (with a description) here:
https://elixir.bootlin.com/dpdk/latest/source/lib/ring/rte_ring_core.h#L52

> There is only RTE_MEMZONE_NAMESIZE.
> 
> Should we document the maximum length as 31 and add a constant?
> #define RTE_RING_NAME_SIZE RTE_MEMZONE_NAMESIZE
> 
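
For illustration, a minimal sketch of sizing a ring name against that
constant (standard rte_ring API; the name format and the queue_id
variable are made up):

    #include <stdio.h>      /* snprintf() */
    #include <rte_lcore.h>  /* rte_socket_id() */
    #include <rte_ring.h>   /* pulls in rte_ring_core.h and RTE_RING_NAMESIZE */

    char name[RTE_RING_NAMESIZE];
    struct rte_ring *r;

    /* snprintf() truncates the name to fit; alternatively, let
     * rte_ring_create() fail (NULL, rte_errno set) on an oversized name. */
    snprintf(name, sizeof(name), "app_ring_%u", queue_id);
    r = rte_ring_create(name, 1024, rte_socket_id(), 0);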



Re: [PATCH] usertools: suggest use of hwloc for new cpu

2023-09-04 Thread Ferruh Yigit
On 8/14/2023 10:25 AM, Konstantin Ananyev wrote:
> 
>> On Sun, Aug 13, 2023 at 08:52:01AM -0700, Stephen Hemminger wrote:
>>> On Sun, 13 Aug 2023 02:12:03 +
>>> "Varghese, Vipin"  wrote:
>>>
>
> On Sat, 12 Aug 2023 06:27:20 +0530
> Vipin Varghese  wrote:
>
>> Most modern processors now support NUMA partitioning based on
>> CPU-IO & Last Level Cache within the same socket.
>> As per the discussion on the mailing list, suggesting to make use of
>> hwloc for such scenarios.
>>
>> Signed-off-by: Vipin Varghese 
>
> NAK, no scripting hwloc, it is ugly and creates a dependency that is not 
> listed
> in DPDK packaging.

 There are no calls to hwloc within the script. Hence it is not clear what
 ` NAK, no scripting hwloc it is ugly and creates a
>> dependency that is not listed in DPDK packaging.` means.

 Requesting a cross-check of why a NAK was given for a `print` suggestion.
 Hence, I disagree with this.
>>>
>>> Sorry, I misinterpreted what the print's were doing.
>>> Better off not to list exact flags, the lstopo may change and user may
>>> want different format anyway.
>>>
>>> How about something like this?
>>>
>>>
>>>  doc/guides/rel_notes/deprecation.rst | 5 +
>>>  usertools/cpu_layout.py  | 5 +
>>>  2 files changed, 10 insertions(+)
>>>
>>> diff --git a/doc/guides/rel_notes/deprecation.rst 
>>> b/doc/guides/rel_notes/deprecation.rst
>>> index 317875c5054b..25a116900dfb 100644
>>> --- a/doc/guides/rel_notes/deprecation.rst
>>> +++ b/doc/guides/rel_notes/deprecation.rst
>>> @@ -185,3 +185,8 @@ Deprecation Notices
>>>will be deprecated and subsequently removed in DPDK 24.11 release.
>>>Before this, the new port library API (functions rte_swx_port_*)
>>>will gradually transition from experimental to stable status.
>>> +
>>> +* cpulayout: The CPU layout script is unable to deal with all the possible
>>> +  complexities of modern CPU topology. Other existing tools offer more
>>> +  features and do a better job with keeping up with innovations.
>>> +  Therefore it will be deprecated and removed in a future release.
>>
>> Does the script really do that bad a job? While I can understand us looking
>> to recommend alternatives, I actually find the script in its current form
>> really handy - much more so than working out the exact flags for lstopo
>> etc. Since it's not a large maintenance burden, I'd request we keep it
>> around - while still recommending lstopo to users.
> 
> +1
> I do use it on regular basis.
> It would be a pity if it were gone.
>

I also use it from time to time and find it useful.

But it is not accurate/correct for some AMD platforms (for various NPS
(Nodes per Socket) values).
So either it needs to be updated/improved or replaced.

Vipin sent a patch [1] to update it, but it is a question how much of this
logic belongs in DPDK, or whether we should rely on external tools dedicated
to this purpose.


Both are OK; if we can reach a decision we can proceed in that direction.


[1]
https://patchwork.dpdk.org/project/dpdk/patch/20220326073207.489694-1-vipin.vargh...@amd.com/


RE: [RFC] lib/st_ring: add single thread ring

2023-09-04 Thread Konstantin Ananyev



> Add a single thread safe and multi-thread unsafe ring data structure.
> This library provides a simple and efficient alternative to multi-thread
> safe ring when multi-thread safety is not required.

Just a thought: do we really need whole new library for that?
From what I understand, all we need right now is just one extra function:
rte_ring_mt_unsafe_prod_deque(...)
Sorry for ugly name :)
To dequeue N elems from prod.tail.
Or you think there would be some extra advantages in ST version of the ring:
extra usages, better performance, etc.?

> 
> Signed-off-by: Honnappa Nagarahalli 
> ---
> v1:
> 1) The code is very preliminary and is not even compiled
> 2) This is intended to show the APIs and some thoughts on implementation
> 3) More APIs and the rest of the implementation will come in subsequent
>versions
> 
>  lib/st_ring/rte_st_ring.h | 567 ++
>  1 file changed, 567 insertions(+)
>  create mode 100644 lib/st_ring/rte_st_ring.h
> 
> diff --git a/lib/st_ring/rte_st_ring.h b/lib/st_ring/rte_st_ring.h
> new file mode 100644
> index 00..8cb8832591
> --- /dev/null
> +++ b/lib/st_ring/rte_st_ring.h
> @@ -0,0 +1,567 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2023 Arm Limited
> + */
> +
> +#ifndef _RTE_ST_RING_H_
> +#define _RTE_ST_RING_H_
> +
> +/**
> + * @file
> + * RTE Single Thread Ring (ST Ring)
> + *
> + * The ST Ring is a fixed-size queue intended to be accessed
> + * by one thread at a time. It does not provide concurrent access to
> + * multiple threads. If there are multiple threads accessing the ST ring,
> + * then the threads have to use locks to protect the ring from
> + * getting corrupted.
> + *
> + * - FIFO (First In First Out)
> + * - Maximum size is fixed; the pointers are stored in a table.
> + * - Consumer and producer part of same thread.
> + * - Multi-thread producers and consumers need locking.
> + * - Single/Bulk/burst dequeue at Tail or Head
> + * - Single/Bulk/burst enqueue at Head or Tail
> + *
> + */
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +#include 
> +#include 
> +
> +/**
> + * Calculate the memory size needed for a ST ring
> + *
> + * This function returns the number of bytes needed for a ST ring, given
> + * the number of elements in it. This value is the sum of the size of
> + * the structure rte_st_ring and the size of the memory needed by the
> + * elements. The value is aligned to a cache line size.
> + *
> + * @param count
> + *   The number of elements in the ring (must be a power of 2).
> + * @return
> + *   - The memory size needed for the ST ring on success.
> + *   - -EINVAL if count is not a power of 2.
> + */
> +ssize_t rte_st_ring_get_memsize(unsigned int count);
> +
> +/**
> + * Initialize a ST ring structure.
> + *
> + * Initialize a ST ring structure in memory pointed by "r". The size of the
> + * memory area must be large enough to store the ring structure and the
> + * object table. It is advised to use rte_st_ring_get_memsize() to get the
> + * appropriate size.
> + *
> + * The ST ring size is set to *count*, which must be a power of two.
> + * The real usable ring size is *count-1* instead of *count* to
> + * differentiate a full ring from an empty ring.
> + *
> + * The ring is not added in RTE_TAILQ_ST_RING global list. Indeed, the
> + * memory given by the caller may not be shareable among dpdk
> + * processes.
> + *
> + * @param r
> + *   The pointer to the ring structure followed by the elements table.
> + * @param name
> + *   The name of the ring.
> + * @param count
> + *   The number of elements in the ring (must be a power of 2,
> + *   unless RTE_ST_RING_F_EXACT_SZ is set in flags).
> + * @param flags
> + *   An OR of the following:
> + *   - RTE_ST_RING_F_EXACT_SZ: If this flag is set, the ring will hold
> + * exactly the requested number of entries, and the requested size
> + * will be rounded up to the next power of two, but the usable space
> + * will be exactly that requested. Worst case, if a power-of-2 size is
> + * requested, half the ring space will be wasted.
> + * Without this flag set, the ring size requested must be a power of 2,
> + * and the usable space will be that size - 1.
> + * @return
> + *   0 on success, or a negative value on error.
> + */
> +int rte_st_ring_init(struct rte_st_ring *r, const char *name,
> + unsigned int count, unsigned int flags);
> +
> +/**
> + * Create a new ST ring named *name* in memory.
> + *
> + * This function uses ``memzone_reserve()`` to allocate memory. Then it
> + * calls rte_st_ring_init() to initialize an empty ring.
> + *
> + * The new ring size is set to *count*, which must be a power of two.
> + * The real usable ring size is *count-1* instead of *count* to
> + * differentiate a full ring from an empty ring.
> + *
> + * The ring is added in RTE_TAILQ_ST_RING list.
> + *
> + * @param name
> + *   The name of the ring.
> + * @param count
> + *   The size of the ring (must be a pow
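
A minimal usage sketch against the proposed API quoted above; note that
the enqueue/dequeue names are assumptions based on the RFC's
description, not taken from the actual header:

    #include <stdlib.h>
    #include <rte_st_ring.h>

    void *obj = NULL;
    ssize_t size = rte_st_ring_get_memsize(1024); /* -EINVAL if not a power of 2 */
    struct rte_st_ring *r = (size > 0) ? malloc(size) : NULL;

    if (r != NULL && rte_st_ring_init(r, "st_ring_0", 1024, 0) == 0) {
            /* producer and consumer run on the same thread: no atomics,
             * no locks; the two calls below are hypothetical names */
            rte_st_ring_enqueue(r, obj);
            rte_st_ring_dequeue(r, &obj);
    }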

Re: [PATCH] usertools: suggest use of hwloc for new cpu

2023-09-04 Thread Bruce Richardson
On Mon, Sep 04, 2023 at 11:11:20AM +0100, Ferruh Yigit wrote:
> On 8/14/2023 10:25 AM, Konstantin Ananyev wrote:
> > 
> >> On Sun, Aug 13, 2023 at 08:52:01AM -0700, Stephen Hemminger wrote:
> >>> On Sun, 13 Aug 2023 02:12:03 +
> >>> "Varghese, Vipin"  wrote:
> >>>
> >
> > On Sat, 12 Aug 2023 06:27:20 +0530
> > Vipin Varghese  wrote:
> >
> >> Most modern processors now support NUMA partitioning based on
> >> CPU-IO & Last Level Cache within the same socket.
> >> As per the discussion on the mailing list, suggesting to make use of
> >> hwloc for such scenarios.
> >>
> >> Signed-off-by: Vipin Varghese 
> >
> > NAK, no scripting hwloc, it is ugly and creates a dependency that is 
> > not listed
> > in DPDK packaging.
> 
>  There are no calls to hwloc within the script. Hence it is not clear what
>  ` NAK, no scripting hwloc it is ugly and creates a
> >> dependency that is not listed in DPDK packaging.` means.
> 
>  Requesting a cross-check of why a NAK was given for a `print` suggestion.
>  Hence, I disagree with this.
> >>>
> >>> Sorry, I misinterpreted what the print's were doing.
> >>> Better off not to list exact flags, the lstopo may change and user may
> >>> want different format anyway.
> >>>
> >>> How about something like this?
> >>>
> >>>
> >>>  doc/guides/rel_notes/deprecation.rst | 5 +
> >>>  usertools/cpu_layout.py  | 5 +
> >>>  2 files changed, 10 insertions(+)
> >>>
> >>> diff --git a/doc/guides/rel_notes/deprecation.rst 
> >>> b/doc/guides/rel_notes/deprecation.rst
> >>> index 317875c5054b..25a116900dfb 100644
> >>> --- a/doc/guides/rel_notes/deprecation.rst
> >>> +++ b/doc/guides/rel_notes/deprecation.rst
> >>> @@ -185,3 +185,8 @@ Deprecation Notices
> >>>will be deprecated and subsequently removed in DPDK 24.11 release.
> >>>Before this, the new port library API (functions rte_swx_port_*)
> >>>will gradually transition from experimental to stable status.
> >>> +
> >>> +* cpulayout: The CPU layout script is unable to deal with all the 
> >>> possible
> >>> +  complexities of modern CPU topology. Other existing tools offer more
> >>> +  features and do a better job with keeping up with innovations.
> >>> +  Therefore it will be deprecated and removed in a future release.
> >>
> >> Does the script really do that bad a job? While I can understand us looking
> >> to recommend alternatives, I actually find the script in its current form
> >> really handy - much more so than working out the exact flags for lstopo
> >> etc. Since it's not a large maintenance burden, I'd request we keep it
> >> around - while still recommending lstopo to users.
> > 
> > +1
> > I do use it on regular basis.
> > It would be a pity if it were gone.
> >
> 
> I also use it from time to time and find it useful.
> 
> But it is not accurate/correct for some AMD platforms (for various NPS
> (Nodes per Socket) values).
> So either it needs to be updated/improved or replaced.
> 
> Vipin sent a patch [1] to update it, but it is a question how much of this
> logic belongs in DPDK, or whether we should rely on external tools dedicated
> to this purpose.
> 

I'd like to suggest that we take a slightly ambiguous position on this
script. Specifically:

I think we should "recommend" but not "rely on" external tools for this.
Specifically, I think that recommending use of hwloc is the best thing to
do as it's better maintained and packaged for windows. However, for quick
use in many situations, cpu_layout does the job as well or better in terms
of simplicity of use and output.

/Bruce


RE: [PATCH v11 2/4] net/i40e: implement mbufs recycle mode

2023-09-04 Thread Konstantin Ananyev



> > > > > > > > Define specific function implementation for i40e driver.
> > > > > > > > Currently, mbufs recycle mode can support 128bit vector path
> > > > > > > > and
> > > > > > > > avx2
> > > > > > path.
> > > > > > > > And can be enabled both in fast free and no fast free mode.
> > > > > > > >
> > > > > > > > Suggested-by: Honnappa Nagarahalli
> > > > > > > > 
> > > > > > > > Signed-off-by: Feifei Wang 
> > > > > > > > Reviewed-by: Ruifeng Wang 
> > > > > > > > Reviewed-by: Honnappa Nagarahalli
> > > > 
> > > > > > > > ---
> > > > > > > >  drivers/net/i40e/i40e_ethdev.c|   1 +
> > > > > > > >  drivers/net/i40e/i40e_ethdev.h|   2 +
> > > > > > > >  .../net/i40e/i40e_recycle_mbufs_vec_common.c  | 147
> > > > > > > > ++
> > > > > > > >  drivers/net/i40e/i40e_rxtx.c  |  32 
> > > > > > > >  drivers/net/i40e/i40e_rxtx.h  |   4 +
> > > > > > > >  drivers/net/i40e/meson.build  |   1 +
> > > > > > > >  6 files changed, 187 insertions(+)  create mode 100644
> > > > > > > > drivers/net/i40e/i40e_recycle_mbufs_vec_common.c
> > > > > > > >
> > > > > > > > diff --git a/drivers/net/i40e/i40e_ethdev.c
> > > > > > > > b/drivers/net/i40e/i40e_ethdev.c index
> > > > > > > > 8271bbb394..50ba9aac94
> > > > > > > > 100644
> > > > > > > > --- a/drivers/net/i40e/i40e_ethdev.c
> > > > > > > > +++ b/drivers/net/i40e/i40e_ethdev.c
> > > > > > > > @@ -496,6 +496,7 @@ static const struct eth_dev_ops
> > > > > > > > i40e_eth_dev_ops
> > > > > > = {
> > > > > > > > .flow_ops_get = i40e_dev_flow_ops_get,
> > > > > > > > .rxq_info_get = i40e_rxq_info_get,
> > > > > > > > .txq_info_get = i40e_txq_info_get,
> > > > > > > > +   .recycle_rxq_info_get = 
> > > > > > > > i40e_recycle_rxq_info_get,
> > > > > > > > .rx_burst_mode_get= i40e_rx_burst_mode_get,
> > > > > > > > .tx_burst_mode_get= i40e_tx_burst_mode_get,
> > > > > > > > .timesync_enable  = i40e_timesync_enable,
> > > > > > > > diff --git a/drivers/net/i40e/i40e_ethdev.h
> > > > > > > > b/drivers/net/i40e/i40e_ethdev.h index
> > > > > > > > 6f65d5e0ac..af758798e1
> > > > > > > > 100644
> > > > > > > > --- a/drivers/net/i40e/i40e_ethdev.h
> > > > > > > > +++ b/drivers/net/i40e/i40e_ethdev.h
> > > > > > > > @@ -1355,6 +1355,8 @@ void i40e_rxq_info_get(struct
> > > > > > > > rte_eth_dev *dev, uint16_t queue_id,
> > > > > > > > struct rte_eth_rxq_info *qinfo);  void
> > > > > > > > i40e_txq_info_get(struct rte_eth_dev *dev, uint16_t queue_id,
> > > > > > > > struct rte_eth_txq_info *qinfo);
> > > > > > > > +void i40e_recycle_rxq_info_get(struct rte_eth_dev *dev,
> > > > > > > > +uint16_t
> > > > > > queue_id,
> > > > > > > > +   struct rte_eth_recycle_rxq_info *recycle_rxq_info);
> > > > > > > >  int i40e_rx_burst_mode_get(struct rte_eth_dev *dev,
> > > > > > > > uint16_t
> > > > queue_id,
> > > > > > > >struct rte_eth_burst_mode *mode);  
> > > > > > > > int
> > > > > > > > i40e_tx_burst_mode_get(struct rte_eth_dev *dev, uint16_t
> > > > > > > > queue_id, diff -- git
> > > > > > > > a/drivers/net/i40e/i40e_recycle_mbufs_vec_common.c
> > > > > > > > b/drivers/net/i40e/i40e_recycle_mbufs_vec_common.c
> > > > > > > > new file mode 100644
> > > > > > > > index 00..5663ecccde
> > > > > > > > --- /dev/null
> > > > > > > > +++ b/drivers/net/i40e/i40e_recycle_mbufs_vec_common.c
> > > > > > > > @@ -0,0 +1,147 @@
> > > > > > > > +/* SPDX-License-Identifier: BSD-3-Clause
> > > > > > > > + * Copyright (c) 2023 Arm Limited.
> > > > > > > > + */
> > > > > > > > +
> > > > > > > > +#include 
> > > > > > > > +#include 
> > > > > > > > +
> > > > > > > > +#include "base/i40e_prototype.h"
> > > > > > > > +#include "base/i40e_type.h"
> > > > > > > > +#include "i40e_ethdev.h"
> > > > > > > > +#include "i40e_rxtx.h"
> > > > > > > > +
> > > > > > > > +#pragma GCC diagnostic ignored "-Wcast-qual"
> > > > > > > > +
> > > > > > > > +void
> > > > > > > > +i40e_recycle_rx_descriptors_refill_vec(void *rx_queue,
> > > > > > > > +uint16_t
> > > > > > > > +nb_mbufs) {
> > > > > > > > +   struct i40e_rx_queue *rxq = rx_queue;
> > > > > > > > +   struct i40e_rx_entry *rxep;
> > > > > > > > +   volatile union i40e_rx_desc *rxdp;
> > > > > > > > +   uint16_t rx_id;
> > > > > > > > +   uint64_t paddr;
> > > > > > > > +   uint64_t dma_addr;
> > > > > > > > +   uint16_t i;
> > > > > > > > +
> > > > > > > > +   rxdp = rxq->rx_ring + rxq->rxrearm_start;
> > > > > > > > +   rxep = &rxq->sw_ring[rxq->rxrearm_start];
> > > > > > > > +
> > > > > > > > +   for (i = 0; i < nb_mbufs; i++) {
> > > > > > > > +   /* Initialize rxdp descs. */
> > > > > > > > +   paddr = (rxep[i].mbuf)->buf_iova +
> > > > > > > > RTE_PKTMBUF_HEADROOM;
> > > > > > > > +   dma_add

[dpdk-dev] [PATCH 1/2] common/cnxk: limit RSS key config with RTE Flow operations

2023-09-04 Thread psatheesh
From: Kiran Kumar K 

Limit configuring the RSS key with RTE flow operations for cnxk
devices. The key can be updated only with dev operations, using
rte_eth_dev_rss_hash_update.
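
For reference, a minimal sketch of the supported dev-ops path (standard
ethdev API; the hash-field selection is illustrative only):

    #include <rte_ethdev.h>

    struct rte_eth_rss_conf rss_conf = {
            .rss_key = key,           /* ROC_NIX_RSS_KEY_LEN bytes */
            .rss_key_len = key_len,
            .rss_hf = RTE_ETH_RSS_IP, /* illustrative hash fields */
    };

    /* The only supported way to change the key on cnxk; a key passed
     * via an rte_flow RSS action must match the configured one. */
    ret = rte_eth_dev_rss_hash_update(port_id, &rss_conf);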

Signed-off-by: Kiran Kumar K 
Reviewed-by: Jerin Jacob 
---
 drivers/common/cnxk/roc_npc.c | 33 ++---
 1 file changed, 26 insertions(+), 7 deletions(-)

diff --git a/drivers/common/cnxk/roc_npc.c b/drivers/common/cnxk/roc_npc.c
index c0c168db76..0611a83bba 100644
--- a/drivers/common/cnxk/roc_npc.c
+++ b/drivers/common/cnxk/roc_npc.c
@@ -943,9 +943,35 @@ npc_rss_action_configure(struct roc_npc *roc_npc,
uint8_t key[ROC_NIX_RSS_KEY_LEN];
const uint8_t *key_ptr;
uint8_t flowkey_algx;
+   uint32_t key_len;
uint16_t *reta;
int rc;
 
+   roc_nix_rss_key_get(roc_nix, key);
+   if (rss->key == NULL) {
+   key_ptr = key;
+   } else {
+   key_len = rss->key_len;
+   if (key_len > ROC_NIX_RSS_KEY_LEN)
+   key_len = ROC_NIX_RSS_KEY_LEN;
+
+   for (i = 0; i < key_len; i++) {
+   if (key[i] != rss->key[i]) {
+   plt_err("RSS key config not supported");
+   plt_err("New Key:");
+   for (i = 0; i < key_len; i++)
+   plt_dump_no_nl("0x%.2x ", rss->key[i]);
+   plt_dump_no_nl("\n");
+   plt_err("Configured Key:");
+   for (i = 0; i < ROC_NIX_RSS_KEY_LEN; i++)
+   plt_dump_no_nl("0x%.2x ", key[i]);
+   plt_dump_no_nl("\n");
+   return -ENOTSUP;
+   }
+   }
+   key_ptr = rss->key;
+   }
+
rc = npc_rss_free_grp_get(npc, &rss_grp_idx);
/* RSS group :0 is not usable for flow rss action */
if (rc < 0 || rss_grp_idx == 0)
@@ -960,13 +986,6 @@ npc_rss_action_configure(struct roc_npc *roc_npc,
 
*rss_grp = rss_grp_idx;
 
-   if (rss->key == NULL) {
-   roc_nix_rss_key_default_fill(roc_nix, key);
-   key_ptr = key;
-   } else {
-   key_ptr = rss->key;
-   }
-
roc_nix_rss_key_set(roc_nix, key_ptr);
 
/* If queue count passed in the rss action is less than
-- 
2.39.2



[dpdk-dev] [PATCH 2/2] drivers: add represented port flow item for cnxk

2023-09-04 Thread psatheesh
From: Kiran Kumar K 

Adding support for represented port flow item for cnxk device.

Signed-off-by: Kiran Kumar K 
Reviewed-by: Satheesh Paul 
---
 doc/guides/nics/features/cnxk.ini|   1 +
 doc/guides/nics/features/cnxk_vf.ini |   1 +
 drivers/common/cnxk/roc_npc.c|  59 ---
 drivers/common/cnxk/roc_npc.h|  13 +++-
 drivers/common/cnxk/roc_npc_mcam.c   |  72 +-
 drivers/common/cnxk/roc_npc_parse.c  |  11 +++
 drivers/common/cnxk/roc_npc_priv.h   |   1 +
 drivers/net/cnxk/cnxk_flow.c | 106 ++-
 8 files changed, 166 insertions(+), 98 deletions(-)

diff --git a/doc/guides/nics/features/cnxk.ini 
b/doc/guides/nics/features/cnxk.ini
index ac7de9a0f0..1c7d804aab 100644
--- a/doc/guides/nics/features/cnxk.ini
+++ b/doc/guides/nics/features/cnxk.ini
@@ -72,6 +72,7 @@ mark = Y
 mpls = Y
 nvgre= Y
 raw  = Y
+represented_port = Y
 sctp = Y
 tcp  = Y
 tx_queue = Y
diff --git a/doc/guides/nics/features/cnxk_vf.ini 
b/doc/guides/nics/features/cnxk_vf.ini
index b03e8b35c3..96bfd86ac8 100644
--- a/doc/guides/nics/features/cnxk_vf.ini
+++ b/doc/guides/nics/features/cnxk_vf.ini
@@ -63,6 +63,7 @@ mark = Y
 mpls = Y
 nvgre= Y
 raw  = Y
+represented_port = Y
 sctp = Y
 tcp  = Y
 tx_queue = Y
diff --git a/drivers/common/cnxk/roc_npc.c b/drivers/common/cnxk/roc_npc.c
index 0611a83bba..0d7253bf1a 100644
--- a/drivers/common/cnxk/roc_npc.c
+++ b/drivers/common/cnxk/roc_npc.c
@@ -499,6 +499,8 @@ npc_parse_actions(struct roc_npc *roc_npc, const struct 
roc_npc_attr *attr,
flow->ctr_id = NPC_COUNTER_NONE;
flow->mtr_id = ROC_NIX_MTR_ID_INVALID;
pf_func = npc->pf_func;
+   if (flow->has_rep)
+   pf_func = flow->rep_pf_func;
 
for (; actions->type != ROC_NPC_ACTION_TYPE_END; actions++) {
switch (actions->type) {
@@ -794,10 +796,14 @@ npc_parse_pattern(struct npc *npc, const struct 
roc_npc_item_info pattern[],
  struct roc_npc_flow *flow, struct npc_parse_state *pst)
 {
npc_parse_stage_func_t parse_stage_funcs[] = {
-   npc_parse_meta_items, npc_parse_mark_item, npc_parse_pre_l2, 
npc_parse_cpt_hdr,
-   npc_parse_higig2_hdr, npc_parse_tx_queue,  npc_parse_la, 
npc_parse_lb,
-   npc_parse_lc, npc_parse_ld,npc_parse_le, 
npc_parse_lf,
-   npc_parse_lg, npc_parse_lh,
+   npc_parse_meta_items, npc_parse_port_representor_id,
+   npc_parse_mark_item,  npc_parse_pre_l2,
+   npc_parse_cpt_hdr,npc_parse_higig2_hdr,
+   npc_parse_tx_queue,   npc_parse_la,
+   npc_parse_lb, npc_parse_lc,
+   npc_parse_ld, npc_parse_le,
+   npc_parse_lf, npc_parse_lg,
+   npc_parse_lh,
};
uint8_t layer = 0;
int key_offset;
@@ -1036,15 +1042,18 @@ npc_rss_action_program(struct roc_npc *roc_npc,
   struct roc_npc_flow *flow)
 {
const struct roc_npc_action_rss *rss;
+   struct roc_npc *npc = roc_npc;
uint32_t rss_grp;
uint8_t alg_idx;
int rc;
 
+   if (flow->has_rep)
+   npc = roc_npc->rep_npc;
+
for (; actions->type != ROC_NPC_ACTION_TYPE_END; actions++) {
if (actions->type == ROC_NPC_ACTION_TYPE_RSS) {
rss = (const struct roc_npc_action_rss *)actions->conf;
-   rc = npc_rss_action_configure(roc_npc, rss, &alg_idx,
- &rss_grp, flow->mcam_id);
+   rc = npc_rss_action_configure(npc, rss, &alg_idx, 
&rss_grp, flow->mcam_id);
if (rc)
return rc;
 
@@ -1067,7 +1076,7 @@ npc_vtag_cfg_delete(struct roc_npc *roc_npc, struct 
roc_npc_flow *flow)
struct roc_nix *roc_nix = roc_npc->roc_nix;
struct nix_vtag_config *vtag_cfg;
struct nix_vtag_config_rsp *rsp;
-   struct mbox *mbox;
+   struct mbox *mbox, *ombox;
struct nix *nix;
int rc = 0;
 
@@ -1077,7 +1086,10 @@ npc_vtag_cfg_delete(struct roc_npc *roc_npc, struct 
roc_npc_flow *flow)
} tx_vtag_action;
 
nix = roc_nix_to_nix_priv(roc_nix);
-   mbox = mbox_get((&nix->dev)->mbox);
+   ombox = (&nix->dev)->mbox;
+   if (flow->has_rep)
+   ombox = flow->rep_mbox;
+   mbox = mbox_get(ombox);
 
tx_vtag_action.reg = flow->vtag_action;
vtag_cfg = mbox_alloc_msg_nix_vtag_cfg(mbox);
@@ -1328,6 +1340,8 @@ npc_vtag_action_program(struct roc_npc *roc_npc,
 
nix = roc_nix_to_nix_priv(roc_nix);
mbox = (&nix->dev)->mbox;
+   if (flow->has_rep)
+

Re: [PATCH] eal: add cache guard to per-lcore PRNG state

2023-09-04 Thread Mattias Rönnblom

On 2023-09-04 11:26, Morten Brørup wrote:

The per-lcore random state is frequently updated by their individual
lcores, so add a cache guard to prevent CPU cache thrashing.



"to prevent false sharing in case the CPU employs a next-N-lines (or 
similar) hardware prefetcher"


In my world, cache thrashing and cache line contention are two different
things.


Other than that,
Acked-by: Mattias Rönnblom 


Depends-on: series-29415 ("clarify purpose of empty cache lines")

Signed-off-by: Morten Brørup 
---
  lib/eal/common/rte_random.c | 1 +
  1 file changed, 1 insertion(+)

diff --git a/lib/eal/common/rte_random.c b/lib/eal/common/rte_random.c
index 565f2401ce..3df0c7004a 100644
--- a/lib/eal/common/rte_random.c
+++ b/lib/eal/common/rte_random.c
@@ -18,6 +18,7 @@ struct rte_rand_state {
uint64_t z3;
uint64_t z4;
uint64_t z5;
+   RTE_CACHE_GUARD;
  } __rte_cache_aligned;
  
  /* One instance each for every lcore id-equipped thread, and one


Re: [RFC] cache guard

2023-09-04 Thread Mattias Rönnblom

On 2023-09-01 20:52, Morten Brørup wrote:

From: Mattias Rönnblom [mailto:hof...@lysator.liu.se]
Sent: Friday, 1 September 2023 18.58

On 2023-09-01 14:26, Thomas Monjalon wrote:

27/08/2023 10:34, Morten Brørup:

+CC Honnappa and Konstantin, Ring lib maintainers
+CC Mattias, PRNG lib maintainer


From: Bruce Richardson [mailto:bruce.richard...@intel.com]
Sent: Friday, 25 August 2023 11.24

On Fri, Aug 25, 2023 at 11:06:01AM +0200, Morten Brørup wrote:

+CC mempool maintainers


From: Bruce Richardson [mailto:bruce.richard...@intel.com]
Sent: Friday, 25 August 2023 10.23

On Fri, Aug 25, 2023 at 08:45:12AM +0200, Morten Brørup wrote:

Bruce,

With this patch [1], it is noted that the ring producer and consumer data
should not be on adjacent cache lines, for performance reasons.


[1]:
https://git.dpdk.org/dpdk/commit/lib/librte_ring/rte_ring.h?id=d9f0d3a1ffd4b66e75485cc8b63b9aedfbdfe8b0


(It's obvious that they cannot share the same cache line, because they are
accessed by two different threads.)


Intuitively, I would think that having them on different cache lines would
suffice. Why does having an empty cache line between them make a difference?


And does it need to be an empty cache line? Or does it suffice having the
second structure start at two cache lines after the start of the first
structure (e.g. if the size of the first structure is two cache lines)?


I'm asking because the same principle might apply to other code too.



Hi Morten,

this was something we discovered when working on the distributor library.

If we have cachelines per core where there is heavy access, having some
cachelines as a gap between the content cachelines can help performance. We
believe this helps due to avoiding issues with the HW prefetchers (e.g.
adjacent cacheline prefetcher) bringing in the second cacheline
speculatively when an operation is done on the first line.


I guessed that it had something to do with speculative prefetching, but
wasn't sure. Good to get confirmation, and that it has a measurable
effect somewhere. Very interesting!


NB: More comments in the ring lib about stuff like this would be nice.


So, for the mempool lib, what do you think about applying the same
technique to the rte_mempool_debug_stats structure (which is an array
indexed per lcore)... Two adjacent lcores heavily accessing their local
mempool caches seems likely to me. But how heavy does the access need to
be for this technique to be relevant?




No idea how heavy the accesses need to be for this to have a noticeable
effect. For things like debug stats, I wonder how worthwhile making such a
change would be, but then again, any change would have very low impact too
in that case.


I just tried adding padding to some of the hot structures in our own
application, and observed a significant performance improvement for those.


So I think this technique should have higher visibility in DPDK by
adding a new cache macro to rte_common.h:


+1 to make more visibility in doc and adding a macro, good idea!
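
(The macro definition itself is elided in this digest; a sketch of one
possible shape, assumed rather than quoted from the actual series:)

    /* Pad out RTE_CACHE_GUARD_LINES cache lines, so a next-N-lines HW
     * prefetcher never pulls a neighbour's hot line in with ours.
     * RTE_CACHE_GUARD_LINES would be a build-time constant. */
    #define _RTE_CACHE_GUARD_HELPER2(unique) \
            unsigned char cache_guard_ ## unique \
            [RTE_CACHE_LINE_SIZE * RTE_CACHE_GUARD_LINES] \
            __rte_cache_aligned
    #define _RTE_CACHE_GUARD_HELPER1(unique) _RTE_CACHE_GUARD_HELPER2(unique)
    #define RTE_CACHE_GUARD _RTE_CACHE_GUARD_HELPER1(__COUNTER__)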





A worry I have is that for CPUs with large (in this context) N, you will
end up with a lot of padding to avoid next-N-lines false sharing. That
would be padding after, and in the general (non-array) case also before,
the actual per-lcore data. A slight nuisance is also that those
prefetched lines of padding, will never contain anything useful, and
thus fetching them will always be a waste.


Out of curiosity, what is the largest N anyone here on the list is aware of?



Padding/alignment may not be the only way to avoid HW-prefetcher-induced
false sharing for per-lcore data structures.

What we are discussing here is organizing the statically allocated
per-lcore structs of a particular module in an array with the
appropriate padding/alignment. In this model, all data related to a
particular module is close (memory address/page-wise), but not so close
to cause false sharing.

/* rte_a.c */

struct rte_a_state
{
int x;
  RTE_CACHE_GUARD;
} __rte_cache_aligned;

static struct rte_a_state a_states[RTE_MAX_LCORE];

/* rte_b.c */

struct rte_b_state
{
char y;
  char z;
  RTE_CACHE_GUARD;
} __rte_cache_aligned;


static struct rte_b_state b_states[RTE_MAX_LCORE];

What you would end up with in runtime when the linker has done its job
is something that essentially looks like this (in memory):

struct {
struct rte_a_state a_states[RTE_MAX_LCORE];
struct rte_b_state b_states[RTE_MAX_LCORE];
};

You could consider turning it around, and keeping data (i.e., module
structs) related to a particular lcore, for all modules, close. In other
words, keeping a per-lcore arrays of variable-sized elements.

So, something that will end up looking like this (in memory, not in the
source code):

struct rte_lcore_state
{
struct rte_a_state a_state;
struct rte_b_state b_state;
  RTE_CACHE_GUARD;
};

struct rte_lcore_state lcore_state

RE: [RFC] cache guard

2023-09-04 Thread Morten Brørup
> From: Mattias Rönnblom [mailto:hof...@lysator.liu.se]
> Sent: Monday, 4 September 2023 14.07
> 
> On 2023-09-01 20:52, Morten Brørup wrote:
> >> From: Mattias Rönnblom [mailto:hof...@lysator.liu.se]
> >> Sent: Friday, 1 September 2023 18.58
> >>
> >> On 2023-09-01 14:26, Thomas Monjalon wrote:
> >>> 27/08/2023 10:34, Morten Brørup:
>  +CC Honnappa and Konstantin, Ring lib maintainers
>  +CC Mattias, PRNG lib maintainer
> 
> > From: Bruce Richardson [mailto:bruce.richard...@intel.com]
> > Sent: Friday, 25 August 2023 11.24
> >
> > On Fri, Aug 25, 2023 at 11:06:01AM +0200, Morten Brørup wrote:
> >> +CC mempool maintainers
> >>
> >>> From: Bruce Richardson [mailto:bruce.richard...@intel.com]
> >>> Sent: Friday, 25 August 2023 10.23
> >>>
> >>> On Fri, Aug 25, 2023 at 08:45:12AM +0200, Morten Brørup wrote:
>  Bruce,
> 
>  With this patch [1], it is noted that the ring producer and
> > consumer data
> >>> should not be on adjacent cache lines, for performance reasons.
> 
>  [1]:
> >>>
> >
> >>
> https://git.dpdk.org/dpdk/commit/lib/librte_ring/rte_ring.h?id=d9f0d3a1f
> > fd4b66
> >>> e75485cc8b63b9aedfbdfe8b0
> 
>  (It's obvious that they cannot share the same cache line,
> because
> > they are
> >>> accessed by two different threads.)
> 
>  Intuitively, I would think that having them on different cache
> > lines would
> >>> suffice. Why does having an empty cache line between them make a
> > difference?
> 
>  And does it need to be an empty cache line? Or does it suffice
> > having the
> >>> second structure start at two cache lines after the start of the
> > first
> >>> structure (e.g. if the size of the first structure is two cache
> > lines)?
> 
>  I'm asking because the same principle might apply to other code
> > too.
> 
> >>> Hi Morten,
> >>>
> >>> this was something we discovered when working on the distributor
> > library.
> >>> If we have cachelines per core where there is heavy access,
> having
> > some
> >>> cachelines as a gap between the content cachelines can help
> > performance. We
> >>> believe this helps due to avoiding issues with the HW
> prefetchers
> > (e.g.
> >>> adjacent cacheline prefetcher) bringing in the second cacheline
> >>> speculatively when an operation is done on the first line.
> >>
> >> I guessed that it had something to do with speculative
> prefetching,
> > but wasn't sure. Good to get confirmation, and that it has a
> >> measureable
> > effect somewhere. Very interesting!
> >>
> >> NB: More comments in the ring lib about stuff like this would be
> >> nice.
> >>
> >> So, for the mempool lib, what do you think about applying the
> same
> > technique to the rte_mempool_debug_stats structure (which is an
> >> array
> > indexed per lcore)... Two adjacent lcores heavily accessing their
> >> local
> > mempool caches seems likely to me. But how heavy does the access
> >> need to
> > be for this technique to be relevant?
> >>
> >
> > No idea how heavy the accesses need to be for this to have a
> >> noticable
> > effect. For things like debug stats, I wonder how worthwhile
> making
> >> such
> > a
> > change would be, but then again, any change would have very low
> >> impact
> > too
> > in that case.
> 
>  I just tried adding padding to some of the hot structures in our
> own
> >> application, and observed a significant performance improvement for
> >> those.
> 
>  So I think this technique should have higher visibility in DPDK by
> >> adding a new cache macro to rte_common.h:
> >>>
> >>> +1 to make more visibility in doc and adding a macro, good idea!
> >>>
> >>>
> >>>
> >>
> >> A worry I have is that for CPUs with large (in this context) N, you
> will
> >> end up with a lot of padding to avoid next-N-lines false sharing.
> That
> >> would be padding after, and in the general (non-array) case also
> before,
> >> the actual per-lcore data. A slight nuisance is also that those
> >> prefetched lines of padding, will never contain anything useful, and
> >> thus fetching them will always be a waste.
> >
> > Out of curiosity, what is the largest N anyone here on the list is
> aware of?
> >
> >>
> >> Padding/alignment may not be the only way to avoid HW-prefetcher-
> induced
> >> false sharing for per-lcore data structures.
> >>
> >> What we are discussing here is organizing the statically allocated
> >> per-lcore structs of a particular module in an array with the
> >> appropriate padding/alignment. In this model, all data related to a
> >> particular module is close (memory address/page-wise), but not so
> close
> >> to cause false sharing.
> >>
> >> /* rte_a.c */
> >>
> >> struct rte_a_state
> >> {
> >>int x;

Re: ring name size

2023-09-04 Thread Thomas Monjalon
04/09/2023 11:33, Morten Brørup:
> > From: Thomas Monjalon [mailto:tho...@monjalon.net]
> > Sent: Monday, 4 September 2023 11.31
> > 
> > Hello,
> > 
> > When creating a ring, we need to know the maximum length of the name.
> > It seems this value is not documented.
> > And there is no constant defined.
> 
> It is defined (with a description) here:
> https://elixir.bootlin.com/dpdk/latest/source/lib/ring/rte_ring_core.h#L52

Indeed, thank you Morten. I missed it when looking simply at rte_ring.h.
So we are just missing a mention of it in the doxygen of the related functions.

> > There is only RTE_MEMZONE_NAMESIZE.
> > 
> > Should we document the maximum length as 31 and add a constant?
> > #define RTE_RING_NAME_SIZE RTE_MEMZONE_NAMESIZE
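
Concretely, the missing doxygen note could look like this (wording
assumed), for example on the name parameter of rte_ring_create():

     * @param name
     *   The name of the ring (at most RTE_RING_NAMESIZE characters,
     *   including the terminating NUL; see rte_ring_core.h).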





[PATCH v3 0/3] Add dispatcher library

2023-09-04 Thread Mattias Rönnblom
The purpose of the dispatcher library is to decouple different parts
of an eventdev-based application (e.g., processing pipeline stages),
sharing the same underlying event device.

The dispatcher replaces the conditional logic (often, a switch
statement) that typically follows an event device dequeue operation,
where events are dispatched to different parts of the application
based on event meta data, such as the queue id or scheduling type.

The concept is similar to a UNIX file descriptor event loop library.
Instead of tying callback functions to fds as for example libevent
does, the dispatcher relies on application-supplied matching callback
functions to decide where to deliver events.

A dispatcher is configured to dequeue events from a specific event
device, and ties into the service core framework, to do its (and the
application's) work.

The dispatcher provides a convenient way for an eventdev-based
application to use service cores for application-level processing, and
thus for sharing those cores with other DPDK services.

Although the dispatcher adds some overhead, experience suggests that
the net effect on the application (both synthetic benchmarks and more
real-world applications) may well be positive. This is primarily due
to clustering (see programming guide) reducing cache misses.

Benchmarking indicates that the overhead is ~10 cc/event (on a
large core), with a handful of often-used handlers.

The dispatcher does not support run-time reconfiguration.

The use of the dispatcher library is optional, and an eventdev-based
application may still opt to access the event device using direct
eventdev API calls, or by some other means.
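
As an illustration of the service core tie-in, a sketch of binding the
dispatcher's service to a service lcore (the service id accessor name is
assumed from this v3; the rte_service_* calls are the standard
service-core API):

    uint32_t service_id;

    /* assumed v3 accessor for the dispatcher's service id */
    rte_dispatcher_service_id_get(dispatcher_id, &service_id);

    rte_service_lcore_add(lcore_id);
    rte_service_map_lcore_set(service_id, lcore_id, 1);
    rte_service_runstate_set(service_id, 1);
    rte_service_lcore_start(lcore_id);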

Mattias Rönnblom (3):
  lib: introduce dispatcher library
  test: add dispatcher test suite
  doc: add dispatcher programming guide

 MAINTAINERS  |5 +
 app/test/meson.build |1 +
 app/test/test_dispatcher.c   | 1054 ++
 doc/api/doxy-api-index.md|1 +
 doc/api/doxy-api.conf.in |1 +
 doc/guides/prog_guide/dispatcher_lib.rst |  434 +
 doc/guides/prog_guide/index.rst  |1 +
 lib/dispatcher/meson.build   |   17 +
 lib/dispatcher/rte_dispatcher.c  |  791 
 lib/dispatcher/rte_dispatcher.h  |  480 ++
 lib/dispatcher/version.map   |   20 +
 lib/meson.build  |2 +
 12 files changed, 2807 insertions(+)
 create mode 100644 app/test/test_dispatcher.c
 create mode 100644 doc/guides/prog_guide/dispatcher_lib.rst
 create mode 100644 lib/dispatcher/meson.build
 create mode 100644 lib/dispatcher/rte_dispatcher.c
 create mode 100644 lib/dispatcher/rte_dispatcher.h
 create mode 100644 lib/dispatcher/version.map

-- 
2.34.1



[PATCH v3 1/3] lib: introduce dispatcher library

2023-09-04 Thread Mattias Rönnblom
The purpose of the dispatcher library is to help reduce coupling in an
Eventdev-based DPDK application.

In addition, the dispatcher also provides a convenient and flexible
way for the application to use service cores for application-level
processing.

Signed-off-by: Mattias Rönnblom 
Tested-by: Peter Nilsson 
Reviewed-by: Heng Wang 

--

PATCH v3:
 o To underline its optional character and since it does not provide
   hardware abstraction, the event dispatcher is now a separate
   library.
 o Change name from rte_event_dispatcher -> rte_dispatcher, to make it
   shorter and to avoid the rte_event_* namespace.

PATCH v2:
 o Add dequeue batch count statistic.
 o Add statistics reset function to API.
 o Clarify MT safety guarantees (or lack thereof) in the API documentation.
 o Change loop variable type in evd_lcore_get_handler_by_id() to uint16_t,
   to be consistent with similar loops elsewhere in the dispatcher.
 o Fix variable names in finalizer unregister function.

PATCH:
 o Change prefix from RED to EVD, to avoid confusion with random
   early detection.

RFC v4:
 o Move handlers to per-lcore data structures.
 o Introduce mechanism which rearranges handlers so that often-used
   handlers tend to be tried first.
 o Terminate dispatch loop in case all events are delivered.
 o To avoid the dispatcher's service function hogging the CPU, process
   only one batch per call.
 o Have service function return -EAGAIN if no work is performed.
 o Events delivered in the process function is no longer marked 'const',
   since modifying them may be useful for the application and cause
   no difficulties for the dispatcher.
 o Various minor API documentation improvements.

RFC v3:
 o Add stats_get() function to the version.map file.
---
 MAINTAINERS |   3 +
 lib/dispatcher/meson.build  |  17 +
 lib/dispatcher/rte_dispatcher.c | 791 
 lib/dispatcher/rte_dispatcher.h | 480 +++
 lib/dispatcher/version.map  |  20 +
 lib/meson.build |   2 +
 6 files changed, 1313 insertions(+)
 create mode 100644 lib/dispatcher/meson.build
 create mode 100644 lib/dispatcher/rte_dispatcher.c
 create mode 100644 lib/dispatcher/rte_dispatcher.h
 create mode 100644 lib/dispatcher/version.map

diff --git a/MAINTAINERS b/MAINTAINERS
index a926155f26..6704cd5b2c 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1726,6 +1726,9 @@ M: Nithin Dabilpuram 
 M: Pavan Nikhilesh 
 F: lib/node/
 
+Dispatcher - EXPERIMENTAL
+M: Mattias Rönnblom 
+F: lib/dispatcher/
 
 Test Applications
 -
diff --git a/lib/dispatcher/meson.build b/lib/dispatcher/meson.build
new file mode 100644
index 00..c6054a3a5d
--- /dev/null
+++ b/lib/dispatcher/meson.build
@@ -0,0 +1,17 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2023 Ericsson AB
+
+if is_windows
+build = false
+reason = 'not supported on Windows'
+subdir_done()
+endif
+
+sources = files(
+'rte_dispatcher.c',
+)
+headers = files(
+'rte_dispatcher.h',
+)
+
+deps += ['eventdev']
diff --git a/lib/dispatcher/rte_dispatcher.c b/lib/dispatcher/rte_dispatcher.c
new file mode 100644
index 00..3319fe09f2
--- /dev/null
+++ b/lib/dispatcher/rte_dispatcher.c
@@ -0,0 +1,791 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Ericsson AB
+ */
+
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "eventdev_pmd.h"
+
+#include 
+
+#define EVD_MAX_PORTS_PER_LCORE 4
+#define EVD_MAX_HANDLERS 32
+#define EVD_MAX_FINALIZERS 16
+#define EVD_AVG_PRIO_INTERVAL 2000
+
+struct rte_dispatcher_lcore_port {
+   uint8_t port_id;
+   uint16_t batch_size;
+   uint64_t timeout;
+};
+
+struct rte_dispatcher_handler {
+   int id;
+   rte_dispatcher_match_t match_fun;
+   void *match_data;
+   rte_dispatcher_process_t process_fun;
+   void *process_data;
+};
+
+struct rte_dispatcher_finalizer {
+   int id;
+   rte_dispatcher_finalize_t finalize_fun;
+   void *finalize_data;
+};
+
+struct rte_dispatcher_lcore {
+   uint8_t num_ports;
+   uint16_t num_handlers;
+   int32_t prio_count;
+   struct rte_dispatcher_lcore_port ports[EVD_MAX_PORTS_PER_LCORE];
+   struct rte_dispatcher_handler handlers[EVD_MAX_HANDLERS];
+   struct rte_dispatcher_stats stats;
+} __rte_cache_aligned;
+
+struct rte_dispatcher {
+   uint8_t id;
+   uint8_t event_dev_id;
+   int socket_id;
+   uint32_t service_id;
+   struct rte_dispatcher_lcore lcores[RTE_MAX_LCORE];
+   uint16_t num_finalizers;
+   struct rte_dispatcher_finalizer finalizers[EVD_MAX_FINALIZERS];
+};
+
+static struct rte_dispatcher *dispatchers[UINT8_MAX];
+
+static bool
+evd_has_dispatcher(uint8_t id)
+{
+   return dispatchers[id] != NULL;
+}
+
+static struct rte_dispatcher *
+evd_get_dispatcher(uint8_t id)
+{
+   return dispatchers[id];
+}
+
+static void
+evd_set_dispatcher(uint8_t id, struct rte_di

[PATCH v3 2/3] test: add dispatcher test suite

2023-09-04 Thread Mattias Rönnblom
Add unit tests for the dispatcher.

--
PATCH v3:
 o Adapt the test suite to dispatcher API name changes.

PATCH v2:
 o Test finalize callback functionality.
 o Test handler and finalizer count upper limits.
 o Add statistics reset test.
 o Make sure dispatcher supply the proper event dev id and port id back
   to the application.

PATCH:
 o Extend test to cover often-used handler optimization feature.

RFC v4:
 o Adapt to non-const events in process function prototype.

Signed-off-by: Mattias Rönnblom 
---
 MAINTAINERS|1 +
 app/test/meson.build   |1 +
 app/test/test_dispatcher.c | 1054 
 3 files changed, 1056 insertions(+)
 create mode 100644 app/test/test_dispatcher.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 6704cd5b2c..43890cad0e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1729,6 +1729,7 @@ F: lib/node/
 Dispatcher - EXPERIMENTAL
 M: Mattias Rönnblom 
 F: lib/dispatcher/
+F: app/test/test_dispatcher.c
 
 Test Applications
 -
diff --git a/app/test/meson.build b/app/test/meson.build
index 05bae9216d..3303c73817 100644
--- a/app/test/meson.build
+++ b/app/test/meson.build
@@ -55,6 +55,7 @@ source_file_deps = {
 'test_cycles.c': [],
 'test_debug.c': [],
 'test_devargs.c': ['kvargs'],
+'test_dispatcher.c': ['dispatcher'],
 'test_distributor.c': ['distributor'],
 'test_distributor_perf.c': ['distributor'],
 'test_dmadev.c': ['dmadev', 'bus_vdev'],
diff --git a/app/test/test_dispatcher.c b/app/test/test_dispatcher.c
new file mode 100644
index 00..b64103c48e
--- /dev/null
+++ b/app/test/test_dispatcher.c
@@ -0,0 +1,1054 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Ericsson AB
+ */
+
+#include "test.h"
+
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define NUM_WORKERS 3
+
+#define NUM_PORTS (NUM_WORKERS + 1)
+#define WORKER_PORT_ID(worker_idx) (worker_idx)
+#define DRIVER_PORT_ID (NUM_PORTS - 1)
+
+#define NUM_SERVICE_CORES NUM_WORKERS
+
+/* Eventdev */
+#define NUM_QUEUES 8
+#define LAST_QUEUE_ID (NUM_QUEUES - 1)
+#define MAX_EVENTS 4096
+#define NEW_EVENT_THRESHOLD (MAX_EVENTS / 2)
+#define DEQUEUE_BURST_SIZE 32
+#define ENQUEUE_BURST_SIZE 32
+
+#define NUM_EVENTS 1000
+#define NUM_FLOWS 16
+
+#define DSW_VDEV "event_dsw0"
+
+struct app_queue {
+   uint8_t queue_id;
+   uint64_t sn[NUM_FLOWS];
+   int dispatcher_reg_id;
+};
+
+struct cb_count {
+   uint8_t expected_event_dev_id;
+   uint8_t expected_event_port_id[RTE_MAX_LCORE];
+   atomic_int count;
+};
+
+struct test_app {
+   uint8_t event_dev_id;
+   uint8_t dispatcher_id;
+   uint32_t dispatcher_service_id;
+
+   unsigned int service_lcores[NUM_SERVICE_CORES];
+
+   int never_match_reg_id;
+   uint64_t never_match_count;
+   struct cb_count never_process_count;
+
+   struct app_queue queues[NUM_QUEUES];
+
+   int finalize_reg_id;
+   struct cb_count finalize_count;
+
+   bool running;
+
+   atomic_int completed_events;
+   atomic_int errors;
+};
+
+#define RETURN_ON_ERROR(rc) \
+   do {\
+   if (rc != TEST_SUCCESS) \
+   return rc;  \
+   } while (0)
+
+static struct test_app *
+test_app_create(void)
+{
+   int i;
+   struct test_app *app;
+
+   app = calloc(1, sizeof(struct test_app));
+
+   if (app == NULL)
+   return NULL;
+
+   for (i = 0; i < NUM_QUEUES; i++)
+   app->queues[i].queue_id = i;
+
+   return app;
+}
+
+static void
+test_app_free(struct test_app *app)
+{
+   free(app);
+}
+
+static int
+test_app_create_vdev(struct test_app *app)
+{
+   int rc;
+
+   rc = rte_vdev_init(DSW_VDEV, NULL);
+   if (rc < 0)
+   return TEST_SKIPPED;
+
+   rc = rte_event_dev_get_dev_id(DSW_VDEV);
+
+   app->event_dev_id = (uint8_t)rc;
+
+   return TEST_SUCCESS;
+}
+
+static int
+test_app_destroy_vdev(struct test_app *app)
+{
+   int rc;
+
+   rc = rte_event_dev_close(app->event_dev_id);
+   TEST_ASSERT_SUCCESS(rc, "Error while closing event device");
+
+   rc = rte_vdev_uninit(DSW_VDEV);
+   TEST_ASSERT_SUCCESS(rc, "Error while uninitializing virtual device");
+
+   return TEST_SUCCESS;
+}
+
+static int
+test_app_setup_event_dev(struct test_app *app)
+{
+   int rc;
+   int i;
+
+   rc = test_app_create_vdev(app);
+   if (rc < 0)
+   return rc;
+
+   struct rte_event_dev_config config = {
+   .nb_event_queues = NUM_QUEUES,
+   .nb_event_ports = NUM_PORTS,
+   .nb_events_limit = MAX_EVENTS,
+   .nb_event_queue_flows = 64,
+   .nb_event_port_dequeue_depth = DEQUEUE_BURST_SIZE,
+   .nb_event_port_enqueue_depth = ENQUEUE_BURST_SIZE
+   };
+
+   rc = rte_event_dev_configure(app->event_dev_id, &config);
+
+

[PATCH v3 3/3] doc: add dispatcher programming guide

2023-09-04 Thread Mattias Rönnblom
Provide programming guide for the dispatcher library.

Signed-off-by: Mattias Rönnblom 

--
PATCH v3:
 o Adapt guide to the dispatcher API name changes.

PATCH:
 o Improve grammar and spelling.

RFC v4:
 o Extend event matching section of the programming guide.
 o Improve grammar and spelling.
---
 MAINTAINERS  |   1 +
 doc/api/doxy-api-index.md|   1 +
 doc/api/doxy-api.conf.in |   1 +
 doc/guides/prog_guide/dispatcher_lib.rst | 434 +++
 doc/guides/prog_guide/index.rst  |   1 +
 5 files changed, 438 insertions(+)
 create mode 100644 doc/guides/prog_guide/dispatcher_lib.rst

diff --git a/MAINTAINERS b/MAINTAINERS
index 43890cad0e..ab35498204 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1730,6 +1730,7 @@ Dispatcher - EXPERIMENTAL
 M: Mattias Rönnblom 
 F: lib/dispatcher/
 F: app/test/test_dispatcher.c
+F: doc/guides/prog_guide/dispatcher_lib.rst
 
 Test Applications
 -
diff --git a/doc/api/doxy-api-index.md b/doc/api/doxy-api-index.md
index fdeda13932..7d0cad9fed 100644
--- a/doc/api/doxy-api-index.md
+++ b/doc/api/doxy-api-index.md
@@ -155,6 +155,7 @@ The public API headers are grouped by topics:
 
 - **classification**
   [reorder](@ref rte_reorder.h),
+  [dispatcher](@ref rte_dispatcher.h),
   [distributor](@ref rte_distributor.h),
   [EFD](@ref rte_efd.h),
   [ACL](@ref rte_acl.h),
diff --git a/doc/api/doxy-api.conf.in b/doc/api/doxy-api.conf.in
index a88accd907..59c679e621 100644
--- a/doc/api/doxy-api.conf.in
+++ b/doc/api/doxy-api.conf.in
@@ -34,6 +34,7 @@ INPUT   = @TOPDIR@/doc/api/doxy-api-index.md \
   @TOPDIR@/lib/cmdline \
   @TOPDIR@/lib/compressdev \
   @TOPDIR@/lib/cryptodev \
+  @TOPDIR@/lib/dispatcher \
   @TOPDIR@/lib/distributor \
   @TOPDIR@/lib/dmadev \
   @TOPDIR@/lib/efd \
diff --git a/doc/guides/prog_guide/dispatcher_lib.rst 
b/doc/guides/prog_guide/dispatcher_lib.rst
new file mode 100644
index 00..d4f29ce7ba
--- /dev/null
+++ b/doc/guides/prog_guide/dispatcher_lib.rst
@@ -0,0 +1,434 @@
+..  SPDX-License-Identifier: BSD-3-Clause
+Copyright(c) 2023 Ericsson AB.
+
+Dispatcher
+==
+
+Overview
+
+
+The purpose of the dispatcher is to help reduce coupling in an
+:doc:`Eventdev `-based DPDK application.
+
+In particular, the dispatcher addresses a scenario where an
+application's modules share the same event device and event device
+ports, and perform work on the same lcore threads.
+
+The dispatcher replaces the conditional logic that follows an event
+device dequeue operation, where events are dispatched to different
+parts of the application, typically based on fields in the
+``rte_event``, such as the ``queue_id``, ``sub_event_type``, or
+``sched_type``.
+
+Below is an excerpt from a fictitious application consisting of two
+modules; A and B. In this example, event-to-module routing is based
+purely on queue id, where module A expects all events to a certain
+queue id, and module B two other queue ids. [#Mapping]_
+
+.. code-block:: c
+
+for (;;) {
+struct rte_event events[MAX_BURST];
+unsigned int n;
+
+n = rte_event_dequeue_burst(dev_id, port_id, events,
+   MAX_BURST, 0);
+
+for (i = 0; i < n; i++) {
+const struct rte_event *event = &events[i];
+
+switch (event->queue_id) {
+case MODULE_A_QUEUE_ID:
+module_a_process(event);
+break;
+case MODULE_B_STAGE_0_QUEUE_ID:
+module_b_process_stage_0(event);
+break;
+case MODULE_B_STAGE_1_QUEUE_ID:
+module_b_process_stage_1(event);
+break;
+}
+}
+}
+
+The issue this example attempts to illustrate is that the centralized
+conditional logic has knowledge of things that should be private to
+the modules. In other words, this pattern leads to a violation of
+module encapsulation.
+
+The shared conditional logic contains explicit knowledge about what
+events should go where. In case, for example, the
+``module_a_process()`` is broken into two processing stages — a
+module-internal affair — the shared conditional code must be updated
+to reflect this change.
+
+The centralized event routing code becomes an issue in larger
+applications, where modules are developed by different organizations.
+This pattern also makes module reuse across different applications more
+difficult. The part of the conditional logic relevant for a particular
+application may need to be duplicated across many module
+instantiations (e.g., applications and test setups)
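
(The excerpt ends here. For contrast, a sketch of the dispatcher-based
equivalent of the loop above; the match callback shape follows the
handler struct in patch 1/3, while the exact registration signature is
assumed:)

    static bool
    module_a_match(const struct rte_event *event, void *cb_data)
    {
            return event->queue_id == MODULE_A_QUEUE_ID;
    }

    /* Module A registers its own routing; the shared dequeue loop no
     * longer needs to know the module's queue ids. The process callback
     * module_a_process_events() is assumed to match
     * rte_dispatcher_process_t. */
    rte_dispatcher_register(dispatcher_id, module_a_match, NULL,
                            module_a_process_events, NULL);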

Re: [PATCH] usertools: suggest use of hwloc for new cpu

2023-09-04 Thread Thomas Monjalon
04/09/2023 12:20, Bruce Richardson:
> On Mon, Sep 04, 2023 at 11:11:20AM +0100, Ferruh Yigit wrote:
> > On 8/14/2023 10:25 AM, Konstantin Ananyev wrote:
> > > 
> > >> On Sun, Aug 13, 2023 at 08:52:01AM -0700, Stephen Hemminger wrote:
> > >>> On Sun, 13 Aug 2023 02:12:03 +
> > >>> "Varghese, Vipin"  wrote:
> > >>>
> > >
> > > On Sat, 12 Aug 2023 06:27:20 +0530
> > > Vipin Varghese  wrote:
> > >
> > >> Most modern processors now support NUMA partitioning based on
> > >> CPU-IO & Last Level Cache within the same socket.
> > >> As per the discussion on the mailing list, suggesting to make use of
> > >> hwloc for such scenarios.
> > >>
> > >> Signed-off-by: Vipin Varghese 
> > >
> > > NAK, no scripting hwloc, it is ugly and creates a dependency that is 
> > > not listed
> > > in DPDK packaging.
> > 
> >  There are no calls to hwloc within the script. Hence it is not clear what
> >  ` NAK, no scripting hwloc it is ugly and creates a
> > >> dependency that is not listed in DPDK packaging.` means.
> > 
> >  Requesting a cross-check of why a NAK was given for a `print` suggestion.
> >  Hence, I disagree with this.
> > >>>
> > >>> Sorry, I misinterpreted what the print's were doing.
> > >>> Better off not to list exact flags, the lstopo may change and user may
> > >>> want different format anyway.
> > >>>
> > >>> How about something like this?
> > >>>
> > >>>
> > >>>  doc/guides/rel_notes/deprecation.rst | 5 +
> > >>>  usertools/cpu_layout.py  | 5 +
> > >>>  2 files changed, 10 insertions(+)
> > >>>
> > >>> diff --git a/doc/guides/rel_notes/deprecation.rst 
> > >>> b/doc/guides/rel_notes/deprecation.rst
> > >>> index 317875c5054b..25a116900dfb 100644
> > >>> --- a/doc/guides/rel_notes/deprecation.rst
> > >>> +++ b/doc/guides/rel_notes/deprecation.rst
> > >>> @@ -185,3 +185,8 @@ Deprecation Notices
> > >>>will be deprecated and subsequently removed in DPDK 24.11 release.
> > >>>Before this, the new port library API (functions rte_swx_port_*)
> > >>>will gradually transition from experimental to stable status.
> > >>> +
> > >>> +* cpulayout: The CPU layout script is unable to deal with all the 
> > >>> possible
> > >>> +  complexities of modern CPU topology. Other existing tools offer more
> > >>> +  features and do a better job with keeping up with innovations.
> > >>> +  Therefore it will be deprecated and removed in a future release.
> > >>
> > >> Does the script really do that bad a job? While I can understand us 
> > >> looking
> > >> to recommend alternatives, I actually find the script in its current 
> > >> form
> > >> really handy - much more so than working out the exact flags for lstopo
> > >> etc. Since it's not a large maintenance burden, I'd request we keep it
> > >> around - while still recommending lstopo to users.
> > > 
> > > +1
> > > I do use it on regular basis.
> > > It would be a pity if it were gone.
> > >
> > 
> > I also use it from time to time and find it useful.
> > 
> > But it is not accurate/correct for some AMD platforms (for various NPS
> > (Nodes per Socket) values).
> > So either it needs to be updated/improved or replaced.
> > 
> > Vipin sent a patch [1] to update it, but it is a question how much of this
> > logic belongs in DPDK, or whether we should rely on external tools
> > dedicated to this purpose.
> > 
> 
> I'd like to suggest that we take a slightly ambiguous position on this
> script. Specifically:
> 
> I think we should "recommend" but not "rely on" external tools for this.
> Specifically, I think that recommending use of hwloc is the best thing to
> do as it's better maintained and packaged for windows. However, for quick
> use in many situations, cpu_layout does the job as well or better in terms
> of simplicity of use and output.

We could also contribute to hwloc to have a similar simple output.




Re: [PATCH 1/2] build: fail if explicitly requested lib is unbuildable

2023-09-04 Thread David Marchand
On Fri, Sep 1, 2023 at 4:44 PM Bruce Richardson
 wrote:
>
> On Fri, Sep 01, 2023 at 04:30:56PM +0200, David Marchand wrote:
> > On Fri, Sep 1, 2023 at 4:29 PM David Marchand  
> > wrote:
> > >
> > > On Fri, Sep 1, 2023 at 4:23 PM Bruce Richardson
> > >  wrote:
> > > >
> > > > When the user passes a list of desired libraries to build via the
> > > > "enable_libs" option, the expectation is that those libraries should be
> > > > part of the build. However, if those libs have either external or
> > > > internal dependencies, they still may be silently disabled, for example:
> > > > running "meson setup -Denable_libs=security build" will successfully
> > > > run, but the security lib will not be configured as "cryptodev" is
> > > > missing.
> > > >
> > > > We can fix this by setting a flag to indicate when the libraries are
> > > > specified via an enable_libs flag. If so, then we error out when a
> > > > library is unbuildable, giving a suitable error message. For the above
> > > > example case, the "meson setup" run fails with:
> > > >
> > > > Message: Disabling security [lib/security]: missing internal dependency 
> > > > "cryptodev"
> > > >
> > > > lib/meson.build:218:16: ERROR: Problem encountered: Cannot build 
> > > > explicitly requested lib "security".
> > > > Please add missing dependency "cryptodev" to "enable_libs" 
> > > > option
> > > >
> > > > Signed-off-by: Bruce Richardson 
> > >
> > > I remember setting a enable_drivers=net/af_xdp on a system lacking
> > > libbpf-devel and not getting warned about its absence.
> > > So I suspect enable_drivers is affected by the same issue.
> >
> > Arf sorry, did not see it was a two patch series :-).
> >
> No problem.
>
> At least your response confirms to me that this is indeed a common issue,
> that could do with a solution. As I explain in patch 2, fixing for drivers
> is a little trickier than for libs, as for drivers we really need to take
> account of wildcards to avoid breaking people's existing builds. [Or, at
> minimum, my own builds! :-)].
>
> BTW: while we could look to handle wildcards for libs, I think their use is
> probably much more limited there, and I decided to go with the simplest
> possible solution instead.

We have categories (classes) of drivers, and for them, I can imagine
people using wildcards.
But for libraries, I don't see how wildcards could be used, so I think
we can ignore them for now (until proven wrong ;-).


Thanks Bruce, this series also improves the situation for users.
When wanting to explicitly enable a driver or a library, the
explicit error message on the missing dependency is easier to catch
than scrolling back through / parsing meson output.


I did not review in detail, but it lgtm for my few tests.

For the series,
Acked-by: David Marchand 


-- 
David Marchand



[PATCH v1] examples/l3fwd: relax the RSS/Offload requirement

2023-09-04 Thread Trevor Tao
Currently the port Rx mq_mode is set to RTE_ETH_MQ_RX_RSS and the offload
mode is set to RTE_ETH_RX_OFFLOAD_CHECKSUM by default, but some hardware
and/or virtual interfaces do not support the presupposed RSS and offload
modes. For example, some virtio interfaces in the cloud don't support
RSS and may only partly support RTE_ETH_RX_OFFLOAD_UDP_CKSUM/
RTE_ETH_RX_OFFLOAD_TCP_CKSUM,
but not RTE_ETH_RX_OFFLOAD_IPV4_CKSUM, which leads to errors like:

virtio_dev_configure(): RSS support requested but not supported by
the device
Port0 dev_configure = -95

and:
Ethdev port_id=0 requested Rx offloads 0xe does not match Rx offloads
capabilities 0x201d in rte_eth_dev_configure()

So, to enable l3fwd to run in such an environment, the Rx mode requirement
can be relaxed to reflect the actual hardware capabilities, and l3fwd can
then run smoothly. A warning message is printed to the user whenever this
happens.

In addition, the software checksum check is enabled in case the hardware
support is missing.

Fixes: af75078fece3 ("first public release")
Cc: sta...@dpdk.org

Signed-off-by: Trevor Tao 
---
 examples/l3fwd/l3fwd.h | 12 +++-
 examples/l3fwd/main.c  | 21 +++--
 2 files changed, 30 insertions(+), 3 deletions(-)

diff --git a/examples/l3fwd/l3fwd.h b/examples/l3fwd/l3fwd.h
index b55855c932..cc10643c4b 100644
--- a/examples/l3fwd/l3fwd.h
+++ b/examples/l3fwd/l3fwd.h
@@ -115,6 +115,8 @@ extern struct acl_algorithms acl_alg[];
 
 extern uint32_t max_pkt_len;
 
+extern struct rte_eth_conf port_conf;
+
 /* Send burst of packets on an output interface */
 static inline int
 send_burst(struct lcore_conf *qconf, uint16_t n, uint16_t port)
@@ -170,7 +172,15 @@ is_valid_ipv4_pkt(struct rte_ipv4_hdr *pkt, uint32_t link_len)
return -1;
 
/* 2. The IP checksum must be correct. */
-   /* this is checked in H/W */
+   /* if this is not checked in H/W, check it. */
+   if ((port_conf.rxmode.offloads & RTE_ETH_RX_OFFLOAD_IPV4_CKSUM) == 0) {
+   uint16_t actual_cksum, expected_cksum;
+   actual_cksum = pkt->hdr_checksum;
+   pkt->hdr_checksum = 0;
+   expected_cksum = rte_ipv4_cksum(pkt);
+   if (actual_cksum != expected_cksum)
+   return -2;
+   }
 
/*
 * 3. The IP version number must be 4. If the version number is not 4
diff --git a/examples/l3fwd/main.c b/examples/l3fwd/main.c
index 6063eb1399..37aec64718 100644
--- a/examples/l3fwd/main.c
+++ b/examples/l3fwd/main.c
@@ -117,7 +117,7 @@ static struct lcore_params * lcore_params = 
lcore_params_array_default;
 static uint16_t nb_lcore_params = sizeof(lcore_params_array_default) /
sizeof(lcore_params_array_default[0]);
 
-static struct rte_eth_conf port_conf = {
+struct rte_eth_conf port_conf = {
.rxmode = {
.mq_mode = RTE_ETH_MQ_RX_RSS,
.offloads = RTE_ETH_RX_OFFLOAD_CHECKSUM,
@@ -1257,8 +1257,12 @@ l3fwd_poll_resource_setup(void)
local_port_conf.rx_adv_conf.rss_conf.rss_hf &=
dev_info.flow_type_rss_offloads;
 
-   if (dev_info.max_rx_queues == 1)
+   /* relax the rx rss requirement */
+   if (dev_info.max_rx_queues == 1 ||
+   !local_port_conf.rx_adv_conf.rss_conf.rss_hf) {
+   printf("warning: modified the rx mq_mode to RTE_ETH_MQ_RX_NONE"
+   " based on device capability\n");
local_port_conf.rxmode.mq_mode = RTE_ETH_MQ_RX_NONE;
+   }
 
if (local_port_conf.rx_adv_conf.rss_conf.rss_hf !=
port_conf.rx_adv_conf.rss_conf.rss_hf) {
@@ -1269,6 +1273,19 @@ l3fwd_poll_resource_setup(void)
local_port_conf.rx_adv_conf.rss_conf.rss_hf);
}
 
+   /* relax the rx offload requirement */
+   if ((local_port_conf.rxmode.offloads & dev_info.rx_offload_capa) !=
+   local_port_conf.rxmode.offloads) {
+   printf("Port %u requested Rx offloads 0x%"PRIx64" does not"
+   " match Rx offloads capabilities 0x%"PRIx64"\n",
+   portid, local_port_conf.rxmode.offloads,
+   dev_info.rx_offload_capa);
+   local_port_conf.rxmode.offloads &= dev_info.rx_offload_capa;
+   port_conf.rxmode.offloads = local_port_conf.rxmode.offloads;
+   printf("warning: modified the rx offload to 0x%"PRIx64" based"
+   " on device capability\n", local_port_conf.rxmode.offloads);
+   }
+
ret = rte_eth_dev_configure(portid, nb_rx_queue,
(uint16_t)n_tx_queue, &local_port_conf);
if (ret < 0)
-- 
2.34.1



[PATCH] net/iavf: unregister intr handler before FD close

2023-09-04 Thread Saurabh Singhal
Unregister VFIO interrupt handler before the interrupt fd gets closed in
case iavf_dev_init() returns an error.

DPDK creates a standalone thread named eal-intr-thread for processing
interrupts for the PCI devices. The interrupt handler callbacks are
registered by the VF driver (iavf, in this case).

When we do a PCI probe of the network interfaces, we register an
interrupt handler, open a vfio-device fd using ioctl, and an eventfd in
dpdk. These interrupt sources are registered in a global linked list
that the eal-intr-thread keeps iterating over for handling the
interrupts. In our internal testing, we see eal-intr-thread crash in
these two ways:

Error adding fd 660 epoll_ctl, Operation not permitted

or

Error adding fd 660 epoll_ctl, Bad file descriptor

epoll_ctl() returns EPERM if the target fd does not support poll.
It returns EBADF when the epoll fd itself is closed or the target fd is
closed.
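
To make the two failure modes concrete, here is a minimal standalone sketch
(illustration only, not DPDK or iavf code; the regular file merely stands in
for a non-pollable fd like the anon_inode vfio-device fd):

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/epoll.h>
#include <unistd.h>

int main(void)
{
	int epfd = epoll_create1(0);
	struct epoll_event ev = { .events = EPOLLIN };

	/* EPERM: the target fd does not support poll; a regular file is an
	 * easy way to trigger it (assumes the file exists). */
	int fd = open("/etc/hostname", O_RDONLY);
	if (epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev) < 0)
		printf("non-pollable fd: %s\n", strerror(errno)); /* EPERM */

	/* EBADF: the target fd has already been closed when it is added. */
	close(fd);
	if (epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev) < 0)
		printf("closed fd: %s\n", strerror(errno)); /* EBADF */

	close(epfd);
	return 0;
}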

When the first type of crash happens, we see that the fd 660 is
anon_inode:[vfio-device] which does not support poll.

When the second type of crash happens, we could see from the fd map of
the crashing process that the fd 660 was already closed.

This means the said fd has been closed and in certain cases may have
been reassigned to a different device by the operating system but the
eal-intr-thread does not know about it.

We observed that these crashes were always accompanied by an error in
iavf_dev_init() after rte_intr_callback_register() and
iavf_enable_irq0() have already happened. In the error path, the
intr_handle_fd was being closed but the interrupt handler wasn't being
unregistered.

The fix is to unregister the interrupt handle in the
iavf_dev_init() error path.

Signed-off-by: Saurabh Singhal 
---
 .mailmap   |  1 +
 drivers/net/iavf/iavf_ethdev.c | 15 +++
 2 files changed, 16 insertions(+)

diff --git a/.mailmap b/.mailmap
index 864d33ee46..4dac53011b 100644
--- a/.mailmap
+++ b/.mailmap
@@ -1227,6 +1227,7 @@ Satananda Burla 
 Satha Rao  
 Satheesh Paul 
 Sathesh Edara 
+Saurabh Singhal 
 Savinay Dharmappa 
 Scott Branden 
 Scott Daniels 
diff --git a/drivers/net/iavf/iavf_ethdev.c b/drivers/net/iavf/iavf_ethdev.c
index f2fc5a5621..df87a553db 100644
--- a/drivers/net/iavf/iavf_ethdev.c
+++ b/drivers/net/iavf/iavf_ethdev.c
@@ -133,6 +133,8 @@ static int iavf_dev_rx_queue_intr_enable(struct rte_eth_dev *dev,
uint16_t queue_id);
 static int iavf_dev_rx_queue_intr_disable(struct rte_eth_dev *dev,
 uint16_t queue_id);
+static void iavf_dev_interrupt_handler(void *param);
+static inline void iavf_disable_irq0(struct iavf_hw *hw);
 static int iavf_dev_flow_ops_get(struct rte_eth_dev *dev,
 const struct rte_flow_ops **ops);
 static int iavf_set_mc_addr_list(struct rte_eth_dev *dev,
@@ -2490,9 +2492,22 @@ iavf_uninit_vf(struct rte_eth_dev *dev)
 {
struct iavf_hw *hw = IAVF_DEV_PRIVATE_TO_HW(dev->data->dev_private);
struct iavf_info *vf = IAVF_DEV_PRIVATE_TO_VF(dev->data->dev_private);
+   struct rte_pci_device *pci_dev = RTE_ETH_DEV_TO_PCI(dev);
+   struct rte_intr_handle *intr_handle = pci_dev->intr_handle;
 
iavf_shutdown_adminq(hw);
 
+   if (vf->vf_res->vf_cap_flags & VIRTCHNL_VF_OFFLOAD_WB_ON_ITR) {
+   /* disable uio intr before callback unregister */
+   rte_intr_disable(intr_handle);
+
+   /* unregister callback func from eal lib */
+   rte_intr_callback_unregister(intr_handle,
+iavf_dev_interrupt_handler, dev);
+   } else {
+   rte_eal_alarm_cancel(iavf_dev_alarm_handler, dev);
+   }
+
rte_free(vf->vf_res);
vf->vsi_res = NULL;
vf->vf_res = NULL;
-- 
2.41.0



Re: 22.11.3 patches review and test

2023-09-04 Thread Kevin Traynor

On 04/09/2023 10:32, Kevin Traynor wrote:

On 01/09/2023 04:23, Zeng, ZhichaoX wrote:

-Original Message-
From: Kevin Traynor 
Sent: Thursday, August 31, 2023 8:18 PM
To: Xu, HailinX ; Xueming Li ;
sta...@dpdk.org; Wu, Jingjing ; Xing, Beilei
; Xu, Ke1 ; Zeng, ZhichaoX
; Zhang, Qi Z 
Cc: xuemi...@nvdia.com; dev@dpdk.org; Stokes, Ian ;
Mcnamara, John ; Luca Boccassi
; Xu, Qian Q ; Thomas Monjalon
; Peng, Yuan ; Chen,
Zhaoyan 
Subject: Re: 22.11.3 patches review and test

On 30/08/2023 17:25, Kevin Traynor wrote:

On 29/08/2023 09:22, Xu, HailinX wrote:

-Original Message-
From: Xueming Li 
Sent: Thursday, August 17, 2023 2:14 PM
To: sta...@dpdk.org
Cc: xuemi...@nvdia.com; dev@dpdk.org; Abhishek Marathe
; Ali Alnubani ;
Walker, Benjamin ; David Christensen
; Hemant Agrawal

;

Stokes, Ian ; Jerin Jacob
; Mcnamara, John ;
Ju-Hyoung Lee ; Kevin Traynor
; Luca Boccassi ; Pei Zhang
; Xu, Qian Q ; Raslan
Darawsheh ; Thomas Monjalon
; Yanghang Liu ; Peng,
Yuan ; Chen, Zhaoyan



Subject: 22.11.3 patches review and test

Hi all,

Here is a list of patches targeted for stable release 22.11.3.

The planned date for the final release is 31st August.

Please help with testing and validation of your use cases and report
any issues/results with reply-all to this mail. For the final
release the fixes and reported validations will be added to the release

notes.


A release candidate tarball can be found at:

https://dpdk.org/browse/dpdk-stable/tag/?id=v22.11.3-rc1

These patches are located at branch 22.11 of dpdk-stable repo:
https://dpdk.org/browse/dpdk-stable/

Thanks.


We are conducting DPDK testing and have found two issues.

1. The startup speed of testpmd is significantly slower on the SUSE OS.
  The fix for this issue has been merged into main, but it has not been
backported to 22.11.3.

  Fix patch commit id on DPDK main:
7e7b6762eac292e78c77ad37ec0973c0c944b845

2. The SCTP tunnel packet of iavf cannot be forwarded in avx512 mode


Need to clarify this sentence. It looks like it is not a functional bug where
avx512 mode is selected and then an SCTP tunnel packet cannot be sent.
Instead, it is a possible performance issue that avx512 mode will not be
selected where it might have been due to unneeded additions
(RTE_ETH_TX_OFFLOAD_*_TNL_TSO) to IAVF_TX_NO_VECTOR_FLAGS.

@IAVF maintainers - please confirm my analysis is correct ?

In that case, as it is a possible performance issue in a specific case for a
single driver, I think it is non-critical for 21.11 and we can just revert the
patch on the branch and wait for the 21.11.6 release in December.


Hi Kevin,

Since the LTS version of the IAVF driver does not support avx512 checksum
offload, the scalar path should be selected, but this patch makes it
incorrectly select the avx512 path, and the SCTP tunnel packets can't be
forwarded properly.



ok, let's take a look at the patch and usage.

diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index 8d4a77271a..605ea3f824 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -32,4 +32,8 @@
  RTE_ETH_TX_OFFLOAD_MULTI_SEGS |  \
  RTE_ETH_TX_OFFLOAD_TCP_TSO | \
+   RTE_ETH_TX_OFFLOAD_VXLAN_TNL_TSO |   \
+   RTE_ETH_TX_OFFLOAD_GRE_TNL_TSO | \
+   RTE_ETH_TX_OFFLOAD_IPIP_TNL_TSO |\
+   RTE_ETH_TX_OFFLOAD_GENEVE_TNL_TSO |  \
  RTE_ETH_TX_OFFLOAD_SECURITY)



So we now have:
#define IAVF_TX_NO_VECTOR_FLAGS (\
RTE_ETH_TX_OFFLOAD_VLAN_INSERT | \
RTE_ETH_TX_OFFLOAD_QINQ_INSERT | \
RTE_ETH_TX_OFFLOAD_MULTI_SEGS |  \
RTE_ETH_TX_OFFLOAD_TCP_TSO | \
RTE_ETH_TX_OFFLOAD_VXLAN_TNL_TSO |   \
RTE_ETH_TX_OFFLOAD_GRE_TNL_TSO | \
RTE_ETH_TX_OFFLOAD_IPIP_TNL_TSO |\
RTE_ETH_TX_OFFLOAD_GENEVE_TNL_TSO |  \
RTE_ETH_TX_OFFLOAD_SECURITY)



static inline int
iavf_tx_vec_queue_default(struct iavf_tx_queue *txq)
{
if (!txq)
return -1;

if (txq->rs_thresh < IAVF_VPMD_TX_MAX_BURST ||
txq->rs_thresh > IAVF_VPMD_TX_MAX_FREE_BUF)
return -1;

if (txq->offloads & IAVF_TX_NO_VECTOR_FLAGS)
return -1;
  ^^^ Adding the extra bits to IAVF_TX_NO_VECTOR_FLAGS gives
*more* reasons for why this statement might not be true, so returning -1
indicating that vector cannot be used for tx queue



typo here - just to clarify, the new flags give more reasons for this 
statement to be true, so returning -1.






static inline bool
check_tx_vec_allow(struct iavf_tx_queue *txq)
{
if (!(txq->offloads & IAVF_TX_NO_VECTOR_FLAGS) &&

  ^^^ Adding the extra bits to IAVF_TX_NO_VECTOR_FLAGS gives
*

[PATCH] malloc: fix allocation for a specific case with ASAN

2023-09-04 Thread Artur Paszkiewicz
Allocation would fail with ASAN enabled if the size and alignment were
equal to half of the page size, e.g.:

size_t pg_sz = 2 * (1 << 20);
rte_malloc(NULL, pg_sz / 2, pg_sz / 2);

In such a case, try_expand_heap_primary() allocated only one page, which
is not enough to fit this allocation with such alignment when
MALLOC_ELEM_TRAILER_LEN > 0, as correctly checked by
malloc_elem_can_hold().
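
For illustration, the arithmetic with assumed element overhead sizes
(MALLOC_ELEM_HEADER_LEN = MALLOC_ELEM_TRAILER_LEN = 64 B, so
MALLOC_ELEM_OVERHEAD = 128 B; the real values depend on the build and ASAN):

  pg_sz = 2 MiB, elt_size = align = 1 MiB

  old: alloc_sz = RTE_ALIGN_CEIL(RTE_ALIGN_CEIL(1 MiB, 1 MiB) + 128 B, 2 MiB)
                = RTE_ALIGN_CEIL(1 MiB + 128 B, 2 MiB) = 2 MiB  -> 1 page

  new: alloc_sz = RTE_ALIGN_CEIL(RTE_MAX(64 B, 1 MiB) + 1 MiB + 64 B, 2 MiB)
                = RTE_ALIGN_CEIL(2 MiB + 64 B, 2 MiB) = 4 MiB  -> 2 pages

A single 2 MiB page cannot hold a 1 MiB-aligned 1 MiB payload plus a header
before it and a trailer after it, so the old size was one page short.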

Signed-off-by: Artur Paszkiewicz 
---
 lib/eal/common/malloc_heap.c | 4 ++--
 lib/eal/common/malloc_mp.c   | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/lib/eal/common/malloc_heap.c b/lib/eal/common/malloc_heap.c
index 6b6cf9174c..bb7da0d2ef 100644
--- a/lib/eal/common/malloc_heap.c
+++ b/lib/eal/common/malloc_heap.c
@@ -402,8 +402,8 @@ try_expand_heap_primary(struct malloc_heap *heap, uint64_t pg_sz,
int n_segs;
bool callback_triggered = false;
 
-   alloc_sz = RTE_ALIGN_CEIL(RTE_ALIGN_CEIL(elt_size, align) +
-   MALLOC_ELEM_OVERHEAD, pg_sz);
+   alloc_sz = RTE_ALIGN_CEIL(RTE_MAX(MALLOC_ELEM_HEADER_LEN, align) +
+   elt_size + MALLOC_ELEM_TRAILER_LEN, pg_sz);
n_segs = alloc_sz / pg_sz;
 
/* we can't know in advance how many pages we'll need, so we malloc */
diff --git a/lib/eal/common/malloc_mp.c b/lib/eal/common/malloc_mp.c
index 7270c2ec90..62deaca9eb 100644
--- a/lib/eal/common/malloc_mp.c
+++ b/lib/eal/common/malloc_mp.c
@@ -250,8 +250,8 @@ handle_alloc_request(const struct malloc_mp_req *m,
return -1;
}
 
-   alloc_sz = RTE_ALIGN_CEIL(RTE_ALIGN_CEIL(ar->elt_size, ar->align) +
-   MALLOC_ELEM_OVERHEAD, ar->page_sz);
+   alloc_sz = RTE_ALIGN_CEIL(RTE_MAX(MALLOC_ELEM_HEADER_LEN, ar->align) +
+   ar->elt_size + MALLOC_ELEM_TRAILER_LEN, ar->page_sz);
n_segs = alloc_sz / ar->page_sz;
 
/* we can't know in advance how many pages we'll need, so we malloc */
-- 
2.35.3



Re: [PATCH v4] bus/pci: fix legacy device IO port map in secondary process

2023-09-04 Thread Gupta, Nipun




On 8/30/2023 10:37 AM, Wenwu Ma wrote:

When doing IO port mapping for a legacy device in a secondary process, the
region information is missing, so we need to refill it.

Fixes: 4b741542ecde ("bus/pci: avoid depending on private kernel value")
Cc: sta...@dpdk.org

Signed-off-by: Wenwu Ma 


Acked-by: Nipun Gupta 


Re: [PATCH 2/2] event/sw: fix ordering corruption with op release

2023-09-04 Thread Bruce Richardson
On Thu, Aug 31, 2023 at 05:47:36PM +0100, Harry van Haaren wrote:
> This commit changes the logic in the scheduler to always
> reset reorder-buffer entries in the QE_FLAG_COMPLETE path,
> and not just the QE_FLAG_VALID path.
> 
> A release event is a COMPLETE but not VALID (no new event).
> As a result, release events previously left the history-list
> in an inconsistent state, and future events with op type of
> forward could be incorrectly reordered.
> 
> Signed-off-by: Harry van Haaren 
> ---
>  drivers/event/sw/sw_evdev_scheduler.c | 45 ---
>  1 file changed, 27 insertions(+), 18 deletions(-)
> 
Hi Harry,

wondering if this fix might work as well, and offer a simpler alternative.
We can instead zero all unspecified hist-list entries on write.

/Bruce

--- a/drivers/event/sw/sw_evdev_scheduler.c
+++ b/drivers/event/sw/sw_evdev_scheduler.c
@@ -90,8 +90,7 @@ sw_schedule_atomic_to_cq(struct sw_evdev *sw, struct sw_qid * const qid,
	sw->cq_ring_space[cq]--;

	int head = (p->hist_head++ & (SW_PORT_HIST_LIST-1));
-	p->hist_list[head].fid = flow_id;
-	p->hist_list[head].qid = qid_id;
+	p->hist_list[head] = (struct sw_hist_list_entry){ .qid = qid_id, .fid = flow_id };

	p->stats.tx_pkts++;
	qid->stats.tx_pkts++;
@@ -162,8 +161,8 @@ sw_schedule_parallel_to_cq(struct sw_evdev *sw, struct sw_qid * const qid,
	qid->stats.tx_pkts++;

	const int head = (p->hist_head & (SW_PORT_HIST_LIST-1));
-	p->hist_list[head].fid = SW_HASH_FLOWID(qe->flow_id);
-	p->hist_list[head].qid = qid_id;
+	const uint32_t fid = SW_HASH_FLOWID(qe->flow_id);
+	p->hist_list[head] = (struct sw_hist_list_entry){ .qid = qid_id, .fid = fid };
 
if (keep_order)
rob_ring_dequeue(qid->reorder_buffer_freelist,



RE: [RFC] lib/st_ring: add single thread ring

2023-09-04 Thread Honnappa Nagarahalli



> -Original Message-
> From: Konstantin Ananyev 
> Sent: Monday, September 4, 2023 5:13 AM
> To: Honnappa Nagarahalli ;
> jack...@nvidia.com; konstantin.v.anan...@yandex.ru
> Cc: dev@dpdk.org; Ruifeng Wang ; Aditya
> Ambadipudi ; Wathsala Wathawana Vithanage
> ; nd 
> Subject: RE: [RFC] lib/st_ring: add single thread ring
> 
> 
> 
> > Add a single thread safe and multi-thread unsafe ring data structure.
> > This library provides a simple and efficient alternative to
> > multi-thread safe ring when multi-thread safety is not required.
> 
> Just a thought: do we really need whole new library for that?
> From what I understand, all we need right now is just one extra function:
> rte_ring_mt_unsafe_prod_deque(...)
> Sorry for ugly name :)
> To dequeue N elems from prod.tail.
> Or you think there would be some extra advantages in ST version of the ring:
> extra usages, better performance, etc.?
There are multiple implementations of an ST ring already in use in other parts
of DPDK. Mattias Ronnblom pointed out some existing ones (distributed
scheduler, eth RX adapter, cmdline) [1] which would be replaced by this one.
This implementation will not use atomic instructions, the head and tail
indices will be in the same cache line, and it will be a double-ended queue.
So, I am expecting better performance and more use cases (some might not be
applicable currently).

[1] https://mails.dpdk.org/archives/dev/2023-August/275003.html
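
As a sketch of what such a ring could look like (hypothetical layout and
names, not the proposed rte_st_ring API): with a single owning thread there
is no need for atomics or for separating head and tail into different cache
lines.

/* Hypothetical single-thread ring sketch: free-running head/tail in one
 * cache line, no atomics. Size must be a power of two. */
#include <stdint.h>

struct st_ring {
	uint32_t head;   /* enqueue position */
	uint32_t tail;   /* dequeue position */
	uint32_t mask;   /* size - 1 */
	void *elems[];   /* storage, size entries */
};

static inline int
st_ring_enqueue(struct st_ring *r, void *obj)
{
	if (r->head - r->tail > r->mask)        /* ring full */
		return -1;
	r->elems[r->head++ & r->mask] = obj;
	return 0;
}

static inline int
st_ring_dequeue(struct st_ring *r, void **obj)
{
	if (r->head == r->tail)                 /* ring empty */
		return -1;
	*obj = r->elems[r->tail++ & r->mask];
	return 0;
}

The double-ended variants discussed above would additionally allow
enqueue/dequeue at the opposite ends, which stays cheap for the same reason:
only one thread ever touches the indices.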

> 
> >
> > Signed-off-by: Honnappa Nagarahalli 
> > ---
> > v1:
> > 1) The code is very preliminary and is not even compiled
> > 2) This is intended to show the APIs and some thoughts on
> > implementation
> > 3) More APIs and the rest of the implementation will come in subsequent
> >versions
> >
> >  lib/st_ring/rte_st_ring.h | 567
> > ++
> >  1 file changed, 567 insertions(+)
> >  create mode 100644 lib/st_ring/rte_st_ring.h
> >
> > diff --git a/lib/st_ring/rte_st_ring.h b/lib/st_ring/rte_st_ring.h new
> > file mode 100644 index 00..8cb8832591
> > --- /dev/null
> > +++ b/lib/st_ring/rte_st_ring.h
> > @@ -0,0 +1,567 @@
> > +/* SPDX-License-Identifier: BSD-3-Clause
> > + * Copyright(c) 2023 Arm Limited
> > + */
> > +
> > +#ifndef _RTE_ST_RING_H_
> > +#define _RTE_ST_RING_H_
> > +
> > +/**
> > + * @file
> > + * RTE Single Thread Ring (ST Ring)
> > + *
> > + * The ST Ring is a fixed-size queue intended to be accessed
> > + * by one thread at a time. It does not provide concurrent access to
> > + * multiple threads. If there are multiple threads accessing the ST
> > +ring,
> > + * then the threads have to use locks to protect the ring from
> > + * getting corrupted.
> > + *
> > + * - FIFO (First In First Out)
> > + * - Maximum size is fixed; the pointers are stored in a table.
> > + * - Consumer and producer part of same thread.
> > + * - Multi-thread producers and consumers need locking.
> > + * - Single/Bulk/burst dequeue at Tail or Head
> > + * - Single/Bulk/burst enqueue at Head or Tail
> > + *
> > + */
> > +
> > +#ifdef __cplusplus
> > +extern "C" {
> > +#endif
> > +
> > +#include 
> > +#include 
> > +
> > +/**
> > + * Calculate the memory size needed for a ST ring
> > + *
> > + * This function returns the number of bytes needed for a ST ring,
> > +given
> > + * the number of elements in it. This value is the sum of the size of
> > + * the structure rte_st_ring and the size of the memory needed by the
> > + * elements. The value is aligned to a cache line size.
> > + *
> > + * @param count
> > + *   The number of elements in the ring (must be a power of 2).
> > + * @return
> > + *   - The memory size needed for the ST ring on success.
> > + *   - -EINVAL if count is not a power of 2.
> > + */
> > +ssize_t rte_st_ring_get_memsize(unsigned int count);
> > +
> > +/**
> > + * Initialize a ST ring structure.
> > + *
> > + * Initialize a ST ring structure in memory pointed by "r". The size
> > +of the
> > + * memory area must be large enough to store the ring structure and
> > +the
> > + * object table. It is advised to use rte_st_ring_get_memsize() to
> > +get the
> > + * appropriate size.
> > + *
> > + * The ST ring size is set to *count*, which must be a power of two.
> > + * The real usable ring size is *count-1* instead of *count* to
> > + * differentiate a full ring from an empty ring.
> > + *
> > + * The ring is not added in RTE_TAILQ_ST_RING global list. Indeed,
> > +the
> > + * memory given by the caller may not be shareable among dpdk
> > + * processes.
> > + *
> > + * @param r
> > + *   The pointer to the ring structure followed by the elements table.
> > + * @param name
> > + *   The name of the ring.
> > + * @param count
> > + *   The number of elements in the ring (must be a power of 2,
> > + *   unless RTE_ST_RING_F_EXACT_SZ is set in flags).
> > + * @param flags
> > + *   An OR of the following:
> > + *   - RTE_ST_RING_F_EXACT_SZ: If this flag is set, the ring will hold
> > + * exactly the requested number of entries, and the reques

Re: quick thread in DLB2

2023-09-04 Thread Sevincer, Abdullah
Hi Thomas,

That's right, we need to create threads on specific CPUs.

From: Thomas Monjalon 
Sent: Friday, September 1, 2023 7:09 AM
To: Sevincer, Abdullah 
Cc: dev@dpdk.org ; Tyler Retzlaff 
Subject: quick thread in DLB2

Hello Abdullah,

In the DLB2 code, I see a thread is created for a single operation:
In drivers/event/dlb2/pf/base/dlb2_resource.c
pthread_create(&pthread, NULL, &dlb2_pp_profile_func, &dlb2_thread_data[i]);
and just after:
pthread_join(pthread, NULL);

Can we avoid creating this thread?
I guess no, because it must spawn on a specific CPU.
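
For illustration, the pattern in question looks roughly like this (a
simplified sketch, not the actual dlb2_resource.c code; profile_on_cpu is a
hypothetical helper):

#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>

static void *
dlb2_pp_profile_func(void *arg)
{
	/* ... profile the port on the CPU this thread is pinned to ... */
	return arg;
}

static int
profile_on_cpu(int cpu, void *data)
{
	pthread_t t;
	pthread_attr_t attr;
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(cpu, &set);
	pthread_attr_init(&attr);
	/* pin the thread at creation time, before it runs */
	pthread_attr_setaffinity_np(&attr, sizeof(set), &set);
	if (pthread_create(&t, &attr, dlb2_pp_profile_func, data) != 0)
		return -1;
	pthread_join(t, NULL);	/* single-shot: create, run once, join */
	pthread_attr_destroy(&attr);
	return 0;
}

The join immediately after create is what makes it a "quick thread": the
measurement must run on the target CPU, while the calling thread's own
affinity is left untouched.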




RE: [PATCH v2 3/4] net/iavf: fix restart of Rx queue on reconfigure

2023-09-04 Thread Zhang, Qi Z


> -Original Message-
> From: Richardson, Bruce 
> Sent: Monday, September 4, 2023 3:54 PM
> To: Zhang, Qi Z 
> Cc: dev@dpdk.org; Wu, Jingjing ; sta...@dpdk.org
> Subject: Re: [PATCH v2 3/4] net/iavf: fix restart of Rx queue on reconfigure
> 
> On Mon, Sep 04, 2023 at 02:30:32AM +0100, Zhang, Qi Z wrote:
> >
> >
> > > -Original Message-
> > > From: Zhang, Qi Z
> > > Sent: Monday, September 4, 2023 9:15 AM
> > > To: Bruce Richardson ; dev@dpdk.org
> > > Cc: Richardson, Bruce ; Wu, Jingjing
> > > ; sta...@dpdk.org
> > > Subject: RE: [PATCH v2 3/4] net/iavf: fix restart of Rx queue on
> > > reconfigure
> > >
> > >
> > >
> > > > -Original Message-
> > > > From: Bruce Richardson 
> > > > Sent: Thursday, August 31, 2023 8:34 PM
> > > > To: dev@dpdk.org
> > > > Cc: Richardson, Bruce ; Wu, Jingjing
> > > > ; sta...@dpdk.org
> > > > Subject: [PATCH v2 3/4] net/iavf: fix restart of Rx queue on
> > > > reconfigure
> > > >
> > > > After reconfiguring an RX queue the mbuf_initializer value was not
> > > > being correctly set. Fix this by calling the appropriate function
> > > > if vector processing is enabled. This mirrors the behaviour by the i40e
> driver.
> > > >
> > > > Fixes: 319c421f3890 ("net/avf: enable SSE Rx Tx")
> > > > Cc: jingjing...@intel.com
> > > > Cc: sta...@dpdk.org
> > > >
> > > > Signed-off-by: Bruce Richardson 
> > > > ---
> > > >  drivers/net/iavf/iavf_rxtx.c | 7 +++
> > > >  1 file changed, 7 insertions(+)
> > > >
> > > > diff --git a/drivers/net/iavf/iavf_rxtx.c
> > > > b/drivers/net/iavf/iavf_rxtx.c index
> > > > f7df4665d1..797cdda4b2 100644
> > > > --- a/drivers/net/iavf/iavf_rxtx.c
> > > > +++ b/drivers/net/iavf/iavf_rxtx.c
> > > > @@ -755,6 +755,13 @@ iavf_dev_rx_queue_setup(struct rte_eth_dev
> > > > *dev, uint16_t queue_idx,
> > > > if (check_rx_vec_allow(rxq) == false)
> > > > ad->rx_vec_allowed = false;
> > > >
> > > > +#if defined RTE_ARCH_X86 || defined RTE_ARCH_ARM
> > > > +   /* check vector conflict */
> > > > +   if (ad->rx_vec_allowed && iavf_rxq_vec_setup(rxq)) {
> > > > +   PMD_DRV_LOG(ERR, "Failed vector rx setup.");
> > > > +   return -EINVAL;
> > > > +   }
> > > > +#endif
> > >
> > > Bruce:
> > >
> > >   May I know more details about how to reproduce this issue?
> > >   As the iavf PMD does not support
> > > RTE_ETH_DEV_CAPA_RUNTIME_RX_QUEUE_SETUP (i40e does)
> >
> > OK, not sure if the patch 4/4 answered my question 😊
> >
> > should I squash patches 3 and 4 into one? To my understanding, patch 3
> > doesn't appear to be a bug fix unless we announce
> > RTE_ETH_DEV_CAPA_RUNTIME_RX_QUEUE_SETUP.
> >
> You may have a point. I was experimenting with queue reconfiguration which
> is where I hit this issue.
> However, even without queue reconfig support, the device still needs to
> support queue-stop followed by queue-start, I think, and there may still be an
> issue there - I'll have to check.

Yes, queue start / stop should be supported.
Btw, I didn't see mbuf_initializer being reset during queue stop / start;
that might be a different issue.




> /Bruce
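
For reference, the runtime reconfiguration sequence these patches enable
would look roughly like this (a sketch assuming the port advertises the
runtime Rx queue setup capability; error handling trimmed):

#include <rte_ethdev.h>

static int
reconfigure_rxq(uint16_t port, uint16_t q, uint16_t new_nb_desc,
		struct rte_mempool *mp)
{
	struct rte_eth_dev_info info;

	if (rte_eth_dev_info_get(port, &info) != 0)
		return -1;
	if ((info.dev_capa & RTE_ETH_DEV_CAPA_RUNTIME_RX_QUEUE_SETUP) == 0)
		return -1;	/* must stop the whole port instead */

	if (rte_eth_dev_rx_queue_stop(port, q) != 0)
		return -1;
	if (rte_eth_rx_queue_setup(port, q, new_nb_desc,
			rte_eth_dev_socket_id(port), NULL, mp) != 0)
		return -1;
	return rte_eth_dev_rx_queue_start(port, q);
}

It is this stop/setup/start path on which the mbuf_initializer fix from
patch 3 would be exercised.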


RE: 22.11.3 patches review and test

2023-09-04 Thread Zeng, ZhichaoX


Regards
Zhichao
> -Original Message-
> From: Kevin Traynor 
> Sent: Monday, September 4, 2023 10:15 PM
> To: Zeng, ZhichaoX ; Xu, HailinX
> ; Xueming Li ;
> sta...@dpdk.org; Wu, Jingjing ; Xing, Beilei
> ; Xu, Ke1 ; Zhang, Qi Z
> 
> Cc: xuemi...@nvdia.com; dev@dpdk.org; Stokes, Ian ;
> Mcnamara, John ; Luca Boccassi
> ; Xu, Qian Q ; Thomas Monjalon
> ; Peng, Yuan ; Chen,
> Zhaoyan 
> Subject: Re: 22.11.3 patches review and test
> 
> On 04/09/2023 10:32, Kevin Traynor wrote:
> > On 01/09/2023 04:23, Zeng, ZhichaoX wrote:
> >>> -Original Message-
> >>> From: Kevin Traynor 
> >>> Sent: Thursday, August 31, 2023 8:18 PM
> >>> To: Xu, HailinX ; Xueming Li
> >>> ; sta...@dpdk.org; Wu, Jingjing
> >>> ; Xing, Beilei ; Xu,
> >>> Ke1 ; Zeng, ZhichaoX ;
> >>> Zhang, Qi Z 
> >>> Cc: xuemi...@nvdia.com; dev@dpdk.org; Stokes, Ian
> >>> ; Mcnamara, John ;
> >>> Luca Boccassi ; Xu, Qian Q ;
> >>> Thomas Monjalon ; Peng, Yuan
> >>> ; Chen, Zhaoyan 
> >>> Subject: Re: 22.11.3 patches review and test
> >>>
> >>> On 30/08/2023 17:25, Kevin Traynor wrote:
>  On 29/08/2023 09:22, Xu, HailinX wrote:
> >> -Original Message-
> >> From: Xueming Li 
> >> Sent: Thursday, August 17, 2023 2:14 PM
> >> To: sta...@dpdk.org
> >> Cc: xuemi...@nvdia.com; dev@dpdk.org; Abhishek Marathe
> >> ; Ali Alnubani
> >> ; Walker, Benjamin
> >> ; David Christensen
> >> ; Hemant Agrawal
> >>> ;
> >> Stokes, Ian ; Jerin Jacob
> >> ; Mcnamara, John
> ;
> >> Ju-Hyoung Lee ; Kevin Traynor
> >> ; Luca Boccassi ; Pei
> >> Zhang ; Xu, Qian Q ;
> >> Raslan Darawsheh ; Thomas Monjalon
> >> ; Yanghang Liu ;
> Peng,
> >> Yuan ; Chen, Zhaoyan
> >>> 
> >> Subject: 22.11.3 patches review and test
> >>
> >> Hi all,
> >>
> >> Here is a list of patches targeted for stable release 22.11.3.
> >>
> >> The planned date for the final release is 31st August.
> >>
> >> Please help with testing and validation of your use cases and
> >> report any issues/results with reply-all to this mail. For the
> >> final release the fixes and reported validations will be added to
> >> the release
> >>> notes.
> >>
> >> A release candidate tarball can be found at:
> >>
> >> https://dpdk.org/browse/dpdk-stable/tag/?id=v22.11.3-rc1
> >>
> >> These patches are located at branch 22.11 of dpdk-stable repo:
> >> https://dpdk.org/browse/dpdk-stable/
> >>
> >> Thanks.
> >
> > We are conducting DPDK testing and have found two issues.
> >
> > 1. The startup speed of testpmd is significantly slower in the os of 
> > SUSE
> >   This issue fix patch has been merged into main, But it has
> > not backported
> >>> to 22.11.3.
> >   Fix patch commit id on DPDK main:
> > 7e7b6762eac292e78c77ad37ec0973c0c944b845
> >
> > 2. The SCTP tunnel packet of iavf cannot be forwarded in avx512
> > mode
> >>>
> >>> Need to clarify this sentence. It looks like it is not a functional
> >>> bug where
> >>> avx512 mode is selected and then an SCTP tunnel packet cannot be sent.
> >>> Instead, it is a possible performance issue that avx512 mode will
> >>> not be selected where it might have been due to unneeded additions
> >>> (RTE_ETH_TX_OFFLOAD_*_TNL_TSO) to IAVF_TX_NO_VECTOR_FLAGS.
> >>>
> >>> @IAVF maintainers - please confirm my analysis is correct ?
> >>>
> >>> In that case, as it is a possible performance issue in a specific
> >>> case for a single driver I think it is non-critical for 21.11 and we
> >>> can just revert the patch on the branch and wait for 21.11.6 release in
> December.
> >>
> >> Hi Kevin,
> >>
> >> Since the LTS version of the IAVF driver does not support avx512
> >> checksum offload, the scalar path should be selected, but this patch
> >> makes it incorrectly select the
> >> avx512 path, and the SCTP tunnel packets can't be forwarded properly.
> >>
> >
> > ok, let's take a look at the patch and usage.
> >
> > diff --git a/drivers/net/iavf/iavf_rxtx.h
> > b/drivers/net/iavf/iavf_rxtx.h index 8d4a77271a..605ea3f824 100644
> > --- a/drivers/net/iavf/iavf_rxtx.h
> > +++ b/drivers/net/iavf/iavf_rxtx.h
> > @@ -32,4 +32,8 @@
> >   RTE_ETH_TX_OFFLOAD_MULTI_SEGS |  \
> >   RTE_ETH_TX_OFFLOAD_TCP_TSO | \
> > +   RTE_ETH_TX_OFFLOAD_VXLAN_TNL_TSO |   \
> > +   RTE_ETH_TX_OFFLOAD_GRE_TNL_TSO | \
> > +   RTE_ETH_TX_OFFLOAD_IPIP_TNL_TSO |\
> > +   RTE_ETH_TX_OFFLOAD_GENEVE_TNL_TSO |  \
> >   RTE_ETH_TX_OFFLOAD_SECURITY)
> >
> > 
> >
> > So we now have:
> > #define IAVF_TX_NO_VECTOR_FLAGS (\
> > RTE_ETH_TX_OFFLOAD_VLAN_INSERT | \
> > RTE_ETH_TX_OFFLOAD_QINQ_INSERT | \
> > RTE_ETH_TX_OFFLOAD_MULTI_SEGS |  \
> > RTE_ETH

RE: [PATCH v2 4/4] net/iavf: add support for runtime queue reconfiguration

2023-09-04 Thread Zhang, Qi Z



> -Original Message-
> From: Bruce Richardson 
> Sent: Thursday, August 31, 2023 8:34 PM
> To: dev@dpdk.org
> Cc: Richardson, Bruce 
> Subject: [PATCH v2 4/4] net/iavf: add support for runtime queue
> reconfiguration
> 
> Unlike the i40e driver, the iavf driver does not advertise support for runtime
> reconfiguration, meaning that application using the same hardware may get
> different behaviour when using a PF vs a VF. On testing with a 40G NIC, the
> only blocker to reconfiguring an RX queue on the fly is the fact that this
> support is not advertised by the driver.
> 
> Add support for runtime reconfig by reporting it in the device info flags.
> 
> Signed-off-by: Bruce Richardson 
> ---
>  doc/guides/nics/features/iavf.ini | 2 ++
>  drivers/net/iavf/iavf_ethdev.c| 4 +++-
>  2 files changed, 5 insertions(+), 1 deletion(-)
> 
> diff --git a/doc/guides/nics/features/iavf.ini
> b/doc/guides/nics/features/iavf.ini
> index b72cd98484..db4f92ce71 100644
> --- a/doc/guides/nics/features/iavf.ini
> +++ b/doc/guides/nics/features/iavf.ini
> @@ -11,6 +11,8 @@ Speed capabilities   = Y
>  Link status  = Y
>  Rx interrupt = Y
>  Queue start/stop = Y
> +Runtime Rx queue setup = Y
> +Runtime Tx queue setup = Y
>  Power mgmt address monitor = Y
>  MTU update   = Y
>  Scattered Rx = Y
> diff --git a/drivers/net/iavf/iavf_ethdev.c b/drivers/net/iavf/iavf_ethdev.c
> index f2fc5a5621..22fbd7d6b2 100644
> --- a/drivers/net/iavf/iavf_ethdev.c
> +++ b/drivers/net/iavf/iavf_ethdev.c
> @@ -1127,7 +1127,9 @@ iavf_dev_info_get(struct rte_eth_dev *dev, struct
> rte_eth_dev_info *dev_info)
>   dev_info->reta_size = vf->vf_res->rss_lut_size;
>   dev_info->flow_type_rss_offloads = IAVF_RSS_OFFLOAD_ALL;
>   dev_info->max_mac_addrs = IAVF_NUM_MACADDR_MAX;
> - dev_info->dev_capa &= ~RTE_ETH_DEV_CAPA_FLOW_RULE_KEEP;
> + dev_info->dev_capa =
> + RTE_ETH_DEV_CAPA_RUNTIME_RX_QUEUE_SETUP |
> + RTE_ETH_DEV_CAPA_RUNTIME_TX_QUEUE_SETUP;
>   dev_info->rx_offload_capa =
>   RTE_ETH_RX_OFFLOAD_VLAN_STRIP |
>   RTE_ETH_RX_OFFLOAD_QINQ_STRIP |
> --
> 2.39.2

Acked-by: Qi Zhang 

Squashed with Patch 3/4, applied to dpdk-next-net-intel.

Thanks
Qi



RE: [PATCH v4] bus/pci: fix legacy device IO port map in secondary process

2023-09-04 Thread Ling, WeiX
> -Original Message-
> From: Ma, WenwuX 
> Sent: Wednesday, August 30, 2023 1:07 PM
> To: nipun.gu...@amd.com; dev@dpdk.org
> Cc: david.march...@redhat.com; maxime.coque...@redhat.com; Xia,
> Chenbo ; Li, Miao ; Ling, WeiX
> ; Ma, WenwuX ;
> sta...@dpdk.org
> Subject: [PATCH v4] bus/pci: fix legacy device IO port map in secondary
> process
> 
> When doing IO port mapping for legacy device in secondary process, the
> region information is missing, so, we need to refill it.
> 
> Fixes: 4b741542ecde ("bus/pci: avoid depending on private kernel value")
> Cc: sta...@dpdk.org
> 
> Signed-off-by: Wenwu Ma 

Tested-by: Wei Ling 


RE: [PATCH v11 2/4] net/i40e: implement mbufs recycle mode

2023-09-04 Thread Feifei Wang



> -Original Message-
> From: Konstantin Ananyev 
> Sent: Monday, September 4, 2023 6:22 PM
> To: Feifei Wang ; Konstantin Ananyev
> 
> Cc: dev@dpdk.org; nd ; Honnappa Nagarahalli
> ; Ruifeng Wang
> ; Yuying Zhang ; Beilei
> Xing ; nd ; nd ; nd
> ; nd 
> Subject: RE: [PATCH v11 2/4] net/i40e: implement mbufs recycle mode
> 
> 
> 
> > > > > > > > > Define specific function implementation for i40e driver.
> > > > > > > > > Currently, mbufs recycle mode can support 128bit vector
> > > > > > > > > path and
> > > > > > > > > avx2
> > > > > > > path.
> > > > > > > > > And can be enabled both in fast free and no fast free mode.
> > > > > > > > >
> > > > > > > > > Suggested-by: Honnappa Nagarahalli
> > > > > > > > > 
> > > > > > > > > Signed-off-by: Feifei Wang 
> > > > > > > > > Reviewed-by: Ruifeng Wang 
> > > > > > > > > Reviewed-by: Honnappa Nagarahalli
> > > > > 
> > > > > > > > > ---
> > > > > > > > >  drivers/net/i40e/i40e_ethdev.c|   1 +
> > > > > > > > >  drivers/net/i40e/i40e_ethdev.h|   2 +
> > > > > > > > >  .../net/i40e/i40e_recycle_mbufs_vec_common.c  | 147
> > > > > > > > > ++
> > > > > > > > >  drivers/net/i40e/i40e_rxtx.c  |  32 
> > > > > > > > >  drivers/net/i40e/i40e_rxtx.h  |   4 +
> > > > > > > > >  drivers/net/i40e/meson.build  |   1 +
> > > > > > > > >  6 files changed, 187 insertions(+)  create mode 100644
> > > > > > > > > drivers/net/i40e/i40e_recycle_mbufs_vec_common.c
> > > > > > > > >
> > > > > > > > > diff --git a/drivers/net/i40e/i40e_ethdev.c
> > > > > > > > > b/drivers/net/i40e/i40e_ethdev.c index
> > > > > > > > > 8271bbb394..50ba9aac94
> > > > > > > > > 100644
> > > > > > > > > --- a/drivers/net/i40e/i40e_ethdev.c
> > > > > > > > > +++ b/drivers/net/i40e/i40e_ethdev.c
> > > > > > > > > @@ -496,6 +496,7 @@ static const struct eth_dev_ops
> > > > > > > > > i40e_eth_dev_ops
> > > > > > > = {
> > > > > > > > >   .flow_ops_get = i40e_dev_flow_ops_get,
> > > > > > > > >   .rxq_info_get = i40e_rxq_info_get,
> > > > > > > > >   .txq_info_get = i40e_txq_info_get,
> > > > > > > > > + .recycle_rxq_info_get = 
> > > > > > > > > i40e_recycle_rxq_info_get,
> > > > > > > > >   .rx_burst_mode_get= i40e_rx_burst_mode_get,
> > > > > > > > >   .tx_burst_mode_get= i40e_tx_burst_mode_get,
> > > > > > > > >   .timesync_enable  = i40e_timesync_enable,
> > > > > > > > > diff --git a/drivers/net/i40e/i40e_ethdev.h
> > > > > > > > > b/drivers/net/i40e/i40e_ethdev.h index
> > > > > > > > > 6f65d5e0ac..af758798e1
> > > > > > > > > 100644
> > > > > > > > > --- a/drivers/net/i40e/i40e_ethdev.h
> > > > > > > > > +++ b/drivers/net/i40e/i40e_ethdev.h
> > > > > > > > > @@ -1355,6 +1355,8 @@ void i40e_rxq_info_get(struct
> > > > > > > > > rte_eth_dev *dev, uint16_t queue_id,
> > > > > > > > >   struct rte_eth_rxq_info *qinfo);  void
> > > > > > > > > i40e_txq_info_get(struct rte_eth_dev *dev, uint16_t queue_id,
> > > > > > > > >   struct rte_eth_txq_info *qinfo);
> > > > > > > > > +void i40e_recycle_rxq_info_get(struct rte_eth_dev *dev,
> > > > > > > > > +uint16_t
> > > > > > > queue_id,
> > > > > > > > > + struct rte_eth_recycle_rxq_info *recycle_rxq_info);
> > > > > > > > >  int i40e_rx_burst_mode_get(struct rte_eth_dev *dev,
> > > > > > > > > uint16_t
> > > > > queue_id,
> > > > > > > > >  struct rte_eth_burst_mode *mode);  
> > > > > > > > > int
> > > > > > > > > i40e_tx_burst_mode_get(struct rte_eth_dev *dev, uint16_t
> > > > > > > > > queue_id, diff -- git
> > > > > > > > > a/drivers/net/i40e/i40e_recycle_mbufs_vec_common.c
> > > > > > > > > b/drivers/net/i40e/i40e_recycle_mbufs_vec_common.c
> > > > > > > > > new file mode 100644
> > > > > > > > > index 00..5663ecccde
> > > > > > > > > --- /dev/null
> > > > > > > > > +++ b/drivers/net/i40e/i40e_recycle_mbufs_vec_common.c
> > > > > > > > > @@ -0,0 +1,147 @@
> > > > > > > > > +/* SPDX-License-Identifier: BSD-3-Clause
> > > > > > > > > + * Copyright (c) 2023 Arm Limited.
> > > > > > > > > + */
> > > > > > > > > +
> > > > > > > > > +#include 
> > > > > > > > > +#include 
> > > > > > > > > +
> > > > > > > > > +#include "base/i40e_prototype.h"
> > > > > > > > > +#include "base/i40e_type.h"
> > > > > > > > > +#include "i40e_ethdev.h"
> > > > > > > > > +#include "i40e_rxtx.h"
> > > > > > > > > +
> > > > > > > > > +#pragma GCC diagnostic ignored "-Wcast-qual"
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +i40e_recycle_rx_descriptors_refill_vec(void *rx_queue,
> > > > > > > > > +uint16_t
> > > > > > > > > +nb_mbufs) {
> > > > > > > > > + struct i40e_rx_queue *rxq = rx_queue;
> > > > > > > > > + struct i40e_rx_entry *rxep;
> > > > > > > > > + volatile union i40e_rx_desc *rxdp;
> > > > > > > > > + uint16_t rx_id;
> > > > > > > > > + uint64_t paddr;
> > > > > > > > 

Recall: 22.11.3 patches review and test

2023-09-04 Thread Zeng, ZhichaoX
Zeng, ZhichaoX would like to recall the message, "22.11.3 patches review and test".

RE: 22.11.3 patches review and test

2023-09-04 Thread Zeng, ZhichaoX
> -Original Message-
> From: Kevin Traynor 
> Sent: Monday, September 4, 2023 10:15 PM
> To: Zeng, ZhichaoX ; Xu, HailinX
> ; Xueming Li ;
> sta...@dpdk.org; Wu, Jingjing ; Xing, Beilei
> ; Xu, Ke1 ; Zhang, Qi Z
> 
> Cc: xuemi...@nvdia.com; dev@dpdk.org; Stokes, Ian ;
> Mcnamara, John ; Luca Boccassi
> ; Xu, Qian Q ; Thomas Monjalon
> ; Peng, Yuan ; Chen,
> Zhaoyan 
> Subject: Re: 22.11.3 patches review and test
> 
> On 04/09/2023 10:32, Kevin Traynor wrote:
> > On 01/09/2023 04:23, Zeng, ZhichaoX wrote:
> >>> -Original Message-
> >>> From: Kevin Traynor 
> >>> Sent: Thursday, August 31, 2023 8:18 PM
> >>> To: Xu, HailinX ; Xueming Li
> >>> ; sta...@dpdk.org; Wu, Jingjing
> >>> ; Xing, Beilei ; Xu,
> >>> Ke1 ; Zeng, ZhichaoX ;
> >>> Zhang, Qi Z 
> >>> Cc: xuemi...@nvdia.com; dev@dpdk.org; Stokes, Ian
> >>> ; Mcnamara, John ;
> >>> Luca Boccassi ; Xu, Qian Q ;
> >>> Thomas Monjalon ; Peng, Yuan
> >>> ; Chen, Zhaoyan 
> >>> Subject: Re: 22.11.3 patches review and test
> >>>
> >>> On 30/08/2023 17:25, Kevin Traynor wrote:
>  On 29/08/2023 09:22, Xu, HailinX wrote:
> >> -Original Message-
> >> From: Xueming Li 
> >> Sent: Thursday, August 17, 2023 2:14 PM
> >> To: sta...@dpdk.org
> >> Cc: xuemi...@nvdia.com; dev@dpdk.org; Abhishek Marathe
> >> ; Ali Alnubani
> >> ; Walker, Benjamin
> >> ; David Christensen
> >> ; Hemant Agrawal
> >>> ;
> >> Stokes, Ian ; Jerin Jacob
> >> ; Mcnamara, John
> ;
> >> Ju-Hyoung Lee ; Kevin Traynor
> >> ; Luca Boccassi ; Pei
> >> Zhang ; Xu, Qian Q ;
> >> Raslan Darawsheh ; Thomas Monjalon
> >> ; Yanghang Liu ;
> Peng,
> >> Yuan ; Chen, Zhaoyan
> >>> 
> >> Subject: 22.11.3 patches review and test
> >>
> >> Hi all,
> >>
> >> Here is a list of patches targeted for stable release 22.11.3.
> >>
> >> The planned date for the final release is 31st August.
> >>
> >> Please help with testing and validation of your use cases and
> >> report any issues/results with reply-all to this mail. For the
> >> final release the fixes and reported validations will be added to
> >> the release
> >>> notes.
> >>
> >> A release candidate tarball can be found at:
> >>
> >> https://dpdk.org/browse/dpdk-stable/tag/?id=v22.11.3-rc1
> >>
> >> These patches are located at branch 22.11 of dpdk-stable repo:
> >> https://dpdk.org/browse/dpdk-stable/
> >>
> >> Thanks.
> >
> > We are conducting DPDK testing and have found two issues.
> >
> > 1. The startup speed of testpmd is significantly slower in the os of 
> > SUSE
> >   This issue fix patch has been merged into main, But it has
> > not backported
> >>> to 22.11.3.
> >   Fix patch commit id on DPDK main:
> > 7e7b6762eac292e78c77ad37ec0973c0c944b845
> >
> > 2. The SCTP tunnel packet of iavf cannot be forwarded in avx512
> > mode
> >>>
> >>> Need to clarify this sentence. It looks like it is not a functional
> >>> bug where
> >>> avx512 mode is selected and then an SCTP tunnel packet cannot be sent.
> >>> Instead, it is a possible performance issue that avx512 mode will
> >>> not be selected where it might have been due to unneeded additions
> >>> (RTE_ETH_TX_OFFLOAD_*_TNL_TSO) to IAVF_TX_NO_VECTOR_FLAGS.
> >>>
> >>> @IAVF maintainers - please confirm my analysis is correct ?
> >>>
> >>> In that case, as it is a possible performance issue in a specific
> >>> case for a single driver I think it is non-critical for 21.11 and we
> >>> can just revert the patch on the branch and wait for 21.11.6 release in
> December.
> >>
> >> Hi Kevin,
> >>
> >> Since the LTS version of the IAVF driver does not support avx512
> >> checksum offload, the scalar path should be selected, but this patch
> >> makes it incorrectly select the
> >> avx512 path, and the SCTP tunnel packets can't be forwarded properly.
> >>
> >
> > ok, let's take a look at the patch and usage.
> >
> > diff --git a/drivers/net/iavf/iavf_rxtx.h
> > b/drivers/net/iavf/iavf_rxtx.h index 8d4a77271a..605ea3f824 100644
> > --- a/drivers/net/iavf/iavf_rxtx.h
> > +++ b/drivers/net/iavf/iavf_rxtx.h
> > @@ -32,4 +32,8 @@
> >   RTE_ETH_TX_OFFLOAD_MULTI_SEGS |  \
> >   RTE_ETH_TX_OFFLOAD_TCP_TSO | \
> > +   RTE_ETH_TX_OFFLOAD_VXLAN_TNL_TSO |   \
> > +   RTE_ETH_TX_OFFLOAD_GRE_TNL_TSO | \
> > +   RTE_ETH_TX_OFFLOAD_IPIP_TNL_TSO |\
> > +   RTE_ETH_TX_OFFLOAD_GENEVE_TNL_TSO |  \
> >   RTE_ETH_TX_OFFLOAD_SECURITY)
> >
> > 
> >
> > So we now have:
> > #define IAVF_TX_NO_VECTOR_FLAGS (\
> > RTE_ETH_TX_OFFLOAD_VLAN_INSERT | \
> > RTE_ETH_TX_OFFLOAD_QINQ_INSERT | \
> > RTE_ETH_TX_OFFLOAD_MULTI_SEGS |  \
> > RTE_ETH_TX_OFFLOAD_TCP_TS

Re: [RFC] cache guard

2023-09-04 Thread Mattias Rönnblom

On 2023-09-04 14:48, Morten Brørup wrote:
> From: Mattias Rönnblom [mailto:hof...@lysator.liu.se]
> Sent: Monday, 4 September 2023 14.07

On Fri, Aug 25, 2023 at 08:45:12AM +0200, Morten Brørup wrote:

Bruce,

With this patch [1], it is noted that the ring producer and consumer data
should not be on adjacent cache lines, for performance reasons.

[1]: https://git.dpdk.org/dpdk/commit/lib/librte_ring/rte_ring.h?id=d9f0d3a1ffd4b66e75485cc8b63b9aedfbdfe8b0

(It's obvious that they cannot share the same cache line, because they are
accessed by two different threads.)

Intuitively, I would think that having them on different cache lines would
suffice. Why does having an empty cache line between them make a difference?

And does it need to be an empty cache line? Or does it suffice having the
second structure start at two cache lines after the start of the first
structure (e.g. if the size of the first structure is two cache lines)?

I'm asking because the same principle might apply to other code too.

On Friday, 25 August 2023 10.23, Bruce Richardson wrote:

Hi Morten,

this was something we discovered when working on the distributor library.
If we have cachelines per core where there is heavy access, having some
cachelines as a gap between the content cachelines can help performance.
We believe this helps due to avoiding issues with the HW prefetchers
(e.g. adjacent cacheline prefetcher) bringing in the second cacheline
speculatively when an operation is done on the first line.

On Fri, Aug 25, 2023 at 11:06:01AM +0200, Morten Brørup wrote
(+CC mempool maintainers):

I guessed that it had something to do with speculative prefetching, but
wasn't sure. Good to get confirmation, and that it has a measurable effect
somewhere. Very interesting!

NB: More comments in the ring lib about stuff like this would be nice.

So, for the mempool lib, what do you think about applying the same
technique to the rte_mempool_debug_stats structure (which is an array
indexed per lcore)... Two adjacent lcores heavily accessing their local
mempool caches seems likely to me. But how heavy does the access need to
be for this technique to be relevant?

On Friday, 25 August 2023 11.24, Bruce Richardson wrote:

No idea how heavy the accesses need to be for this to have a noticeable
effect. For things like debug stats, I wonder how worthwhile making such
a change would be, but then again, any change would have very low impact
too in that case.

27/08/2023 10:34, Morten Brørup (+CC Honnappa and Konstantin, Ring lib
maintainers; +CC Mattias, PRNG lib maintainer):

I just tried adding padding to some of the hot structures in our own
application, and observed a significant performance improvement for those.

So I think this technique should have higher visibility in DPDK by adding
a new cache macro to rte_common.h.

On 2023-09-01 14:26, Thomas Monjalon wrote:

+1 to make more visibility in doc and adding a macro, good idea!

On 2023-09-01 18:58, Mattias Rönnblom wrote:

A worry I have is that for CPUs with large (in this context) N, you will
end up with a lot of padding to avoid next-N-lines false sharing. That
would be padding after, and in the general (non-array) case also before,
the actual per-lcore data. A slight nuisance is also that those prefetched
lines of padding will never contain anything useful, and thus fetching
them will always be a waste.

Out of curiosity, what is the largest N anyone here on the list is aware of?

On Monday, 4 September 2023 14.07, Mattias Rönnblom wrote:

Padding/alignment may not be the only way to avoid HW-prefetcher-induced
false sharing for per-lcore data structures.

What we are discussing here is organizing the statically allocated
per-lcore structs of a particular module in an array with the appropriate
padding/alignment. In this model, all data related to a particular module
is close (memory address/page-wise), but not so close as to cause false
sharing.

/* rte_a.c */

struct rte_a_state
{
	int x;
	RTE_CACHE_GUARD;
} __rte_cache_aligned;

static struct rte_a_state a_states[RTE_MAX_LCORE];

/* rte_b.c */

struct rte_b_state
{
	char y;
	char z;
	RTE_CACHE_GUARD;
} __rte_cache_aligned;

static struct rte_b_state b_states[RTE_MAX_LCORE];

What you would end up with at runtime, when the linker has done its job,
is something that essentially looks like this (in memory):

struct {
	struct rte_a_state a_states[RTE_MAX_LCORE];
	struct rte_b_state b_states[RTE_MAX_LCORE];
};

You could consider turning it around, and keeping data (i.e., module
structs) related to a particular lcore, for all modules, close. In other
words, keeping per-lcore arrays of variable-sized elements. So, something
that will end up looking like this (in memory, not in the source code
RE: [PATCH] clarify purpose of empty cache lines

2023-09-04 Thread Morten Brørup
> From: Morten Brørup [mailto:m...@smartsharesystems.com]
> Sent: Monday, 4 September 2023 10.44
> 
> This patch introduces the generic RTE_CACHE_GUARD macro into the EAL, and
> replaces vaguely described empty cache lines in the rte_ring structure
> with this macro.
> 
> Although the implementation of the rte_ring structure assumes that the
> hardware speculatively prefetches 1 cache line, this number can be changed
> at build time by modifying RTE_CACHE_GUARD_LINES in rte_config.h.
> 
> The background and the RFC was discussed in this thread:
> http://inbox.dpdk.org/dev/98CBD80474FA8B44BF855DF32C47DC35D87B39@smartserver.smartshare.dk/
> 
> Signed-off-by: Morten Brørup 
> ---
>  config/rte_config.h  |  1 +
>  lib/eal/include/rte_common.h | 13 +
>  lib/ring/rte_ring_core.h |  6 +++---
>  3 files changed, 17 insertions(+), 3 deletions(-)
> 
> diff --git a/config/rte_config.h b/config/rte_config.h
> index 400e44e3cf..cfdf787724 100644
> --- a/config/rte_config.h
> +++ b/config/rte_config.h
> @@ -37,6 +37,7 @@
>  #define RTE_MAX_TAILQ 32
>  #define RTE_LOG_DP_LEVEL RTE_LOG_INFO
>  #define RTE_MAX_VFIO_CONTAINERS 64
> +#define RTE_CACHE_GUARD_LINES 1
> 
>  /* bsd module defines */
>  #define RTE_CONTIGMEM_MAX_NUM_BUFS 64
> diff --git a/lib/eal/include/rte_common.h b/lib/eal/include/rte_common.h
> index 771c70f2c8..daf1866a32 100644
> --- a/lib/eal/include/rte_common.h
> +++ b/lib/eal/include/rte_common.h
> @@ -527,6 +527,19 @@ rte_is_aligned(const void * const __rte_restrict ptr,
> const unsigned int align)
>  /** Force minimum cache line alignment. */
>  #define __rte_cache_min_aligned __rte_aligned(RTE_CACHE_LINE_MIN_SIZE)
> 
> +#define _RTE_CACHE_GUARD_HELPER2(unique) \
> + char cache_guard_ ## unique[RTE_CACHE_LINE_SIZE * RTE_CACHE_GUARD_LINES] \
> + __rte_cache_aligned
> +#define _RTE_CACHE_GUARD_HELPER1(unique) _RTE_CACHE_GUARD_HELPER2(unique)
> +/**
> + * Empty cache lines, to guard against false sharing-like effects
> + * on systems with a next-N-lines hardware prefetcher.
> + *
> + * Use as spacing between data accessed by different lcores,
> + * to prevent cache thrashing on hardware with speculative prefetching.
> + */
> +#define RTE_CACHE_GUARD _RTE_CACHE_GUARD_HELPER1(__COUNTER__)
> +
>  /*** PA/IOVA type definitions /
> 
>  /** Physical address */
> diff --git a/lib/ring/rte_ring_core.h b/lib/ring/rte_ring_core.h
> index d1e59bf9ad..327fdcf28f 100644
> --- a/lib/ring/rte_ring_core.h
> +++ b/lib/ring/rte_ring_core.h
> @@ -126,7 +126,7 @@ struct rte_ring {
>   uint32_t mask;   /**< Mask (size-1) of ring. */
>   uint32_t capacity;   /**< Usable size of ring */
> 
> - char pad0 __rte_cache_aligned; /**< empty cache line */
> + RTE_CACHE_GUARD;
> 
>   /** Ring producer status. */
>   union {
> @@ -135,7 +135,7 @@ struct rte_ring {
>   struct rte_ring_rts_headtail rts_prod;
>   }  __rte_cache_aligned;
> 
> - char pad1 __rte_cache_aligned; /**< empty cache line */
> + RTE_CACHE_GUARD;
> 
>   /** Ring consumer status. */
>   union {
> @@ -144,7 +144,7 @@ struct rte_ring {
>   struct rte_ring_rts_headtail rts_cons;
>   }  __rte_cache_aligned;
> 
> - char pad2 __rte_cache_aligned; /**< empty cache line */
> + RTE_CACHE_GUARD;
>  };
> 
>  #define RING_F_SP_ENQ 0x0001 /**< The default enqueue is "single-producer".
> */
> --
> 2.17.1

I asked Mattias Rönnblom in the RFC discussion thread to ACK this patch, which 
he then did [1], so I'm copying his ACK here for patchwork to see.

[1]: 
http://inbox.dpdk.org/dev/66b862bc-ca22-02da-069c-a82c2d52e...@lysator.liu.se/

Acked-by: Mattias Rönnblom