From: Nelio Laranjeiro <>

ConnectX-4 adapters do not have a constant indirection table size, which is
set at runtime from the number of RX queues. The maximum size is retrieved
using a hardware query and is normally 512.
Since the current RETA API cannot handle a variable size, any query/update
command causes it to be silently updated to RSS_INDIRECTION_TABLE_SIZE
entries regardless of the original size.

Also due to the underlying type of the configuration structure, the maximum
size is limited to RSS_INDIRECTION_TABLE_SIZE (currently 128, at most 256
entries).

A port stop/start must be done to apply the new RETA configuration.

Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro at>
 drivers/net/mlx5/mlx5.c        |   8 +-
 drivers/net/mlx5/mlx5.h        |   7 ++
 drivers/net/mlx5/mlx5_ethdev.c |  29 ++++++++
 drivers/net/mlx5/mlx5_rss.c    | 163 +++++++++++++++++++++++++++++++++++++++++
 drivers/net/mlx5/mlx5_rxq.c    |  53 ++------------
 drivers/net/mlx5/mlx5_utils.h  |  20 +++++
 6 files changed, 234 insertions(+), 46 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 9636588..5a95260 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -133,6 +133,8 @@ mlx5_dev_close(struct rte_eth_dev *dev)
+       if (priv->reta_idx != NULL)
+               rte_free(priv->reta_idx);
        memset(priv, 0, sizeof(*priv));
@@ -160,6 +162,8 @@ static const struct eth_dev_ops mlx5_dev_ops = {
        .mac_addr_remove = mlx5_mac_addr_remove,
        .mac_addr_add = mlx5_mac_addr_add,
        .mtu_set = mlx5_dev_set_mtu,
+       .reta_update = mlx5_dev_rss_reta_update,
+       .reta_query = mlx5_dev_rss_reta_query,
        .rss_hash_update = mlx5_rss_hash_update,
        .rss_hash_conf_get = mlx5_rss_hash_conf_get,
@@ -373,7 +377,9 @@ mlx5_pci_devinit(struct rte_pci_driver *pci_drv, struct 
rte_pci_device *pci_dev)
                DEBUG("L2 tunnel checksum offloads are %ssupported",
                      (priv->hw_csum_l2tun ? "" : "not "));

-               priv->ind_table_max_size = 
+               priv->ind_table_max_size =
+                       RTE_MIN((unsigned int)RSS_INDIRECTION_TABLE_SIZE,
                DEBUG("maximum RX indirection table size is %u",

diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 0daacc8..b84d31d 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -118,6 +118,8 @@ struct priv {
        /* RSS configuration array indexed by hash RX queue type. */
        struct rte_eth_rss_conf *(*rss_conf)[];
        struct rte_intr_handle intr_handle; /* Interrupt handler. */
+       unsigned int (*reta_idx)[]; /* RETA index table. */
+       unsigned int reta_idx_n; /* RETA index size. */
        rte_spinlock_t lock; /* Lock for control functions. */

@@ -184,6 +186,11 @@ int rss_hash_rss_conf_new_key(struct priv *, const uint8_t 
*, unsigned int,
 int mlx5_rss_hash_update(struct rte_eth_dev *, struct rte_eth_rss_conf *);
 int mlx5_rss_hash_conf_get(struct rte_eth_dev *, struct rte_eth_rss_conf *);
+int priv_rss_reta_index_resize(struct priv *, unsigned int);
+int mlx5_dev_rss_reta_query(struct rte_eth_dev *,
+                           struct rte_eth_rss_reta_entry64 *, uint16_t);
+int mlx5_dev_rss_reta_update(struct rte_eth_dev *,
+                            struct rte_eth_rss_reta_entry64 *, uint16_t);

 /* mlx5_rxmode.c */

diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index 84e877c..1159fa3 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -410,6 +410,9 @@ dev_configure(struct rte_eth_dev *dev)
        struct priv *priv = dev->data->dev_private;
        unsigned int rxqs_n = dev->data->nb_rx_queues;
        unsigned int txqs_n = dev->data->nb_tx_queues;
+       unsigned int i;
+       unsigned int j;
+       unsigned int reta_idx_n;

        priv->rxqs = (void *)dev->data->rx_queues;
        priv->txqs = (void *)dev->data->tx_queues;
@@ -418,11 +421,31 @@ dev_configure(struct rte_eth_dev *dev)
                     (void *)dev, priv->txqs_n, txqs_n);
                priv->txqs_n = txqs_n;
+       if (rxqs_n > priv->ind_table_max_size) {
+               ERROR("cannot handle this many RX queues (%u)", rxqs_n);
+               return EINVAL;
+       }
        if (rxqs_n == priv->rxqs_n)
                return 0;
        INFO("%p: RX queues number update: %u -> %u",
             (void *)dev, priv->rxqs_n, rxqs_n);
        priv->rxqs_n = rxqs_n;
+       /* If the requested number of RX queues is not a power of two, use the
+        * maximum indirection table size for better balancing.
+        * The result is always rounded to the next power of two. */
+       reta_idx_n = (1 << log2above((rxqs_n & (rxqs_n - 1)) ?
+                                    priv->ind_table_max_size :
+                                    rxqs_n));
+       if (priv_rss_reta_index_resize(priv, reta_idx_n))
+               return ENOMEM;
+       /* When the number of RX queues is not a power of two, the remaining
+        * table entries are padded with reused WQs and hashes are not spread
+        * uniformly. */
+       for (i = 0, j = 0; (i != reta_idx_n); ++i) {
+               (*priv->reta_idx)[i] = j;
+               if (++j == rxqs_n)
+                       j = 0;
+       }
        return 0;

@@ -494,6 +517,12 @@ mlx5_dev_infos_get(struct rte_eth_dev *dev, struct 
rte_eth_dev_info *info)
        if (priv_get_ifname(priv, &ifname) == 0)
                info->if_index = if_nametoindex(ifname);
+       /* FIXME: RETA update/query API expects the callee to know the size of
+        * the indirection table, for this PMD the size varies depending on
+        * the number of RX queues, it becomes impossible to find the correct
+        * size if it is not fixed.
+        * The API should be updated to solve this problem. */
+       info->reta_size = priv->ind_table_max_size;

diff --git a/drivers/net/mlx5/mlx5_rss.c b/drivers/net/mlx5/mlx5_rss.c
index bf19aca..7eb688a 100644
--- a/drivers/net/mlx5/mlx5_rss.c
+++ b/drivers/net/mlx5/mlx5_rss.c
@@ -211,3 +211,166 @@ mlx5_rss_hash_conf_get(struct rte_eth_dev *dev,
        return 0;
+ * Allocate/reallocate RETA index table.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ * @param reta_size
+ *   The size of the array to allocate.
+ *
+ * @return
+ *   0 on success, errno value on failure.
+ */
+priv_rss_reta_index_resize(struct priv *priv, unsigned int reta_size)
+       void *mem;
+       unsigned int old_size = priv->reta_idx_n;
+       if (priv->reta_idx_n == reta_size)
+               return 0;
+       mem = rte_realloc(priv->reta_idx,
+                         reta_size * sizeof((*priv->reta_idx)[0]), 0);
+       if (!mem)
+               return ENOMEM;
+       priv->reta_idx = mem;
+       priv->reta_idx_n = reta_size;
+       if (old_size < reta_size)
+               memset(&(*priv->reta_idx)[old_size], 0,
+                      (reta_size - old_size) *
+                      sizeof((*priv->reta_idx)[0]));
+       return 0;
+ * Query RETA table.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ * @param[in, out] reta_conf
+ *   Pointer to the first RETA configuration structure.
+ * @param reta_size
+ *   Number of entries.
+ *
+ * @return
+ *   0 on success, errno value on failure.
+ */
+static int
+priv_dev_rss_reta_query(struct priv *priv,
+                       struct rte_eth_rss_reta_entry64 *reta_conf,
+                       unsigned int reta_size)
+       unsigned int idx;
+       unsigned int i;
+       int ret;
+       /* See RETA comment in mlx5_dev_infos_get(). */
+       ret = priv_rss_reta_index_resize(priv, priv->ind_table_max_size);
+       if (ret)
+               return ret;
+       /* Fill each entry of the table even if its bit is not set. */
+       for (idx = 0, i = 0; (i != reta_size); ++i) {
+               idx = i / RTE_RETA_GROUP_SIZE;
+               reta_conf[idx].reta[i % RTE_RETA_GROUP_SIZE] =
+                       (*priv->reta_idx)[i];
+       }
+       return 0;
+ * Update RETA table.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ * @param[in] reta_conf
+ *   Pointer to the first RETA configuration structure.
+ * @param reta_size
+ *   Number of entries.
+ *
+ * @return
+ *   0 on success, errno value on failure.
+ */
+static int
+priv_dev_rss_reta_update(struct priv *priv,
+                        struct rte_eth_rss_reta_entry64 *reta_conf,
+                        unsigned int reta_size)
+       unsigned int idx;
+       unsigned int i;
+       unsigned int pos;
+       int ret;
+       /* See RETA comment in mlx5_dev_infos_get(). */
+       ret = priv_rss_reta_index_resize(priv, priv->ind_table_max_size);
+       if (ret)
+               return ret;
+       for (idx = 0, i = 0; (i != reta_size); ++i) {
+               idx = i / RTE_RETA_GROUP_SIZE;
+               pos = i % RTE_RETA_GROUP_SIZE;
+               if (((reta_conf[idx].mask >> i) & 0x1) == 0)
+                       continue;
+               assert(reta_conf[idx].reta[pos] < priv->rxqs_n);
+               (*priv->reta_idx)[i] = reta_conf[idx].reta[pos];
+       }
+       return 0;
+ * DPDK callback to get the RETA indirection table.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param reta_conf
+ *   Pointer to RETA configuration structure array.
+ * @param reta_size
+ *   Size of the RETA table.
+ *
+ * @return
+ *   0 on success, negative errno value on failure.
+ */
+mlx5_dev_rss_reta_query(struct rte_eth_dev *dev,
+                       struct rte_eth_rss_reta_entry64 *reta_conf,
+                       uint16_t reta_size)
+       int ret;
+       struct priv *priv = dev->data->dev_private;
+       priv_lock(priv);
+       ret = priv_dev_rss_reta_query(priv, reta_conf, reta_size);
+       priv_unlock(priv);
+       return -ret;
+ * DPDK callback to update the RETA indirection table.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param reta_conf
+ *   Pointer to RETA configuration structure array.
+ * @param reta_size
+ *   Size of the RETA table.
+ *
+ * @return
+ *   0 on success, negative errno value on failure.
+ */
+mlx5_dev_rss_reta_update(struct rte_eth_dev *dev,
+                        struct rte_eth_rss_reta_entry64 *reta_conf,
+                        uint16_t reta_size)
+       int ret;
+       struct priv *priv = dev->data->dev_private;
+       priv_lock(priv);
+       ret = priv_dev_rss_reta_update(priv, reta_conf, reta_size);
+       priv_unlock(priv);
+       return -ret;
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index 084bf41..3d7ae7e 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -259,26 +259,6 @@ hash_rxq_flow_attr(const struct hash_rxq *hash_rxq,

- * Return nearest power of two above input value.
- *
- * @param v
- *   Input value.
- *
- * @return
- *   Nearest power of two above input value.
- */
-static unsigned int
-log2above(unsigned int v)
-       unsigned int l;
-       unsigned int r;
-       for (l = 0, r = 0; (v >> 1); ++l, v >>= 1)
-               r |= (v & 1);
-       return (l + r);
  * Return the type corresponding to the n'th bit set.
  * @param table
@@ -360,14 +340,7 @@ priv_make_ind_table_init(struct priv *priv,
 priv_create_hash_rxqs(struct priv *priv)
-       /* If the requested number of WQs is not a power of two, use the
-        * maximum indirection table size for better balancing.
-        * The result is always rounded to the next power of two. */
-       unsigned int wqs_n =
-               (1 << log2above((priv->rxqs_n & (priv->rxqs_n - 1)) ?
-                               priv->ind_table_max_size :
-                               priv->rxqs_n));
-       struct ibv_exp_wq *wqs[wqs_n];
+       struct ibv_exp_wq *wqs[priv->reta_idx_n];
        struct ind_table_init ind_table_init[IND_TABLE_INIT_N];
        unsigned int ind_tables_n =
                priv_make_ind_table_init(priv, &ind_table_init);
@@ -393,25 +366,15 @@ priv_create_hash_rxqs(struct priv *priv)
                      " indirection table cannot be created");
                return EINVAL;
-       if ((wqs_n < priv->rxqs_n) || (wqs_n > priv->ind_table_max_size)) {
-               ERROR("cannot handle this many RX queues (%u)", priv->rxqs_n);
-               err = ERANGE;
-               goto error;
-       }
-       if (wqs_n != priv->rxqs_n) {
+       if (priv->rxqs_n & (priv->rxqs_n - 1)) {
                INFO("%u RX queues are configured, consider rounding this"
                     " number to the next power of two for better balancing",
-               DEBUG("indirection table extended to assume %u WQs", wqs_n);
-       }
-       /* When the number of RX queues is not a power of two, the remaining
-        * table entries are padded with reused WQs and hashes are not spread
-        * uniformly. */
-       for (i = 0, j = 0; (i != wqs_n); ++i) {
-               wqs[i] = (*priv->rxqs)[j]->wq;
-               if (++j == priv->rxqs_n)
-                       j = 0;
+               DEBUG("indirection table extended to assume %u WQs",
+                     priv->reta_idx_n);
+       for (i = 0; (i != priv->reta_idx_n); ++i)
+               wqs[i] = (*priv->rxqs)[(*priv->reta_idx)[i]]->wq;
        /* Get number of hash RX queues to configure. */
        for (i = 0, hash_rxqs_n = 0; (i != ind_tables_n); ++i)
                hash_rxqs_n += ind_table_init[i].hash_types_n;
@@ -436,8 +399,8 @@ priv_create_hash_rxqs(struct priv *priv)
                unsigned int ind_tbl_size = ind_table_init[i].max_size;
                struct ibv_exp_rwq_ind_table *ind_table;

-               if (wqs_n < ind_tbl_size)
-                       ind_tbl_size = wqs_n;
+               if (priv->reta_idx_n < ind_tbl_size)
+                       ind_tbl_size = priv->reta_idx_n;
                ind_init_attr.log_ind_tbl_size = log2above(ind_tbl_size);
                errno = 0;
                ind_table = ibv_exp_create_rwq_ind_table(priv->ctx,
diff --git a/drivers/net/mlx5/mlx5_utils.h b/drivers/net/mlx5/mlx5_utils.h
index f1fad18..9b5e86a 100644
--- a/drivers/net/mlx5/mlx5_utils.h
+++ b/drivers/net/mlx5/mlx5_utils.h
@@ -161,4 +161,24 @@ pmd_drv_log_basename(const char *s)
        snprintf(name, sizeof(name), __VA_ARGS__)

+ * Return the base-2 logarithm of the input value, rounded up.
+ *
+ * @param v
+ *   Input value.
+ *
+ * @return
+ *   Smallest exponent l such that (1 << l) >= v (0 when v is 0 or 1).
+ */
+static inline unsigned int
+log2above(unsigned int v)
+       unsigned int l;
+       unsigned int r;
+       for (l = 0, r = 0; (v >> 1); ++l, v >>= 1)
+               r |= (v & 1);
+       return (l + r);
 #endif /* RTE_PMD_MLX5_UTILS_H_ */

Reply via email to