Tunnel Offload API provides hardware independent, unified model
to offload tunneled traffic. Key model elements are:
 - apply matches to both outer and inner packet headers
   during entire offload procedure;
 - restore outer header of partially offloaded packet;
 - model is implemented as a set of helper functions.

Implementation details:
* tunnel_offload PMD parameter must be set to 1 to enable the feature.
* application cannot use MARK and META flow actions with tunnel.
* offload JUMP action is restricted to steering tunnel rule only.

Signed-off-by: Gregory Etelson <getel...@nvidia.com>
Acked-by: Viacheslav Ovsiienko <viachesl...@nvidia.com>
---
v2:
* introduce MLX5 PMD API implementation
v3:
* bug fixes
---
 doc/guides/nics/mlx5.rst         |   3 +
 drivers/net/mlx5/linux/mlx5_os.c |  18 +
 drivers/net/mlx5/mlx5.c          |   8 +-
 drivers/net/mlx5/mlx5.h          |   3 +
 drivers/net/mlx5/mlx5_defs.h     |   2 +
 drivers/net/mlx5/mlx5_flow.c     | 678 ++++++++++++++++++++++++++++++-
 drivers/net/mlx5/mlx5_flow.h     | 173 +++++++-
 drivers/net/mlx5/mlx5_flow_dv.c  | 241 +++++++++--
 8 files changed, 1080 insertions(+), 46 deletions(-)

diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst
index b0614ae335..03eec3503d 100644
--- a/doc/guides/nics/mlx5.rst
+++ b/doc/guides/nics/mlx5.rst
@@ -815,6 +815,9 @@ Driver options
     24 bits. The actual supported width can be retrieved in runtime by
     series of rte_flow_validate() trials.
 
+  - 3, this engages tunnel offload mode. In E-Switch configuration, that
+    mode implicitly activates ``dv_xmeta_en=1``.
+
   +------+-----------+-----------+-------------+-------------+
   | Mode | ``MARK``  | ``META``  | ``META`` Tx | FDB/Through |
   +======+===========+===========+=============+=============+
diff --git a/drivers/net/mlx5/linux/mlx5_os.c b/drivers/net/mlx5/linux/mlx5_os.c
index 188a6d4c38..70db822987 100644
--- a/drivers/net/mlx5/linux/mlx5_os.c
+++ b/drivers/net/mlx5/linux/mlx5_os.c
@@ -298,6 +298,12 @@ mlx5_alloc_shared_dr(struct mlx5_priv *priv)
                sh->esw_drop_action = mlx5_glue->dr_create_flow_action_drop();
        }
 #endif
+       if (!sh->tunnel_hub)
+               err = mlx5_alloc_tunnel_hub(sh);
+       if (err) {
+               DRV_LOG(ERR, "mlx5_alloc_tunnel_hub failed err=%d", err);
+               goto error;
+       }
        if (priv->config.reclaim_mode == MLX5_RCM_AGGR) {
                mlx5_glue->dr_reclaim_domain_memory(sh->rx_domain, 1);
                mlx5_glue->dr_reclaim_domain_memory(sh->tx_domain, 1);
@@ -344,6 +350,10 @@ mlx5_alloc_shared_dr(struct mlx5_priv *priv)
                mlx5_hlist_destroy(sh->tag_table, NULL, NULL);
                sh->tag_table = NULL;
        }
+       if (sh->tunnel_hub) {
+               mlx5_release_tunnel_hub(sh, priv->dev_port);
+               sh->tunnel_hub = NULL;
+       }
        mlx5_free_table_hash_list(priv);
        return err;
 }
@@ -405,6 +415,10 @@ mlx5_os_free_shared_dr(struct mlx5_priv *priv)
                mlx5_hlist_destroy(sh->tag_table, NULL, NULL);
                sh->tag_table = NULL;
        }
+       if (sh->tunnel_hub) {
+               mlx5_release_tunnel_hub(sh, priv->dev_port);
+               sh->tunnel_hub = NULL;
+       }
        mlx5_free_table_hash_list(priv);
 }
 
@@ -658,6 +672,10 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
                        strerror(rte_errno));
                goto error;
        }
+       if (config->dv_miss_info) {
+               if (switch_info->master || switch_info->representor)
+                       config->dv_xmeta_en = MLX5_XMETA_MODE_META16;
+       }
        mlx5_malloc_mem_select(config->sys_mem_en);
        sh = mlx5_alloc_shared_dev_ctx(spawn, config);
        if (!sh)
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 01ead6e6af..88d843fc4e 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -1591,13 +1591,17 @@ mlx5_args_check(const char *key, const char *val, void 
*opaque)
        } else if (strcmp(MLX5_DV_XMETA_EN, key) == 0) {
                if (tmp != MLX5_XMETA_MODE_LEGACY &&
                    tmp != MLX5_XMETA_MODE_META16 &&
-                   tmp != MLX5_XMETA_MODE_META32) {
+                   tmp != MLX5_XMETA_MODE_META32 &&
+                   tmp != MLX5_XMETA_MODE_MISS_INFO) {
                        DRV_LOG(ERR, "invalid extensive "
                                     "metadata parameter");
                        rte_errno = EINVAL;
                        return -rte_errno;
                }
-               config->dv_xmeta_en = tmp;
+               if (tmp != MLX5_XMETA_MODE_MISS_INFO)
+                       config->dv_xmeta_en = tmp;
+               else
+                       config->dv_miss_info = 1;
        } else if (strcmp(MLX5_LACP_BY_USER, key) == 0) {
                config->lacp_by_user = !!tmp;
        } else if (strcmp(MLX5_MR_EXT_MEMSEG_EN, key) == 0) {
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index bd91e167e0..b008531736 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -206,6 +206,7 @@ struct mlx5_dev_config {
        unsigned int rt_timestamp:1; /* realtime timestamp format. */
        unsigned int sys_mem_en:1; /* The default memory allocator. */
        unsigned int decap_en:1; /* Whether decap will be used or not. */
+       unsigned int dv_miss_info:1; /* restore packet after partial hw miss */
        struct {
                unsigned int enabled:1; /* Whether MPRQ is enabled. */
                unsigned int stride_num_n; /* Number of strides. */
@@ -632,6 +633,8 @@ struct mlx5_dev_ctx_shared {
        /* UAR same-page access control required in 32bit implementations. */
 #endif
        struct mlx5_hlist *flow_tbls;
+       struct rte_hash *flow_tbl_map; /* app group-to-flow table map */
+       struct mlx5_flow_tunnel_hub *tunnel_hub;
        /* Direct Rules tables for FDB, NIC TX+RX */
        void *esw_drop_action; /* Pointer to DR E-Switch drop action. */
        void *pop_vlan_action; /* Pointer to DR pop VLAN action. */
diff --git a/drivers/net/mlx5/mlx5_defs.h b/drivers/net/mlx5/mlx5_defs.h
index 0df47391ee..41a7537d5e 100644
--- a/drivers/net/mlx5/mlx5_defs.h
+++ b/drivers/net/mlx5/mlx5_defs.h
@@ -165,6 +165,8 @@
 #define MLX5_XMETA_MODE_LEGACY 0
 #define MLX5_XMETA_MODE_META16 1
 #define MLX5_XMETA_MODE_META32 2
+/* Provide info on partial hw miss. Implies MLX5_XMETA_MODE_META16 */
+#define MLX5_XMETA_MODE_MISS_INFO 3
 
 /* MLX5_TX_DB_NC supported values. */
 #define MLX5_TXDB_CACHED 0
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index ffa7646ca4..36c1aa4543 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -18,6 +18,7 @@
 #include <rte_flow_driver.h>
 #include <rte_malloc.h>
 #include <rte_ip.h>
+#include <rte_hash.h>
 
 #include <mlx5_glue.h>
 #include <mlx5_devx_cmds.h>
@@ -30,6 +31,18 @@
 #include "mlx5_flow_os.h"
 #include "mlx5_rxtx.h"
 
+static struct mlx5_flow_tunnel *
+mlx5_find_tunnel_id(struct rte_eth_dev *dev, uint32_t id);
+static void
+mlx5_flow_tunnel_free(struct rte_eth_dev *dev, struct mlx5_flow_tunnel 
*tunnel);
+static uint32_t
+tunnel_flow_group_to_flow_table(struct rte_eth_dev *dev,
+                               const struct mlx5_flow_tunnel *tunnel,
+                               uint32_t group, uint32_t *table,
+                               struct rte_flow_error *error);
+static const struct mlx5_flow_tbl_data_entry  *
+tunnel_mark_decode(struct rte_eth_dev *dev, uint32_t mark);
+
 /** Device flow drivers. */
 extern const struct mlx5_flow_driver_ops mlx5_flow_verbs_drv_ops;
 
@@ -220,6 +233,171 @@ static const struct rte_flow_expand_node 
mlx5_support_expansion[] = {
        },
 };
 
+struct tunnel_validation {
+       bool verdict;
+       const char *msg;
+};
+
+static inline struct tunnel_validation
+mlx5_flow_tunnel_validate(struct rte_eth_dev *dev,
+                         struct rte_flow_tunnel *tunnel)
+{
+       struct tunnel_validation tv;
+
+       if (!is_tunnel_offload_active(dev)) {
+               tv.msg = "tunnel offload was not activated";
+               goto err;
+       } else if (!tunnel) {
+               tv.msg = "no application tunnel";
+               goto err;
+       }
+
+       switch (tunnel->type) {
+       default:
+               tv.msg = "unsupported tunnel type";
+               goto err;
+       case RTE_FLOW_ITEM_TYPE_VXLAN:
+               break;
+       }
+
+       tv.verdict = true;
+       return tv;
+
+err:
+       tv.verdict = false;
+       return tv;
+}
+
+static int
+mlx5_flow_tunnel_decap_set(struct rte_eth_dev *dev,
+                   struct rte_flow_tunnel *app_tunnel,
+                   struct rte_flow_action **actions,
+                   uint32_t *num_of_actions,
+                   struct rte_flow_error *error)
+{
+       int ret;
+       struct mlx5_flow_tunnel *tunnel;
+       struct tunnel_validation tv;
+
+       tv = mlx5_flow_tunnel_validate(dev, app_tunnel);
+       if (!tv.verdict)
+               return rte_flow_error_set(error, EINVAL,
+                                         RTE_FLOW_ERROR_TYPE_ACTION_CONF, NULL,
+                                         tv.msg);
+       ret = mlx5_get_flow_tunnel(dev, app_tunnel, &tunnel);
+       if (ret < 0) { /* ret is -errno; error_set wants a positive code */
+               return rte_flow_error_set(error, -ret,
+                                         RTE_FLOW_ERROR_TYPE_ACTION_CONF, NULL,
+                                         "failed to initialize pmd tunnel");
+       }
+       *actions = &tunnel->action;
+       *num_of_actions = 1;
+       return 0;
+}
+
+static int
+mlx5_flow_tunnel_match(struct rte_eth_dev *dev,
+                      struct rte_flow_tunnel *app_tunnel,
+                      struct rte_flow_item **items,
+                      uint32_t *num_of_items,
+                      struct rte_flow_error *error)
+{
+       int ret;
+       struct mlx5_flow_tunnel *tunnel;
+       struct tunnel_validation tv;
+
+       tv = mlx5_flow_tunnel_validate(dev, app_tunnel);
+       if (!tv.verdict)
+               return rte_flow_error_set(error, EINVAL,
+                                         RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
+                                         tv.msg);
+       ret = mlx5_get_flow_tunnel(dev, app_tunnel, &tunnel);
+       if (ret < 0) { /* ret is -errno; error_set wants a positive code */
+               return rte_flow_error_set(error, -ret,
+                                         RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
+                                         "failed to initialize pmd tunnel");
+       }
+       *items = &tunnel->item;
+       *num_of_items = 1;
+       return 0;
+}
+
+static int
+mlx5_flow_item_release(struct rte_eth_dev *dev,
+                      struct rte_flow_item *pmd_items,
+                      uint32_t num_items, struct rte_flow_error *err)
+{
+       struct mlx5_flow_tunnel_hub *thub = mlx5_tunnel_hub(dev);
+       struct mlx5_flow_tunnel *tun;
+
+       LIST_FOREACH(tun, &thub->tunnels, chain) {
+               if (&tun->item == pmd_items)
+                       break;
+       }
+       if (!tun || num_items != 1)
+               return rte_flow_error_set(err, EINVAL,
+                                         RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
+                                         "invalid argument");
+       if (!__atomic_sub_fetch(&tun->refctn, 1, __ATOMIC_RELAXED))
+               mlx5_flow_tunnel_free(dev, tun);
+       return 0;
+}
+
+static int
+mlx5_flow_action_release(struct rte_eth_dev *dev,
+                        struct rte_flow_action *pmd_actions,
+                        uint32_t num_actions, struct rte_flow_error *err)
+{
+       struct mlx5_flow_tunnel_hub *thub = mlx5_tunnel_hub(dev);
+       struct mlx5_flow_tunnel *tun;
+
+       LIST_FOREACH(tun, &thub->tunnels, chain) {
+               if (&tun->action == pmd_actions)
+                       break;
+       }
+       if (!tun || num_actions != 1)
+               return rte_flow_error_set(err, EINVAL,
+                                         RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
+                                         "invalid argument");
+       if (!__atomic_sub_fetch(&tun->refctn, 1, __ATOMIC_RELAXED))
+               mlx5_flow_tunnel_free(dev, tun);
+
+       return 0;
+}
+
+static int
+mlx5_flow_tunnel_get_restore_info(struct rte_eth_dev *dev,
+                                 struct rte_mbuf *m,
+                                 struct rte_flow_restore_info *info,
+                                 struct rte_flow_error *err)
+{
+       uint64_t ol_flags = m->ol_flags;
+       const struct mlx5_flow_tbl_data_entry *tble;
+       const uint64_t mask = PKT_RX_FDIR | PKT_RX_FDIR_ID;
+
+       if ((ol_flags & mask) != mask)
+               goto err;
+       tble = tunnel_mark_decode(dev, m->hash.fdir.hi);
+       if (!tble) {
+               DRV_LOG(DEBUG, "port %u invalid miss tunnel mark %#x",
+                       dev->data->port_id, m->hash.fdir.hi);
+               goto err;
+       }
+       MLX5_ASSERT(tble->tunnel);
+       memcpy(&info->tunnel, &tble->tunnel->app_tunnel, sizeof(info->tunnel));
+       info->group_id = tble->group_id;
+       info->flags = RTE_FLOW_RESTORE_INFO_TUNNEL |
+                     RTE_FLOW_RESTORE_INFO_GROUP_ID |
+                     RTE_FLOW_RESTORE_INFO_ENCAPSULATED;
+
+       return 0;
+
+err:
+       return rte_flow_error_set(err, EINVAL,
+                                 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+                                 "failed to get restore info");
+}
+
 static const struct rte_flow_ops mlx5_flow_ops = {
        .validate = mlx5_flow_validate,
        .create = mlx5_flow_create,
@@ -229,6 +407,11 @@ static const struct rte_flow_ops mlx5_flow_ops = {
        .query = mlx5_flow_query,
        .dev_dump = mlx5_flow_dev_dump,
        .get_aged_flows = mlx5_flow_get_aged_flows,
+       .tunnel_decap_set = mlx5_flow_tunnel_decap_set,
+       .tunnel_match = mlx5_flow_tunnel_match,
+       .action_release = mlx5_flow_action_release,
+       .item_release = mlx5_flow_item_release,
+       .get_restore_info = mlx5_flow_tunnel_get_restore_info,
 };
 
 /* Convert FDIR request to Generic flow. */
@@ -3524,6 +3707,136 @@ flow_hairpin_split(struct rte_eth_dev *dev,
        return 0;
 }
 
+__extension__
+union tunnel_offload_mark {
+       uint32_t val;
+       struct {
+               uint32_t app_reserve:8;
+               uint32_t table_id:15;
+               uint32_t transfer:1;
+               uint32_t _unused_:8;
+       };
+};
+
+struct tunnel_default_miss_ctx {
+       uint16_t *queue;
+       __extension__
+       union {
+               struct rte_flow_action_rss action_rss;
+               struct rte_flow_action_queue miss_queue;
+               struct rte_flow_action_jump miss_jump;
+               uint8_t raw[0];
+       };
+};
+
+static int
+flow_tunnel_add_default_miss(struct rte_eth_dev *dev,
+                            struct rte_flow *flow,
+                            const struct rte_flow_attr *attr,
+                            const struct rte_flow_action *app_actions,
+                            uint32_t flow_idx,
+                            struct tunnel_default_miss_ctx *ctx,
+                            struct rte_flow_error *error)
+{
+       struct mlx5_flow *dev_flow;
+       struct rte_flow_attr miss_attr = *attr;
+       const struct mlx5_flow_tunnel *tunnel = app_actions[0].conf;
+       const struct rte_flow_item miss_items[2] = {
+               {
+                       .type = RTE_FLOW_ITEM_TYPE_ETH,
+                       .spec = NULL,
+                       .last = NULL,
+                       .mask = NULL
+               },
+               {
+                       .type = RTE_FLOW_ITEM_TYPE_END,
+                       .spec = NULL,
+                       .last = NULL,
+                       .mask = NULL
+               }
+       };
+       union tunnel_offload_mark mark_id;
+       struct rte_flow_action_mark miss_mark;
+       struct rte_flow_action miss_actions[3] = {
+               [0] = { .type = RTE_FLOW_ACTION_TYPE_MARK, .conf = &miss_mark },
+               [2] = { .type = RTE_FLOW_ACTION_TYPE_END,  .conf = NULL }
+       };
+       const struct rte_flow_action_jump *jump_data;
+       uint32_t i, flow_table = 0; /* prevent compilation warning */
+       int ret;
+
+       if (!attr->transfer) {
+               struct mlx5_priv *priv = dev->data->dev_private;
+               uint32_t q_size;
+
+               miss_actions[1].type = RTE_FLOW_ACTION_TYPE_RSS;
+               if (!priv->reta_idx_n || !priv->rxqs_n) /* before alloc */
+                       return rte_flow_error_set
+                               (error, EINVAL,
+                               RTE_FLOW_ERROR_TYPE_ACTION_CONF,
+                               NULL, "invalid port configuration");
+               q_size = priv->reta_idx_n * sizeof(ctx->queue[0]);
+               ctx->queue = mlx5_malloc(MLX5_MEM_SYS | MLX5_MEM_ZERO, q_size,
+                                        0, SOCKET_ID_ANY);
+               if (!ctx->queue)
+                       return rte_flow_error_set
+                               (error, ENOMEM,
+                               RTE_FLOW_ERROR_TYPE_ACTION_CONF,
+                               NULL, "invalid default miss RSS");
+               ctx->action_rss.func = RTE_ETH_HASH_FUNCTION_DEFAULT;
+               ctx->action_rss.level = 0;
+               ctx->action_rss.types = priv->rss_conf.rss_hf;
+               ctx->action_rss.key_len = priv->rss_conf.rss_key_len;
+               ctx->action_rss.queue_num = priv->reta_idx_n;
+               ctx->action_rss.key = priv->rss_conf.rss_key;
+               ctx->action_rss.queue = ctx->queue;
+               if (!(dev->data->dev_conf.rxmode.mq_mode & ETH_MQ_RX_RSS_FLAG))
+                       ctx->action_rss.types = 0;
+               for (i = 0; i != priv->reta_idx_n; ++i)
+                       ctx->queue[i] = (*priv->reta_idx)[i];
+       } else {
+               miss_actions[1].type = RTE_FLOW_ACTION_TYPE_JUMP;
+               ctx->miss_jump.group = MLX5_TNL_MISS_FDB_JUMP_GRP;
+       }
+       miss_actions[1].conf = (typeof(miss_actions[1].conf))ctx->raw;
+       for (; app_actions->type != RTE_FLOW_ACTION_TYPE_JUMP; app_actions++);
+       jump_data = app_actions->conf;
+       miss_attr.priority = MLX5_TNL_MISS_RULE_PRIORITY;
+       miss_attr.group = jump_data->group;
+       ret = tunnel_flow_group_to_flow_table(dev, tunnel, jump_data->group,
+                                             &flow_table, error);
+       if (ret)
+               return rte_flow_error_set(error, EINVAL,
+                                         RTE_FLOW_ERROR_TYPE_ACTION_CONF,
+                                         NULL, "invalid tunnel id");
+       mark_id.app_reserve = 0;
+       mark_id.table_id = tunnel_flow_tbl_to_id(flow_table);
+       mark_id.transfer = !!attr->transfer;
+       mark_id._unused_ = 0;
+       miss_mark.id = mark_id.val;
+       dev_flow = flow_drv_prepare(dev, flow, &miss_attr,
+                                   miss_items, miss_actions, flow_idx, error);
+       if (!dev_flow)
+               return -rte_errno;
+       dev_flow->flow = flow;
+       dev_flow->external = true;
+       dev_flow->tunnel = tunnel;
+       /* Subflow object was created, we must include one in the list. */
+       SILIST_INSERT(&flow->dev_handles, dev_flow->handle_idx,
+                     dev_flow->handle, next);
+       DRV_LOG(DEBUG,
+               "port %u tunnel type=%d id=%u miss rule priority=%u group=%u",
+               dev->data->port_id, tunnel->app_tunnel.type,
+               tunnel->tunnel_id, miss_attr.priority, miss_attr.group);
+       ret = flow_drv_translate(dev, dev_flow, &miss_attr, miss_items,
+                                 miss_actions, error);
+       if (!ret)
+               ret = flow_mreg_update_copy_table(dev, flow, miss_actions,
+                                                 error);
+
+       return ret;
+}
+
 /**
  * The last stage of splitting chain, just creates the subflow
  * without any modification.
@@ -4296,6 +4609,27 @@ flow_create_split_outer(struct rte_eth_dev *dev,
        return ret;
 }
 
+static struct mlx5_flow_tunnel *
+flow_tunnel_from_rule(struct rte_eth_dev *dev,
+                     const struct rte_flow_attr *attr,
+                     const struct rte_flow_item items[],
+                     const struct rte_flow_action actions[])
+{
+       struct mlx5_flow_tunnel *tunnel;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wcast-qual"
+       if (is_flow_tunnel_match_rule(dev, attr, items, actions))
+               tunnel = (struct mlx5_flow_tunnel *)items[0].spec;
+       else if (is_flow_tunnel_steer_rule(dev, attr, items, actions))
+               tunnel = (struct mlx5_flow_tunnel *)actions[0].conf;
+       else
+               tunnel = NULL;
+#pragma GCC diagnostic pop
+
+       return tunnel;
+}
+
 /**
  * Create a flow and add it to @p list.
  *
@@ -4356,6 +4690,8 @@ flow_list_create(struct rte_eth_dev *dev, uint32_t *list,
        int hairpin_flow;
        uint32_t hairpin_id = 0;
        struct rte_flow_attr attr_tx = { .priority = 0 };
+       struct mlx5_flow_tunnel *tunnel;
+       struct tunnel_default_miss_ctx default_miss_ctx = { 0, };
        int ret;
 
        hairpin_flow = flow_check_hairpin_split(dev, attr, actions);
@@ -4430,6 +4766,19 @@ flow_list_create(struct rte_eth_dev *dev, uint32_t *list,
                                              error);
                if (ret < 0)
                        goto error;
+               if (is_flow_tunnel_steer_rule(dev, attr,
+                                             buf->entry[i].pattern,
+                                             p_actions_rx)) {
+                       ret = flow_tunnel_add_default_miss(dev, flow, attr,
+                                                          p_actions_rx,
+                                                          idx,
+                                                          &default_miss_ctx,
+                                                          error);
+                       if (ret < 0) {
+                               mlx5_free(default_miss_ctx.queue);
+                               goto error;
+                       }
+               }
        }
        /* Create the tx flow. */
        if (hairpin_flow) {
@@ -4484,6 +4833,13 @@ flow_list_create(struct rte_eth_dev *dev, uint32_t *list,
        priv->flow_idx = priv->flow_nested_idx;
        if (priv->flow_nested_idx)
                priv->flow_nested_idx = 0;
+       tunnel = flow_tunnel_from_rule(dev, attr, items, actions);
+       if (tunnel) {
+               flow->tunnel = 1;
+               flow->tunnel_id = tunnel->tunnel_id;
+               __atomic_add_fetch(&tunnel->refctn, 1, __ATOMIC_RELAXED);
+               mlx5_free(default_miss_ctx.queue);
+       }
        return idx;
 error:
        MLX5_ASSERT(flow);
@@ -4603,6 +4959,7 @@ mlx5_flow_create(struct rte_eth_dev *dev,
                                   "port not started");
                return NULL;
        }
+
        return (void *)(uintptr_t)flow_list_create(dev, &priv->flows,
                                  attr, items, actions, true, error);
 }
@@ -4657,6 +5014,13 @@ flow_list_destroy(struct rte_eth_dev *dev, uint32_t 
*list,
                }
        }
        mlx5_ipool_free(priv->sh->ipool[MLX5_IPOOL_RTE_FLOW], flow_idx);
+       if (flow->tunnel) {
+               struct mlx5_flow_tunnel *tunnel;
+               tunnel = mlx5_find_tunnel_id(dev, flow->tunnel_id);
+               RTE_VERIFY(tunnel);
+               if (!__atomic_sub_fetch(&tunnel->refctn, 1, __ATOMIC_RELAXED))
+                       mlx5_flow_tunnel_free(dev, tunnel);
+       }
 }
 
 /**
@@ -6131,19 +6495,122 @@ mlx5_flow_async_pool_query_handle(struct 
mlx5_dev_ctx_shared *sh,
        sh->cmng.pending_queries--;
 }
 
+static const struct mlx5_flow_tbl_data_entry  *
+tunnel_mark_decode(struct rte_eth_dev *dev, uint32_t mark)
+{
+       struct mlx5_priv *priv = dev->data->dev_private;
+       struct mlx5_dev_ctx_shared *sh = priv->sh;
+       struct mlx5_hlist_entry *he;
+       union tunnel_offload_mark mbits = { .val = mark };
+       union mlx5_flow_tbl_key table_key = {
+               {
+                       .table_id = tunnel_id_to_flow_tbl(mbits.table_id),
+                       .reserved = 0,
+                       .domain = !!mbits.transfer,
+                       .direction = 0,
+               }
+       };
+       he = mlx5_hlist_lookup(sh->flow_tbls, table_key.v64);
+       return he ?
+              container_of(he, struct mlx5_flow_tbl_data_entry, entry) : NULL;
+}
+
+static uint32_t
+tunnel_flow_group_to_flow_table(struct rte_eth_dev *dev,
+                               const struct mlx5_flow_tunnel *tunnel,
+                               uint32_t group, uint32_t *table,
+                               struct rte_flow_error *error)
+{
+       struct mlx5_hlist_entry *he;
+       struct tunnel_tbl_entry *tte;
+       union tunnel_tbl_key key = {
+               .tunnel_id = tunnel ? tunnel->tunnel_id : 0,
+               .group = group
+       };
+       struct mlx5_flow_tunnel_hub *thub = mlx5_tunnel_hub(dev);
+       struct mlx5_hlist *group_hash;
+
+       group_hash = tunnel ? tunnel->groups : thub->groups;
+       he = mlx5_hlist_lookup(group_hash, key.val);
+       if (!he) {
+               int ret;
+               tte = mlx5_malloc(MLX5_MEM_SYS | MLX5_MEM_ZERO,
+                                 sizeof(*tte), 0,
+                                 SOCKET_ID_ANY);
+               if (!tte)
+                       goto err;
+               tte->hash.key = key.val;
+               ret = mlx5_flow_id_get(thub->table_ids, &tte->flow_table);
+               if (ret) {
+                       mlx5_free(tte);
+                       goto err;
+               }
+               tte->flow_table = tunnel_id_to_flow_tbl(tte->flow_table);
+               mlx5_hlist_insert(group_hash, &tte->hash);
+       } else {
+               tte = container_of(he, typeof(*tte), hash);
+       }
+       *table = tte->flow_table;
+       DRV_LOG(DEBUG, "port %u tunnel %u group=%#x table=%#x",
+               dev->data->port_id, key.tunnel_id, group, *table);
+       return 0;
+
+err:
+       return rte_flow_error_set(error, EINVAL, RTE_FLOW_ERROR_TYPE_ATTR_GROUP,
+                                 NULL, "tunnel group index not supported");
+}
+
+static int
+flow_group_to_table(uint32_t port_id, uint32_t group, uint32_t *table,
+                   struct flow_grp_info grp_info, struct rte_flow_error *error)
+{
+       if (grp_info.transfer && grp_info.external && grp_info.fdb_def_rule) {
+               if (group == UINT32_MAX)
+                       return rte_flow_error_set
+                                               (error, EINVAL,
+                                                RTE_FLOW_ERROR_TYPE_ATTR_GROUP,
+                                                NULL,
+                                                "group index not supported");
+               *table = group + 1;
+       } else {
+               *table = group;
+       }
+       DRV_LOG(DEBUG, "port %u group=%#x table=%#x", port_id, group, *table);
+       return 0;
+}
+
 /**
  * Translate the rte_flow group index to HW table value.
  *
- * @param[in] attributes
- *   Pointer to flow attributes
- * @param[in] external
- *   Value is part of flow rule created by request external to PMD.
+ * If tunnel offload is disabled, all group ids are converted to flow table
+ * id using the standard method.
+ * If tunnel offload is enabled, group id can be converted using the
+ * standard or tunnel conversion method. Group conversion method
+ * selection depends on flags in `grp_info` parameter:
+ * - Internal (grp_info.external == 0) groups conversion uses the
+ *   standard method.
+ * - Group ids in JUMP action converted with the tunnel conversion.
+ * - Group id in rule attribute conversion depends on a rule type and
+ *   group id value:
+ *   ** non zero group attributes converted with the tunnel method
+ *   ** zero group attribute in non-tunnel rule is converted using the
+ *      standard method - there's only one root table
+ *   ** zero group attribute in steer tunnel rule is converted with the
+ *      standard method - single root table
+ *   ** zero group attribute in match tunnel rule is a special OvS
+ *      case: that value is used for portability reasons. That group
+ *      id is converted with the tunnel conversion method.
+ *
+ * @param[in] dev
+ *   Port device
+ * @param[in] tunnel
+ *   PMD tunnel offload object
  * @param[in] group
  *   rte_flow group index value.
- * @param[out] fdb_def_rule
- *   Whether fdb jump to table 1 is configured.
  * @param[out] table
  *   HW table value.
+ * @param[in] grp_info
+ *   flags used for conversion
  * @param[out] error
  *   Pointer to error structure.
  *
@@ -6151,22 +6618,34 @@ mlx5_flow_async_pool_query_handle(struct 
mlx5_dev_ctx_shared *sh,
  *   0 on success, a negative errno value otherwise and rte_errno is set.
  */
 int
-mlx5_flow_group_to_table(const struct rte_flow_attr *attributes, bool external,
-                        uint32_t group, bool fdb_def_rule, uint32_t *table,
+mlx5_flow_group_to_table(struct rte_eth_dev *dev,
+                        const struct mlx5_flow_tunnel *tunnel,
+                        uint32_t group, uint32_t *table,
+                        struct flow_grp_info grp_info,
                         struct rte_flow_error *error)
 {
-       if (attributes->transfer && external && fdb_def_rule) {
-               if (group == UINT32_MAX)
-                       return rte_flow_error_set
-                                               (error, EINVAL,
-                                                RTE_FLOW_ERROR_TYPE_ATTR_GROUP,
-                                                NULL,
-                                                "group index not supported");
-               *table = group + 1;
+       int ret;
+       bool standard_translation;
+
+       if (is_tunnel_offload_active(dev)) {
+               standard_translation = !grp_info.external ||
+                                       grp_info.std_tbl_fix;
        } else {
-               *table = group;
+               standard_translation = true;
        }
-       return 0;
+       DRV_LOG(DEBUG,
+               "port %u group=%#x transfer=%d external=%d fdb_def_rule=%d 
translate=%s",
+               dev->data->port_id, group, grp_info.transfer,
+               grp_info.external, grp_info.fdb_def_rule,
+               standard_translation ? "STANDARD" : "TUNNEL");
+       if (standard_translation)
+               ret = flow_group_to_table(dev->data->port_id, group, table,
+                                         grp_info, error);
+       else
+               ret = tunnel_flow_group_to_flow_table(dev, tunnel, group,
+                                                     table, error);
+
+       return ret;
 }
 
 /**
@@ -6305,3 +6784,166 @@ mlx5_flow_get_aged_flows(struct rte_eth_dev *dev, void **contexts,
                 dev->data->port_id);
        return -ENOTSUP;
 }
+
+/**
+ * Unlink a PMD tunnel from the hub list and release everything it owns.
+ *
+ * The tunnel must not be referenced any more: a non-zero reference
+ * counter trips RTE_VERIFY().
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ * @param tunnel
+ *   Tunnel to destroy; its id goes back to the hub tunnel id pool.
+ */
+static void
+mlx5_flow_tunnel_free(struct rte_eth_dev *dev,
+                     struct mlx5_flow_tunnel *tunnel)
+{
+       struct mlx5_flow_id_pool *ids = mlx5_tunnel_hub(dev)->tunnel_ids;
+
+       DRV_LOG(DEBUG, "port %u release pmd tunnel id=0x%x",
+               dev->data->port_id, tunnel->tunnel_id);
+       RTE_VERIFY(!__atomic_load_n(&tunnel->refctn, __ATOMIC_RELAXED));
+       /* Detach from the hub first, then give back the owned resources. */
+       LIST_REMOVE(tunnel, chain);
+       mlx5_flow_id_release(ids, tunnel->tunnel_id);
+       mlx5_hlist_destroy(tunnel->groups, NULL, NULL);
+       mlx5_free(tunnel);
+}
+
+/**
+ * Look up a PMD tunnel by its tunnel id.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ * @param id
+ *   Tunnel id to search for in the hub tunnels list.
+ *
+ * @return
+ *   Matching tunnel, NULL when no tunnel in the list carries that id.
+ */
+static struct mlx5_flow_tunnel *
+mlx5_find_tunnel_id(struct rte_eth_dev *dev, uint32_t id)
+{
+       struct mlx5_flow_tunnel_hub *hub = mlx5_tunnel_hub(dev);
+       struct mlx5_flow_tunnel *cur;
+
+       LIST_FOREACH(cur, &hub->tunnels, chain) {
+               if (cur->tunnel_id == id)
+                       return cur;
+       }
+
+       return NULL;
+}
+
+/**
+ * Allocate and initialize a PMD tunnel object for an application tunnel.
+ *
+ * A tunnel id is taken from the hub tunnel id pool. When a later
+ * allocation step fails only that id is handed back: the pool itself is
+ * shared by every tunnel of the hub and must stay alive.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ * @param app_tunnel
+ *   Application tunnel description, copied into the new object.
+ *
+ * @return
+ *   New tunnel (not linked into the hub tunnels list) on success,
+ *   NULL otherwise.
+ */
+static struct mlx5_flow_tunnel *
+mlx5_flow_tunnel_allocate(struct rte_eth_dev *dev,
+                         const struct rte_flow_tunnel *app_tunnel)
+{
+       int ret;
+       struct mlx5_flow_tunnel *tunnel;
+       struct mlx5_flow_tunnel_hub *thub = mlx5_tunnel_hub(dev);
+       struct mlx5_flow_id_pool *id_pool = thub->tunnel_ids;
+       uint32_t id;
+
+       ret = mlx5_flow_id_get(id_pool, &id);
+       if (ret)
+               return NULL;
+       /*
+        * mlx5 flow tunnel is an auxiliary data structure.
+        * It's not part of IO. No need to allocate it from
+        * huge pages pools dedicated for IO.
+        */
+       tunnel = mlx5_malloc(MLX5_MEM_SYS | MLX5_MEM_ZERO, sizeof(*tunnel),
+                            0, SOCKET_ID_ANY);
+       if (!tunnel)
+               goto err_id;
+       tunnel->groups = mlx5_hlist_create("tunnel groups", 1024);
+       if (!tunnel->groups)
+               goto err_tunnel;
+       /* initiate new PMD tunnel */
+       memcpy(&tunnel->app_tunnel, app_tunnel, sizeof(*app_tunnel));
+       tunnel->tunnel_id = id;
+       /* The private action and item both point back to the tunnel. */
+       tunnel->action.type = MLX5_RTE_FLOW_ACTION_TYPE_TUNNEL_SET;
+       tunnel->action.conf = tunnel;
+       tunnel->item.type = MLX5_RTE_FLOW_ITEM_TYPE_TUNNEL;
+       tunnel->item.spec = tunnel;
+       tunnel->item.last = NULL;
+       tunnel->item.mask = NULL;
+
+       DRV_LOG(DEBUG, "port %u new pmd tunnel id=0x%x",
+               dev->data->port_id, tunnel->tunnel_id);
+
+       return tunnel;
+
+err_tunnel:
+       mlx5_free(tunnel);
+err_id:
+       /* Give back the single id; never tear down the shared pool here. */
+       mlx5_flow_id_release(id_pool, id);
+       return NULL;
+}
+
+/**
+ * Get the PMD tunnel matching an application tunnel, creating it on demand.
+ *
+ * The hub tunnels list is searched for an entry whose stored copy of the
+ * application tunnel is byte-wise equal to @p app_tunnel. When none is
+ * found a new tunnel is allocated and linked at the list head. In both
+ * cases the tunnel reference counter is incremented.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ * @param app_tunnel
+ *   Application tunnel description.
+ * @param[out] tunnel
+ *   Receives the PMD tunnel on success; left untouched on failure.
+ *
+ * @return
+ *   0 on success, -ENOMEM when a new tunnel cannot be allocated.
+ */
+int
+mlx5_get_flow_tunnel(struct rte_eth_dev *dev,
+                    const struct rte_flow_tunnel *app_tunnel,
+                    struct mlx5_flow_tunnel **tunnel)
+{
+       struct mlx5_flow_tunnel_hub *thub = mlx5_tunnel_hub(dev);
+       struct mlx5_flow_tunnel *tun;
+
+       /* LIST_FOREACH leaves tun NULL when the list holds no match. */
+       LIST_FOREACH(tun, &thub->tunnels, chain) {
+               if (!memcmp(app_tunnel, &tun->app_tunnel,
+                           sizeof(*app_tunnel)))
+                       break;
+       }
+       if (!tun) {
+               tun = mlx5_flow_tunnel_allocate(dev, app_tunnel);
+               if (!tun)
+                       return -ENOMEM;
+               LIST_INSERT_HEAD(&thub->tunnels, tun, chain);
+       }
+       __atomic_add_fetch(&tun->refctn, 1, __ATOMIC_RELAXED);
+       *tunnel = tun;
+
+       /* Explicit success code for both the found and the created case. */
+       return 0;
+}
+
+/**
+ * Destroy the tunnel hub of a shared device context.
+ *
+ * Does nothing when the context has no hub. Tunnels still linked in the
+ * hub are only reported with a warning; they are not freed here.
+ *
+ * @param sh
+ *   Shared device context owning the hub.
+ * @param port_id
+ *   Port id used in the warning message only.
+ */
+void mlx5_release_tunnel_hub(struct mlx5_dev_ctx_shared *sh, uint16_t port_id)
+{
+       struct mlx5_flow_tunnel_hub *thub = sh->tunnel_hub;
+
+       if (!thub)
+               return;
+       /* No trailing newline, like the other DRV_LOG calls in this file. */
+       if (!LIST_EMPTY(&thub->tunnels))
+               DRV_LOG(WARNING, "port %u tunnels present", port_id);
+       mlx5_flow_id_pool_release(thub->tunnel_ids);
+       mlx5_flow_id_pool_release(thub->table_ids);
+       mlx5_hlist_destroy(thub->groups, NULL, NULL);
+       mlx5_free(thub);
+}
+
+/**
+ * Create the tunnel hub of a shared device context.
+ *
+ * The hub holds the tunnels list, the tunnel id pool, the table id pool
+ * and the hash list of non-tunnel groups. On any failure everything
+ * created so far is released and sh->tunnel_hub is left untouched.
+ *
+ * @param sh
+ *   Shared device context that receives the hub.
+ *
+ * @return
+ *   0 on success, -ENOMEM when the hub itself cannot be allocated,
+ *   -rte_errno when one of its members cannot be created.
+ */
+int mlx5_alloc_tunnel_hub(struct mlx5_dev_ctx_shared *sh)
+{
+       struct mlx5_flow_tunnel_hub *hub;
+       int rc;
+
+       hub = mlx5_malloc(MLX5_MEM_SYS | MLX5_MEM_ZERO, sizeof(*hub),
+                         0, SOCKET_ID_ANY);
+       if (hub == NULL)
+               return -ENOMEM;
+       LIST_INIT(&hub->tunnels);
+       hub->tunnel_ids = mlx5_flow_id_pool_alloc(MLX5_MAX_TUNNELS);
+       if (hub->tunnel_ids == NULL)
+               goto fail;
+       hub->table_ids = mlx5_flow_id_pool_alloc(MLX5_MAX_TABLES);
+       if (hub->table_ids == NULL)
+               goto fail;
+       hub->groups = mlx5_hlist_create("flow groups", MLX5_MAX_TABLES);
+       if (hub->groups == NULL)
+               goto fail;
+       sh->tunnel_hub = hub;
+
+       return 0;
+
+fail:
+       /* Latch the error code before the cleanup calls run. */
+       rc = -rte_errno;
+       /* Members not created yet are still NULL and are skipped. */
+       if (hub->groups != NULL)
+               mlx5_hlist_destroy(hub->groups, NULL, NULL);
+       if (hub->table_ids != NULL)
+               mlx5_flow_id_pool_release(hub->table_ids);
+       if (hub->tunnel_ids != NULL)
+               mlx5_flow_id_pool_release(hub->tunnel_ids);
+       mlx5_free(hub);
+       return rc;
+}
diff --git a/drivers/net/mlx5/mlx5_flow.h b/drivers/net/mlx5/mlx5_flow.h
index 279daf21f5..8691db16ab 100644
--- a/drivers/net/mlx5/mlx5_flow.h
+++ b/drivers/net/mlx5/mlx5_flow.h
@@ -26,6 +26,7 @@ enum mlx5_rte_flow_item_type {
        MLX5_RTE_FLOW_ITEM_TYPE_TAG,
        MLX5_RTE_FLOW_ITEM_TYPE_TX_QUEUE,
        MLX5_RTE_FLOW_ITEM_TYPE_VLAN,
+       MLX5_RTE_FLOW_ITEM_TYPE_TUNNEL,
 };
 
 /* Private (internal) rte flow actions. */
@@ -35,6 +36,7 @@ enum mlx5_rte_flow_action_type {
        MLX5_RTE_FLOW_ACTION_TYPE_MARK,
        MLX5_RTE_FLOW_ACTION_TYPE_COPY_MREG,
        MLX5_RTE_FLOW_ACTION_TYPE_DEFAULT_MISS,
+       MLX5_RTE_FLOW_ACTION_TYPE_TUNNEL_SET,
 };
 
 /* Matches on selected register. */
@@ -196,6 +198,8 @@ enum mlx5_feature_name {
 #define MLX5_FLOW_ACTION_SET_IPV6_DSCP (1ull << 33)
 #define MLX5_FLOW_ACTION_AGE (1ull << 34)
 #define MLX5_FLOW_ACTION_DEFAULT_MISS (1ull << 35)
+#define MLX5_FLOW_ACTION_TUNNEL_SET (1ull << 36)
+#define MLX5_FLOW_ACTION_TUNNEL_MATCH (1ull << 37)
 
 #define MLX5_FLOW_FATE_ACTIONS \
        (MLX5_FLOW_ACTION_DROP | MLX5_FLOW_ACTION_QUEUE | \
@@ -517,6 +521,10 @@ struct mlx5_flow_tbl_data_entry {
        struct mlx5_flow_dv_jump_tbl_resource jump;
        /**< jump resource, at most one for each table created. */
        uint32_t idx; /**< index for the indexed mempool. */
+       /* tunnel offload */
+       const struct mlx5_flow_tunnel *tunnel;
+       uint32_t group_id;
+       bool external;
 };
 
 /* Verbs specification header. */
@@ -695,6 +703,7 @@ struct mlx5_flow {
        };
        struct mlx5_flow_handle *handle;
        uint32_t handle_idx; /* Index of the mlx5 flow handle memory. */
+       const struct mlx5_flow_tunnel *tunnel;
 };
 
 /* Flow meter state. */
@@ -840,6 +849,112 @@ struct mlx5_fdir_flow {
 
 #define HAIRPIN_FLOW_ID_BITS 28
 
+#define MLX5_MAX_TUNNELS 256
+#define MLX5_TNL_MISS_RULE_PRIORITY 3
+#define MLX5_TNL_MISS_FDB_JUMP_GRP  0xfaac
+
+/*
+ * When tunnel offload is active, all JUMP group ids are converted
+ * using the same method. That conversion is applied both to tunnel and
+ * regular rule types.
+ * Group ids used in tunnel rules are relative to its tunnel (!).
+ * Application can create a number of steer rules, using the same
+ * tunnel, with different group id in each rule.
+ * Each tunnel stores its groups internally in PMD tunnel object.
+ * Groups used in regular rules do not belong to any tunnel and are stored
+ * in tunnel hub.
+ */
+
+/** PMD representation of one application tunnel. */
+struct mlx5_flow_tunnel {
+       /** Link in the hub tunnels list. */
+       LIST_ENTRY(mlx5_flow_tunnel) chain;
+       /** Copy of the application tunnel description. */
+       struct rte_flow_tunnel app_tunnel;
+       /** Unique tunnel id, taken from the hub tunnel id pool. */
+       uint32_t tunnel_id;
+       /** Reference counter, accessed with atomic builtins. */
+       uint32_t refctn;
+       /** Private TUNNEL_SET action; its conf points to this tunnel. */
+       struct rte_flow_action action;
+       /** Private TUNNEL item; its spec points to this tunnel. */
+       struct rte_flow_item item;
+       /** Groups used by rules of this tunnel. */
+       struct mlx5_hlist *groups;
+};
+
+/** PMD tunnel related context, one per shared device context. */
+struct mlx5_flow_tunnel_hub {
+       /** All PMD tunnels currently known to the hub. */
+       LIST_HEAD(, mlx5_flow_tunnel) tunnels;
+       /** Pool the tunnel ids are taken from. */
+       struct mlx5_flow_id_pool *tunnel_ids;
+       /** Pool of table ids. */
+       struct mlx5_flow_id_pool *table_ids;
+       /** Groups of regular rules, i.e. groups not owned by any tunnel. */
+       struct mlx5_hlist *groups;
+};
+
+/* Hash list entry converting a jump group to a flow table ID. */
+struct tunnel_tbl_entry {
+       struct mlx5_hlist_entry hash;   /* hash list linkage */
+       uint32_t flow_table;            /* flow table ID bound to the group */
+};
+
+/* Build a flow table ID from an id by setting bit 16. */
+static inline uint32_t
+tunnel_id_to_flow_tbl(uint32_t id)
+{
+       const uint32_t tbl_flag = 1u << 16;
+
+       return tbl_flag | id;
+}
+
+/* Recover the id from a flow table ID by clearing bit 16. */
+static inline uint32_t
+tunnel_flow_tbl_to_id(uint32_t flow_tbl)
+{
+       const uint32_t tbl_flag = 1u << 16;
+
+       return flow_tbl & ~tbl_flag;
+}
+
+/* 64-bit key built from a tunnel id and a group id. */
+union tunnel_tbl_key {
+       uint64_t val;                   /* whole key as one value */
+       struct {
+               uint32_t tunnel_id;     /* tunnel id part of the key */
+               uint32_t group;         /* group id part of the key */
+       };
+};
+
+/* Return the tunnel hub stored in the device shared context. */
+static inline struct mlx5_flow_tunnel_hub *
+mlx5_tunnel_hub(struct rte_eth_dev *dev)
+{
+       const struct mlx5_priv *port_priv = dev->data->dev_private;
+
+       return port_priv->sh->tunnel_hub;
+}
+
+/* Tell whether the dv_miss_info device configuration value is non-zero. */
+static inline bool
+is_tunnel_offload_active(struct rte_eth_dev *dev)
+{
+       const struct mlx5_priv *port_priv = dev->data->dev_private;
+
+       return port_priv->config.dv_miss_info != 0;
+}
+
+/*
+ * A rule is a tunnel match rule when its first pattern item is the
+ * private MLX5 tunnel item. Only items[0] is inspected.
+ */
+static inline bool
+is_flow_tunnel_match_rule(__rte_unused struct rte_eth_dev *dev,
+                         __rte_unused const struct rte_flow_attr *attr,
+                         __rte_unused const struct rte_flow_item items[],
+                         __rte_unused const struct rte_flow_action actions[])
+{
+       const struct rte_flow_item *first = &items[0];
+
+       return first->type ==
+              (typeof(first->type))MLX5_RTE_FLOW_ITEM_TYPE_TUNNEL;
+}
+
+/*
+ * A rule is a tunnel steer rule when its first action is the private
+ * MLX5 tunnel set action. Only actions[0] is inspected.
+ */
+static inline bool
+is_flow_tunnel_steer_rule(__rte_unused struct rte_eth_dev *dev,
+                         __rte_unused const struct rte_flow_attr *attr,
+                         __rte_unused const struct rte_flow_item items[],
+                         __rte_unused const struct rte_flow_action actions[])
+{
+       const struct rte_flow_action *first = &actions[0];
+
+       return first->type ==
+              (typeof(first->type))MLX5_RTE_FLOW_ACTION_TYPE_TUNNEL_SET;
+}
+
+/* Return the conf pointer of the first action as a PMD tunnel. */
+static inline const struct mlx5_flow_tunnel *
+flow_actions_to_tunnel(const struct rte_flow_action actions[])
+{
+       const struct rte_flow_action *first = actions;
+
+       return first->conf;
+}
+
+/* Return the spec pointer of the first pattern item as a PMD tunnel. */
+static inline const struct mlx5_flow_tunnel *
+flow_items_to_tunnel(const struct rte_flow_item items[])
+{
+       const struct rte_flow_item *first = items;
+
+       return first->spec;
+}
+
 /* Flow structure. */
 struct rte_flow {
        ILIST_ENTRY(uint32_t)next; /**< Index to the next flow structure. */
@@ -847,12 +962,14 @@ struct rte_flow {
        /**< Device flow handles that are part of the flow. */
        uint32_t drv_type:2; /**< Driver type. */
        uint32_t fdir:1; /**< Identifier of associated FDIR if any. */
+       uint32_t tunnel:1;
        uint32_t hairpin_flow_id:HAIRPIN_FLOW_ID_BITS;
        /**< The flow id used for hairpin. */
        uint32_t copy_applied:1; /**< The MARK copy Flow os applied. */
        uint32_t rix_mreg_copy;
        /**< Index to metadata register copy table resource. */
        uint32_t counter; /**< Holds flow counter. */
+       uint32_t tunnel_id;  /**< Tunnel id */
        uint16_t meter; /**< Holds flow meter id. */
 } __rte_packed;
 
@@ -935,9 +1052,54 @@ void mlx5_flow_id_pool_release(struct mlx5_flow_id_pool *pool);
 uint32_t mlx5_flow_id_get(struct mlx5_flow_id_pool *pool, uint32_t *id);
 uint32_t mlx5_flow_id_release(struct mlx5_flow_id_pool *pool,
                              uint32_t id);
-int mlx5_flow_group_to_table(const struct rte_flow_attr *attributes,
-                            bool external, uint32_t group, bool fdb_def_rule,
-                            uint32_t *table, struct rte_flow_error *error);
+/* Flags steering the group to table translation. */
+__extension__
+struct flow_grp_info {
+       /* rule marked as external */
+       uint64_t external:1;
+       /* rule carries the transfer attribute */
+       uint64_t transfer:1;
+       /* FDB default rule flag */
+       uint64_t fdb_def_rule:1;
+       /* force standard group translation */
+       uint64_t std_tbl_fix:1;
+};
+
+/*
+ * Decide whether attr->group of a rule is translated with the standard
+ * method (true) or with the tunnel method (false).
+ */
+static inline bool
+tunnel_use_standard_attr_group_translate
+                   (struct rte_eth_dev *dev,
+                    const struct mlx5_flow_tunnel *tunnel,
+                    const struct rte_flow_attr *attr,
+                    const struct rte_flow_item items[],
+                    const struct rte_flow_action actions[])
+{
+       /* no tunnel offload API */
+       if (!is_tunnel_offload_active(dev))
+               return true;
+       /* only the root group, attr.group == 0, can be standard */
+       if (attr->group)
+               return false;
+       /* root group of a non-tunnel rule uses the standard method */
+       if (!tunnel)
+               return true;
+       /*
+        * OvS will use jump to group 0 in tunnel steer rule.
+        * If tunnel steer rule starts from group 0 (attr.group == 0)
+        * that 0 group must be translated with standard method.
+        * attr.group == 0 in tunnel match rule is translated with tunnel
+        * method.
+        */
+       return is_flow_tunnel_steer_rule(dev, attr, items, actions);
+}
+
+int mlx5_flow_group_to_table(struct rte_eth_dev *dev,
+                            const struct mlx5_flow_tunnel *tunnel,
+                            uint32_t group, uint32_t *table,
+                            struct flow_grp_info flags,
+                                struct rte_flow_error *error);
 uint64_t mlx5_flow_hashfields_adjust(struct mlx5_flow_rss_desc *rss_desc,
                                     int tunnel, uint64_t layer_types,
                                     uint64_t hash_fields);
@@ -1069,4 +1231,9 @@ int mlx5_flow_destroy_policer_rules(struct rte_eth_dev *dev,
                                    const struct rte_flow_attr *attr);
 int mlx5_flow_meter_flush(struct rte_eth_dev *dev,
                          struct rte_mtr_error *error);
+int mlx5_get_flow_tunnel(struct rte_eth_dev *dev,
+                        const struct rte_flow_tunnel *app_tunnel,
+                        struct mlx5_flow_tunnel **tunnel);
+void mlx5_release_tunnel_hub(struct mlx5_dev_ctx_shared *sh, uint16_t port_id);
+int mlx5_alloc_tunnel_hub(struct mlx5_dev_ctx_shared *sh);
 #endif /* RTE_PMD_MLX5_FLOW_H_ */
diff --git a/drivers/net/mlx5/mlx5_flow_dv.c b/drivers/net/mlx5/mlx5_flow_dv.c
index 79fdf34c0e..380fb0fb09 100644
--- a/drivers/net/mlx5/mlx5_flow_dv.c
+++ b/drivers/net/mlx5/mlx5_flow_dv.c
@@ -3702,14 +3702,21 @@ flow_dv_validate_action_modify_ttl(const uint64_t action_flags,
  *   0 on success, a negative errno value otherwise and rte_errno is set.
  */
 static int
-flow_dv_validate_action_jump(const struct rte_flow_action *action,
+flow_dv_validate_action_jump(struct rte_eth_dev *dev,
+                            const struct mlx5_flow_tunnel *tunnel,
+                            const struct rte_flow_action *action,
                             uint64_t action_flags,
                             const struct rte_flow_attr *attributes,
                             bool external, struct rte_flow_error *error)
 {
        uint32_t target_group, table;
        int ret = 0;
-
+       struct flow_grp_info grp_info = {
+               .external = !!external,
+               .transfer = !!attributes->transfer,
+               .fdb_def_rule = 1,
+               .std_tbl_fix = 0
+       };
        if (action_flags & (MLX5_FLOW_FATE_ACTIONS |
                            MLX5_FLOW_FATE_ESWITCH_ACTIONS))
                return rte_flow_error_set(error, EINVAL,
@@ -3726,11 +3733,13 @@ flow_dv_validate_action_jump(const struct rte_flow_action *action,
                                          NULL, "action configuration not set");
        target_group =
                ((const struct rte_flow_action_jump *)action->conf)->group;
-       ret = mlx5_flow_group_to_table(attributes, external, target_group,
-                                      true, &table, error);
+       ret = mlx5_flow_group_to_table(dev, tunnel, target_group, &table,
+                                      grp_info, error);
        if (ret)
                return ret;
-       if (attributes->group == target_group)
+       if (attributes->group == target_group &&
+           !(action_flags & (MLX5_FLOW_ACTION_TUNNEL_SET |
+                             MLX5_FLOW_ACTION_TUNNEL_MATCH)))
                return rte_flow_error_set(error, EINVAL,
                                          RTE_FLOW_ERROR_TYPE_ACTION, NULL,
                                          "target group must be other than"
@@ -4982,8 +4991,9 @@ flow_dv_counter_release(struct rte_eth_dev *dev, uint32_t counter)
  */
 static int
 flow_dv_validate_attributes(struct rte_eth_dev *dev,
+                           const struct mlx5_flow_tunnel *tunnel,
                            const struct rte_flow_attr *attributes,
-                           bool external __rte_unused,
+                           struct flow_grp_info grp_info,
                            struct rte_flow_error *error)
 {
        struct mlx5_priv *priv = dev->data->dev_private;
@@ -4999,9 +5009,8 @@ flow_dv_validate_attributes(struct rte_eth_dev *dev,
 #else
        uint32_t table = 0;
 
-       ret = mlx5_flow_group_to_table(attributes, external,
-                                      attributes->group, !!priv->fdb_def_rule,
-                                      &table, error);
+       ret = mlx5_flow_group_to_table(dev, tunnel, attributes->group, &table,
+                                      grp_info, error);
        if (ret)
                return ret;
        if (!table)
@@ -5123,10 +5132,28 @@ flow_dv_validate(struct rte_eth_dev *dev, const struct rte_flow_attr *attr,
        const struct rte_flow_item_vlan *vlan_m = NULL;
        int16_t rw_act_num = 0;
        uint64_t is_root;
+       const struct mlx5_flow_tunnel *tunnel;
+       struct flow_grp_info grp_info = {
+               .external = !!external,
+               .transfer = !!attr->transfer,
+               .fdb_def_rule = !!priv->fdb_def_rule,
+       };
 
        if (items == NULL)
                return -1;
-       ret = flow_dv_validate_attributes(dev, attr, external, error);
+       if (is_flow_tunnel_match_rule(dev, attr, items, actions)) {
+               tunnel = flow_items_to_tunnel(items);
+               action_flags |= MLX5_FLOW_ACTION_TUNNEL_MATCH |
+                               MLX5_FLOW_ACTION_DECAP;
+       } else if (is_flow_tunnel_steer_rule(dev, attr, items, actions)) {
+               tunnel = flow_actions_to_tunnel(actions);
+               action_flags |= MLX5_FLOW_ACTION_TUNNEL_SET;
+       } else {
+               tunnel = NULL;
+       }
+       grp_info.std_tbl_fix = tunnel_use_standard_attr_group_translate
+                               (dev, tunnel, attr, items, actions);
+       ret = flow_dv_validate_attributes(dev, tunnel, attr, grp_info, error);
        if (ret < 0)
                return ret;
        is_root = (uint64_t)ret;
@@ -5139,6 +5166,15 @@ flow_dv_validate(struct rte_eth_dev *dev, const struct rte_flow_attr *attr,
                                                  RTE_FLOW_ERROR_TYPE_ITEM,
                                                  NULL, "item not supported");
                switch (type) {
+               case MLX5_RTE_FLOW_ITEM_TYPE_TUNNEL:
+                       if (items[0].type != (typeof(items[0].type))
+                                               MLX5_RTE_FLOW_ITEM_TYPE_TUNNEL)
+                               return rte_flow_error_set
+                                               (error, EINVAL,
+                                               RTE_FLOW_ERROR_TYPE_ITEM,
+                                               NULL, "MLX5 private items "
+                                               "must be the first");
+                       break;
                case RTE_FLOW_ITEM_TYPE_VOID:
                        break;
                case RTE_FLOW_ITEM_TYPE_PORT_ID:
@@ -5703,7 +5739,7 @@ flow_dv_validate(struct rte_eth_dev *dev, const struct rte_flow_attr *attr,
                        rw_act_num += MLX5_ACT_NUM_MDF_TTL;
                        break;
                case RTE_FLOW_ACTION_TYPE_JUMP:
-                       ret = flow_dv_validate_action_jump(actions,
+                       ret = flow_dv_validate_action_jump(dev, tunnel, actions,
                                                           action_flags,
                                                           attr, external,
                                                           error);
@@ -5803,6 +5839,17 @@ flow_dv_validate(struct rte_eth_dev *dev, const struct rte_flow_attr *attr,
                        action_flags |= MLX5_FLOW_ACTION_SET_IPV6_DSCP;
                        rw_act_num += MLX5_ACT_NUM_SET_DSCP;
                        break;
+               case MLX5_RTE_FLOW_ACTION_TYPE_TUNNEL_SET:
+                       if (actions[0].type != (typeof(actions[0].type))
+                               MLX5_RTE_FLOW_ACTION_TYPE_TUNNEL_SET)
+                               return rte_flow_error_set
+                                               (error, EINVAL,
+                                               RTE_FLOW_ERROR_TYPE_ACTION,
+                                               NULL, "MLX5 private action "
+                                               "must be the first");
+
+                       action_flags |= MLX5_FLOW_ACTION_TUNNEL_SET;
+                       break;
                default:
                        return rte_flow_error_set(error, ENOTSUP,
                                                  RTE_FLOW_ERROR_TYPE_ACTION,
@@ -5810,6 +5857,54 @@ flow_dv_validate(struct rte_eth_dev *dev, const struct rte_flow_attr *attr,
                                                  "action not supported");
                }
        }
+       /*
+        * Validate actions in flow rules
+        * - Explicit decap action is prohibited by the tunnel offload API.
+        * - Drop action in tunnel steer rule is prohibited by the API.
+        * - Application cannot use MARK action because its value can mask
+        *   tunnel default miss notification.
+        * - JUMP in tunnel match rule has no support in current PMD
+        *   implementation.
+        * - TAG & META are reserved for future uses.
+        */
+       if (action_flags & MLX5_FLOW_ACTION_TUNNEL_SET) {
+               uint64_t bad_actions_mask = MLX5_FLOW_ACTION_DECAP    |
+                                           MLX5_FLOW_ACTION_MARK     |
+                                           MLX5_FLOW_ACTION_SET_TAG  |
+                                           MLX5_FLOW_ACTION_SET_META |
+                                           MLX5_FLOW_ACTION_DROP;
+
+               if (action_flags & bad_actions_mask)
+                       return rte_flow_error_set
+                                       (error, EINVAL,
+                                       RTE_FLOW_ERROR_TYPE_ACTION, NULL,
+                                       "Invalid RTE action in tunnel "
+                                       "set decap rule");
+               if (!(action_flags & MLX5_FLOW_ACTION_JUMP))
+                       return rte_flow_error_set
+                                       (error, EINVAL,
+                                       RTE_FLOW_ERROR_TYPE_ACTION, NULL,
+                                       "tunnel set decap rule must terminate "
+                                       "with JUMP");
+               if (!attr->ingress)
+                       return rte_flow_error_set
+                                       (error, EINVAL,
+                                       RTE_FLOW_ERROR_TYPE_ACTION, NULL,
+                                       "tunnel flows for ingress traffic only");
+       }
+       if (action_flags & MLX5_FLOW_ACTION_TUNNEL_MATCH) {
+               uint64_t bad_actions_mask = MLX5_FLOW_ACTION_JUMP    |
+                                           MLX5_FLOW_ACTION_MARK    |
+                                           MLX5_FLOW_ACTION_SET_TAG |
+                                           MLX5_FLOW_ACTION_SET_META;
+
+               if (action_flags & bad_actions_mask)
+                       return rte_flow_error_set
+                                       (error, EINVAL,
+                                       RTE_FLOW_ERROR_TYPE_ACTION, NULL,
+                                       "Invalid RTE action in tunnel "
+                                       "set match rule");
+       }
        /*
         * Validate the drop action mutual exclusion with other actions.
         * Drop action is mutually-exclusive with any other action, except for
@@ -7616,6 +7711,9 @@ static struct mlx5_flow_tbl_resource *
 flow_dv_tbl_resource_get(struct rte_eth_dev *dev,
                         uint32_t table_id, uint8_t egress,
                         uint8_t transfer,
+                        bool external,
+                        const struct mlx5_flow_tunnel *tunnel,
+                        uint32_t group_id,
                         struct rte_flow_error *error)
 {
        struct mlx5_priv *priv = dev->data->dev_private;
@@ -7652,6 +7750,9 @@ flow_dv_tbl_resource_get(struct rte_eth_dev *dev,
                return NULL;
        }
        tbl_data->idx = idx;
+       tbl_data->tunnel = tunnel;
+       tbl_data->group_id = group_id;
+       tbl_data->external = external;
        tbl = &tbl_data->tbl;
        pos = &tbl_data->entry;
        if (transfer)
@@ -7715,6 +7816,41 @@ flow_dv_tbl_resource_release(struct rte_eth_dev *dev,
 
                mlx5_flow_os_destroy_flow_tbl(tbl->obj);
                tbl->obj = NULL;
+               if (is_tunnel_offload_active(dev) && tbl_data->external) {
+                       struct mlx5_hlist_entry *he;
+                       struct mlx5_hlist *tunnel_grp_hash;
+                       struct mlx5_flow_tunnel_hub *thub =
+                                                       mlx5_tunnel_hub(dev);
+                       union tunnel_tbl_key tunnel_key = {
+                               .tunnel_id = tbl_data->tunnel ?
+                                               tbl_data->tunnel->tunnel_id : 0,
+                               .group = tbl_data->group_id
+                       };
+                       union mlx5_flow_tbl_key table_key = {
+                               .v64 = pos->key
+                       };
+                       uint32_t table_id = table_key.table_id;
+
+                       tunnel_grp_hash = tbl_data->tunnel ?
+                                               tbl_data->tunnel->groups :
+                                               thub->groups;
+                       he = mlx5_hlist_lookup(tunnel_grp_hash, tunnel_key.val);
+                       if (he) {
+                               struct tunnel_tbl_entry *tte;
+                               tte = container_of(he, typeof(*tte), hash);
+                               MLX5_ASSERT(tte->flow_table == table_id);
+                               mlx5_hlist_remove(tunnel_grp_hash, he);
+                               mlx5_free(tte);
+                       }
+                       mlx5_flow_id_release(mlx5_tunnel_hub(dev)->table_ids,
+                                            tunnel_flow_tbl_to_id(table_id));
+                       DRV_LOG(DEBUG,
+                               "port %u release table_id %#x tunnel %u group %u",
+                               dev->data->port_id, table_id,
+                               tbl_data->tunnel ?
+                               tbl_data->tunnel->tunnel_id : 0,
+                               tbl_data->group_id);
+               }
                /* remove the entry from the hash list and free memory. */
                mlx5_hlist_remove(sh->flow_tbls, pos);
                mlx5_ipool_free(priv->sh->ipool[MLX5_IPOOL_JUMP],
@@ -7760,7 +7896,7 @@ flow_dv_matcher_register(struct rte_eth_dev *dev,
        int ret;
 
        tbl = flow_dv_tbl_resource_get(dev, key->table_id, key->direction,
-                                      key->domain, error);
+                                      key->domain, false, NULL, 0, error);
        if (!tbl)
                return -rte_errno;      /* No need to refill the error info */
        tbl_data = container_of(tbl, struct mlx5_flow_tbl_data_entry, tbl);
@@ -8215,11 +8351,23 @@ __flow_dv_translate(struct rte_eth_dev *dev,
        struct rte_vlan_hdr vlan = { 0 };
        uint32_t table;
        int ret = 0;
-
+       const struct mlx5_flow_tunnel *tunnel;
+       struct flow_grp_info grp_info = {
+               .external = !!dev_flow->external,
+               .transfer = !!attr->transfer,
+               .fdb_def_rule = !!priv->fdb_def_rule,
+       };
+       tunnel = is_flow_tunnel_match_rule(dev, attr, items, actions) ?
+                flow_items_to_tunnel(items) :
+                is_flow_tunnel_steer_rule(dev, attr, items, actions) ?
+                flow_actions_to_tunnel(actions) :
+                dev_flow->tunnel ? dev_flow->tunnel : NULL;
        mhdr_res->ft_type = attr->egress ? MLX5DV_FLOW_TABLE_TYPE_NIC_TX :
                                           MLX5DV_FLOW_TABLE_TYPE_NIC_RX;
-       ret = mlx5_flow_group_to_table(attr, dev_flow->external, attr->group,
-                                      !!priv->fdb_def_rule, &table, error);
+       grp_info.std_tbl_fix = tunnel_use_standard_attr_group_translate
+                               (dev, tunnel, attr, items, actions);
+       ret = mlx5_flow_group_to_table(dev, tunnel, attr->group, &table,
+                                      grp_info, error);
        if (ret)
                return ret;
        dev_flow->dv.group = table;
@@ -8229,6 +8377,45 @@ __flow_dv_translate(struct rte_eth_dev *dev,
                priority = dev_conf->flow_prio - 1;
        /* number of actions must be set to 0 in case of dirty stack. */
        mhdr_res->actions_num = 0;
+       if (is_flow_tunnel_match_rule(dev, attr, items, actions)) {
+               /*
+                * do not add decap action if match rule drops packet
+                * HW rejects rules with decap & drop
+                */
+               bool add_decap = true;
+               const struct rte_flow_action *ptr = actions;
+               struct mlx5_flow_tbl_resource *tbl;
+
+               for (; ptr->type != RTE_FLOW_ACTION_TYPE_END; ptr++) {
+                       if (ptr->type == RTE_FLOW_ACTION_TYPE_DROP) {
+                               add_decap = false;
+                               break;
+                       }
+               }
+               if (add_decap) {
+                       if (flow_dv_create_action_l2_decap(dev, dev_flow,
+                                                          attr->transfer,
+                                                          error))
+                               return -rte_errno;
+                       dev_flow->dv.actions[actions_n++] =
+                                       dev_flow->dv.encap_decap->action;
+                       action_flags |= MLX5_FLOW_ACTION_DECAP;
+               }
+               /*
+                * bind table_id with <group, table> for tunnel match rule.
+                * Tunnel set rule establishes that bind in JUMP action handler.
+                * Required for scenario when application creates tunnel match
+                * rule before tunnel set rule.
+                */
+               tbl = flow_dv_tbl_resource_get(dev, table, attr->egress,
+                                              attr->transfer,
+                                              !!dev_flow->external, tunnel,
+                                              attr->group, error);
+               if (!tbl)
+                       return rte_flow_error_set
+                              (error, EINVAL, RTE_FLOW_ERROR_TYPE_ACTION,
+                              actions, "cannot register tunnel group");
+       }
        for (; !actions_end ; actions++) {
                const struct rte_flow_action_queue *queue;
                const struct rte_flow_action_rss *rss;
@@ -8249,6 +8436,9 @@ __flow_dv_translate(struct rte_eth_dev *dev,
                                                  actions,
                                                  "action not supported");
                switch (action_type) {
+               case MLX5_RTE_FLOW_ACTION_TYPE_TUNNEL_SET:
+                       action_flags |= MLX5_FLOW_ACTION_TUNNEL_SET;
+                       break;
                case RTE_FLOW_ACTION_TYPE_VOID:
                        break;
                case RTE_FLOW_ACTION_TYPE_PORT_ID:
@@ -8480,16 +8670,19 @@ __flow_dv_translate(struct rte_eth_dev *dev,
                        action_flags |= MLX5_FLOW_ACTION_DECAP;
                        break;
                case RTE_FLOW_ACTION_TYPE_JUMP:
+                       grp_info.std_tbl_fix = 0;
                        jump_data = action->conf;
-                       ret = mlx5_flow_group_to_table(attr, dev_flow->external,
+                       ret = mlx5_flow_group_to_table(dev, tunnel,
                                                       jump_data->group,
-                                                      !!priv->fdb_def_rule,
-                                                      &table, error);
+                                                      &table,
+                                                      grp_info, error);
                        if (ret)
                                return ret;
-                       tbl = flow_dv_tbl_resource_get(dev, table,
-                                                      attr->egress,
-                                                      attr->transfer, error);
+                       tbl = flow_dv_tbl_resource_get(dev, table, attr->egress,
+                                                      attr->transfer,
+                                                      !!dev_flow->external,
+                                                      tunnel, jump_data->group,
+                                                      error);
                        if (!tbl)
                                return rte_flow_error_set
                                                (error, errno,
@@ -9681,7 +9874,8 @@ flow_dv_prepare_mtr_tables(struct rte_eth_dev *dev,
                dtb = &mtb->ingress;
        /* Create the meter table with METER level. */
        dtb->tbl = flow_dv_tbl_resource_get(dev, MLX5_FLOW_TABLE_LEVEL_METER,
-                                           egress, transfer, &error);
+                                           egress, transfer, false, NULL, 0,
+                                           &error);
        if (!dtb->tbl) {
                DRV_LOG(ERR, "Failed to create meter policer table.");
                return -1;
@@ -9689,7 +9883,8 @@ flow_dv_prepare_mtr_tables(struct rte_eth_dev *dev,
        /* Create the meter suffix table with SUFFIX level. */
        dtb->sfx_tbl = flow_dv_tbl_resource_get(dev,
                                            MLX5_FLOW_TABLE_LEVEL_SUFFIX,
-                                           egress, transfer, &error);
+                                           egress, transfer, false, NULL, 0,
+                                           &error);
        if (!dtb->sfx_tbl) {
                DRV_LOG(ERR, "Failed to create meter suffix table.");
                return -1;
-- 
2.28.0

Reply via email to