[dpdk-dev] [PATCH 0/8] ethdev: introduce GENEVE header TLV option item

2020-12-27 Thread Shiri Kuzin
The Geneve tunneling protocol is designed to allow the
user to specify some data context on the packet.
The GENEVE TLV (Type-Length-Value) option
is the means intended to carry the user data.

In order to support GENEVE TLV Option the new rte_flow
item "rte_flow_item_geneve_opt" is introduced.
The new item contains the values and masks for the
following fields:
-option class
-option type
-length
-data

The usage example:
"flow create 0 ingress pattern eth / ipv4 / udp / geneve vni is 100 /
geneve-opt class is 5 length is 1 type is 0 data is 0x66998800 /
end actions count / drop / end"


The new item will also be added to testpmd to support
the raw encap/decap actions.

Shiri Kuzin (7):
  lib/librte_ethdev: introduce GENEVE header TLV option item
  common/mlx5: check GENEVE TLV support in HCA attributes
  common/mlx5: create GENEVE TLV option object with DevX
  net/mlx5: create GENEVE TLV option management
  net/mlx5: add GENEVE TLV option flow validation
  net/mlx5: add GENEVE TLV option flow translation
  doc: update GENEVE TLV option support

Viacheslav Ovsiienko (1):
  app/testpmd: add GENEVE option item support

 app/test-pmd/cmdline_flow.c | 102 ++-
 doc/guides/nics/mlx5.rst|  18 ++-
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |   8 ++
 drivers/common/mlx5/mlx5_devx_cmds.c|  61 +
 drivers/common/mlx5/mlx5_devx_cmds.h|   9 ++
 drivers/common/mlx5/mlx5_prm.h  |  28 -
 drivers/common/mlx5/version.map |   1 +
 drivers/net/mlx5/mlx5.c |   2 +
 drivers/net/mlx5/mlx5.h |  13 ++
 drivers/net/mlx5/mlx5_flow.c| 120 ++
 drivers/net/mlx5/mlx5_flow.h|  11 ++
 drivers/net/mlx5/mlx5_flow_dv.c | 188 +++-
 lib/librte_ethdev/rte_flow.c|   1 +
 lib/librte_ethdev/rte_flow.h|  33 +
 14 files changed, 584 insertions(+), 11 deletions(-)

-- 
1.8.3.1



[dpdk-dev] [PATCH 1/8] lib/librte_ethdev: introduce GENEVE header TLV option item

2020-12-27 Thread Shiri Kuzin
The Geneve tunneling protocol is designed to allow the
user to specify some data context on the packet.
The GENEVE TLV (Type-Length-Value) option
is the means intended to carry the user data.

In order to support GENEVE TLV Option the new rte_flow
item "rte_flow_item_geneve_opt" is added.
The new item contains the values and masks for the
following fields:
-option class
-option type
-length
-data

The new item will be added to testpmd to support match and
raw encap/decap actions.
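
For illustration only (not part of this patch), a minimal sketch of how
an application could populate the new item; the concrete values are
hypothetical and the data pointer references a caller-owned array:

    /* Option data, one 32-bit word in network byte order. */
    uint32_t opt_data[1] = { RTE_BE32(0x66998800) };
    struct rte_flow_item_geneve_opt geneve_opt_spec = {
        .option_class = RTE_BE16(5),
        .option_type = 0,
        .option_len = 1,            /* length in 32-bit words */
        .data = opt_data,
    };
    struct rte_flow_item pattern_item = {
        .type = RTE_FLOW_ITEM_TYPE_GENEVE_OPT,
        .spec = &geneve_opt_spec,
        /* The default mask matches on option_type only; a fuller mask
         * is needed to also match class/length/data. */
        .mask = &rte_flow_item_geneve_opt_mask,
    };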

Signed-off-by: Shiri Kuzin 
---
 lib/librte_ethdev/rte_flow.c |  1 +
 lib/librte_ethdev/rte_flow.h | 33 +
 2 files changed, 34 insertions(+)

diff --git a/lib/librte_ethdev/rte_flow.c b/lib/librte_ethdev/rte_flow.c
index a06f64c..2af7d96 100644
--- a/lib/librte_ethdev/rte_flow.c
+++ b/lib/librte_ethdev/rte_flow.c
@@ -97,6 +97,7 @@ struct rte_flow_desc_data {
MK_FLOW_ITEM(L2TPV3OIP, sizeof(struct rte_flow_item_l2tpv3oip)),
MK_FLOW_ITEM(PFCP, sizeof(struct rte_flow_item_pfcp)),
MK_FLOW_ITEM(ECPRI, sizeof(struct rte_flow_item_ecpri)),
+   MK_FLOW_ITEM(GENEVE_OPT, sizeof(struct rte_flow_item_geneve_opt)),
 };
 
 /** Generate flow_action[] entry. */
diff --git a/lib/librte_ethdev/rte_flow.h b/lib/librte_ethdev/rte_flow.h
index 0977a78..e17a630 100644
--- a/lib/librte_ethdev/rte_flow.h
+++ b/lib/librte_ethdev/rte_flow.h
@@ -543,6 +543,14 @@ enum rte_flow_item_type {
 * See struct rte_flow_item_ipv6_frag_ext.
 */
RTE_FLOW_ITEM_TYPE_IPV6_FRAG_EXT,
+
+   /**
+* Matches Geneve Variable Length Option
+*
+* See struct rte_flow_item_geneve_opt
+*/
+   RTE_FLOW_ITEM_TYPE_GENEVE_OPT,
+
 };
 
 /**
@@ -1626,7 +1634,32 @@ struct rte_flow_item_ecpri {
},
 };
 #endif
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+/**
+ * RTE_FLOW_ITEM_TYPE_GENEVE_OPT
+ *
+ * Matches a GENEVE Variable Length Option
+ */
+RTE_STD_C11
+struct rte_flow_item_geneve_opt {
+   rte_be16_t option_class;
+   uint8_t option_type;
+   uint8_t option_len;
+   uint32_t *data;
+};
+#ifdef PEDANTIC
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
 
+/** Default mask for RTE_FLOW_ITEM_TYPE_GENEVE_OPT. */
+#ifndef __cplusplus
+static const struct rte_flow_item_geneve_opt
+rte_flow_item_geneve_opt_mask = {
+   .option_type = 0xff,
+};
+#endif
 /**
  * Matching pattern item definition.
  *
-- 
1.8.3.1



[dpdk-dev] [PATCH 3/8] common/mlx5: check GENEVE TLV support in HCA attributes

2020-12-27 Thread Shiri Kuzin
This is a preparation step to support matching on the GENEVE TLV option.

In this patch we add the HCA attributes that will allow
supporting GENEVE TLV option matching.
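
For illustration only, a hedged sketch of how a driver could consult the
new attributes before accepting a GENEVE TLV option match ("attr" is a
queried struct mlx5_hca_attr, "opt_len_words" is a hypothetical variable):

    /* Gate the feature on the new HCA capability bits. */
    if (!attr->geneve_tlv_opt)
        return -ENOTSUP;        /* no GENEVE TLV option object support */
    if (opt_len_words > attr->max_geneve_tlv_option_data_len)
        return -EINVAL;         /* option data longer than HW supports */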

Signed-off-by: Shiri Kuzin 
---
 drivers/common/mlx5/mlx5_devx_cmds.c |  7 +++
 drivers/common/mlx5/mlx5_devx_cmds.h |  4 
 drivers/common/mlx5/mlx5_prm.h   | 28 +---
 3 files changed, 36 insertions(+), 3 deletions(-)

diff --git a/drivers/common/mlx5/mlx5_devx_cmds.c 
b/drivers/common/mlx5/mlx5_devx_cmds.c
index 9c1d188..a6d052d 100644
--- a/drivers/common/mlx5/mlx5_devx_cmds.c
+++ b/drivers/common/mlx5/mlx5_devx_cmds.c
@@ -693,6 +693,10 @@ struct mlx5_devx_obj *
attr->eth_virt = MLX5_GET(cmd_hca_cap, hcattr, eth_virt);
attr->flex_parser_protocols = MLX5_GET(cmd_hca_cap, hcattr,
   flex_parser_protocols);
+   attr->max_geneve_tlv_options = MLX5_GET(cmd_hca_cap, hcattr,
+   max_geneve_tlv_options);
+   attr->max_geneve_tlv_option_data_len = MLX5_GET(cmd_hca_cap, hcattr,
+   max_geneve_tlv_option_data_len);
attr->qos.sup = MLX5_GET(cmd_hca_cap, hcattr, qos);
attr->vdpa.valid = !!(MLX5_GET64(cmd_hca_cap, hcattr,
 general_obj_types) &
@@ -720,6 +724,9 @@ struct mlx5_devx_obj *
attr->flow_hit_aso = !!(MLX5_GET64(cmd_hca_cap, hcattr,
   general_obj_types) &
MLX5_GENERAL_OBJ_TYPES_CAP_FLOW_HIT_ASO);
+   attr->geneve_tlv_opt = !!(MLX5_GET64(cmd_hca_cap, hcattr,
+  general_obj_types) &
+   MLX5_GENERAL_OBJ_TYPES_CAP_GENEVE_TLV_OPT);
if (attr->qos.sup) {
MLX5_SET(query_hca_cap_in, in, op_mod,
 MLX5_GET_HCA_CAP_OP_MOD_QOS_CAP |
diff --git a/drivers/common/mlx5/mlx5_devx_cmds.h 
b/drivers/common/mlx5/mlx5_devx_cmds.h
index 726e9f5..58e619f 100644
--- a/drivers/common/mlx5/mlx5_devx_cmds.h
+++ b/drivers/common/mlx5/mlx5_devx_cmds.h
@@ -96,6 +96,8 @@ struct mlx5_hca_attr {
uint32_t lro_timer_supported_periods[MLX5_LRO_NUM_SUPP_PERIODS];
uint16_t lro_min_mss_size;
uint32_t flex_parser_protocols;
+   uint32_t max_geneve_tlv_options;
+   uint32_t max_geneve_tlv_option_data_len;
uint32_t hairpin:1;
uint32_t log_max_hairpin_queues:5;
uint32_t log_max_hairpin_wq_data_sz:5;
@@ -115,6 +117,7 @@ struct mlx5_hca_attr {
uint32_t regex:1;
uint32_t regexp_num_of_engines;
uint32_t log_max_ft_sampler_num:8;
+   uint32_t geneve_tlv_opt;
struct mlx5_hca_qos_attr qos;
struct mlx5_hca_vdpa_attr vdpa;
 };
@@ -469,6 +472,7 @@ struct mlx5_devx_obj *mlx5_devx_cmd_create_flex_parser(void 
*ctx,
 __rte_internal
 int mlx5_devx_cmd_register_read(void *ctx, uint16_t reg_id,
uint32_t arg, uint32_t *data, uint32_t dw_cnt);
+
 /**
  * Create virtio queue counters object DevX API.
  *
diff --git a/drivers/common/mlx5/mlx5_prm.h b/drivers/common/mlx5/mlx5_prm.h
index 58d1804..c4fa395 100644
--- a/drivers/common/mlx5/mlx5_prm.h
+++ b/drivers/common/mlx5/mlx5_prm.h
@@ -787,7 +787,7 @@ struct mlx5_ifc_fte_match_set_misc3_bits {
u8 icmp_code[0x8];
u8 icmpv6_type[0x8];
u8 icmpv6_code[0x8];
-   u8 reserved_at_120[0x20];
+   u8 geneve_tlv_option_0_data[0x20];
u8 gtpu_teid[0x20];
u8 gtpu_msg_type[0x08];
u8 gtpu_msg_flags[0x08];
@@ -1065,6 +1065,8 @@ enum {
(1ULL << MLX5_GENERAL_OBJ_TYPE_FLEX_PARSE_GRAPH)
 #define MLX5_GENERAL_OBJ_TYPES_CAP_FLOW_HIT_ASO \
(1ULL << MLX5_GENERAL_OBJ_TYPE_FLOW_HIT_ASO)
+#define MLX5_GENERAL_OBJ_TYPES_CAP_GENEVE_TLV_OPT \
+   (1ULL << MLX5_OBJ_TYPE_GENEVE_TLV_OPT)
 
 enum {
MLX5_HCA_CAP_OPMOD_GET_MAX   = 0,
@@ -1363,8 +1365,10 @@ struct mlx5_ifc_cmd_hca_cap_bits {
u8 reserved_at_500[0x20];
u8 num_of_uars_per_page[0x20];
u8 flex_parser_protocols[0x20];
-   u8 reserved_at_560[0x20];
-   u8 reserved_at_580[0x3c];
+   u8 max_geneve_tlv_options[0x8];
+   u8 reserved_at_568[0x3];
+   u8 max_geneve_tlv_option_data_len[0x5];
+   u8 reserved_at_570[0x4c];
u8 mini_cqe_resp_stride_index[0x1];
u8 cqe_128_always[0x1];
u8 cqe_compression_128[0x1];
@@ -2232,6 +2236,7 @@ struct mlx5_ifc_create_cq_in_bits {
 };
 
 enum {
+   MLX5_OBJ_TYPE_GENEVE_TLV_OPT = 0x000b,
MLX5_GENERAL_OBJ_TYPE_VIRTQ = 0x000d,
MLX5_GENERAL_OBJ_TYPE_VIRTIO_Q_COUNTERS = 0x001c,
MLX5_GENERAL_OBJ_TYPE_FLEX_PARSE_GRAPH = 0x0022,
@@ -2266,6 +2271,17 @@ struct mlx5_ifc_virtio_q_counters_bits {
u8 reserved_at_180[0x50];
 };
 
+struct mlx5_ifc_geneve_tlv_option_bits {
+   u8 modify_field_select[0x40];
+   u8 reserved_at_40[0x18];
+   u8 geneve_option_fte_i

[dpdk-dev] [PATCH 2/8] app/testpmd: add GENEVE option item support

2020-12-27 Thread Shiri Kuzin
From: Viacheslav Ovsiienko 

The patch adds GENEVE option rte_flow item support to the
command line interpreter. A flow command with GENEVE
option items looks like:

  flow create 0 ingress pattern eth / ipv4 / udp / geneve vni is 100 /
   geneve-opt class is 99 length is 1 type is 0 data is 0x669988 /
   end actions drop / end

The option length should be specified in 32-bit words; this
value also specifies the length of the data pattern/mask arrays
(multiply by sizeof(uint32_t) to express it in bytes). If a match
on the length itself is not needed, the mask should be set to zero; in
this case the length is used only to specify the pattern/mask array lengths.
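
As a minimal sketch of the sizing rule above (assuming a caller-provided
data array):

    /* option_len is expressed in 32-bit words, so the pattern/mask
     * arrays hold option_len * sizeof(uint32_t) bytes. */
    uint8_t option_len = 2;                             /* 2 words */
    size_t data_bytes = option_len * sizeof(uint32_t);  /* 8 bytes */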

Signed-off-by: Viacheslav Ovsiienko 
---
 app/test-pmd/cmdline_flow.c | 102 ++--
 doc/guides/testpmd_app_ug/testpmd_funcs.rst |   8 +++
 2 files changed, 104 insertions(+), 6 deletions(-)

diff --git a/app/test-pmd/cmdline_flow.c b/app/test-pmd/cmdline_flow.c
index 585cab9..8bb2cb1 100644
--- a/app/test-pmd/cmdline_flow.c
+++ b/app/test-pmd/cmdline_flow.c
@@ -283,6 +283,11 @@ enum index {
ITEM_ECPRI_MSG_IQ_DATA_PCID,
ITEM_ECPRI_MSG_RTC_CTRL_RTCID,
ITEM_ECPRI_MSG_DLY_MSR_MSRID,
+   ITEM_GENEVE_OPT,
+   ITEM_GENEVE_OPT_CLASS,
+   ITEM_GENEVE_OPT_TYPE,
+   ITEM_GENEVE_OPT_LENGTH,
+   ITEM_GENEVE_OPT_DATA,
 
/* Validate/create actions. */
ACTIONS,
@@ -413,6 +418,9 @@ enum index {
 /** Maximum size for pattern in struct rte_flow_item_raw. */
 #define ITEM_RAW_PATTERN_SIZE 40
 
+/** Maximum size for GENEVE option data pattern in bytes. */
+#define ITEM_GENEVE_OPT_DATA_SIZE 124
+
 /** Storage size for struct rte_flow_item_raw including pattern. */
 #define ITEM_RAW_SIZE \
(sizeof(struct rte_flow_item_raw) + ITEM_RAW_PATTERN_SIZE)
@@ -428,7 +436,7 @@ struct action_rss_data {
 };
 
 /** Maximum data size in struct rte_flow_action_raw_encap. */
-#define ACTION_RAW_ENCAP_MAX_DATA 128
+#define ACTION_RAW_ENCAP_MAX_DATA 512
 #define RAW_ENCAP_CONFS_MAX_NUM 8
 
 /** Storage for struct rte_flow_action_raw_encap. */
@@ -658,6 +666,16 @@ struct token {
.mask = (const void *)&(const s){ .f = (1 << (b)) - 1 }, \
})
 
+/** Static initializer for ARGS() to target a field with limits. */
+#define ARGS_ENTRY_BOUNDED(s, f, i, a) \
+   (&(const struct arg){ \
+   .bounded = 1, \
+   .min = (i), \
+   .max = (a), \
+   .offset = offsetof(s, f), \
+   .size = sizeof(((s *)0)->f), \
+   })
+
 /** Static initializer for ARGS() to target an arbitrary bit-mask. */
 #define ARGS_ENTRY_MASK(s, f, m) \
(&(const struct arg){ \
@@ -903,6 +921,7 @@ struct parse_action_priv {
ITEM_AH,
ITEM_PFCP,
ITEM_ECPRI,
+   ITEM_GENEVE_OPT,
END_SET,
ZERO,
 };
@@ -1244,6 +1263,15 @@ struct parse_action_priv {
ZERO,
 };
 
+static const enum index item_geneve_opt[] = {
+   ITEM_GENEVE_OPT_CLASS,
+   ITEM_GENEVE_OPT_TYPE,
+   ITEM_GENEVE_OPT_LENGTH,
+   ITEM_GENEVE_OPT_DATA,
+   ITEM_NEXT,
+   ZERO,
+};
+
 static const enum index next_action[] = {
ACTION_END,
ACTION_VOID,
@@ -3230,6 +3258,47 @@ static int comp_set_sample_index(struct context *, const 
struct token *,
.args = ARGS(ARGS_ENTRY_HTON(struct rte_flow_item_ecpri,
hdr.type5.msr_id)),
},
+   [ITEM_GENEVE_OPT] = {
+   .name = "geneve-opt",
+   .help = "GENEVE header option",
+   .priv = PRIV_ITEM(GENEVE_OPT,
+ sizeof(struct rte_flow_item_geneve_opt) +
+ ITEM_GENEVE_OPT_DATA_SIZE),
+   .next = NEXT(item_geneve_opt),
+   .call = parse_vc,
+   },
+   [ITEM_GENEVE_OPT_CLASS] = {
+   .name = "class",
+   .help = "GENEVE option class",
+   .next = NEXT(item_geneve_opt, NEXT_ENTRY(UNSIGNED), item_param),
+   .args = ARGS(ARGS_ENTRY_HTON(struct rte_flow_item_geneve_opt,
+option_class)),
+   },
+   [ITEM_GENEVE_OPT_TYPE] = {
+   .name = "type",
+   .help = "GENEVE option type",
+   .next = NEXT(item_geneve_opt, NEXT_ENTRY(UNSIGNED), item_param),
+   .args = ARGS(ARGS_ENTRY(struct rte_flow_item_geneve_opt,
+   option_type)),
+   },
+   [ITEM_GENEVE_OPT_LENGTH] = {
+   .name = "length",
+   .help = "GENEVE option data length (in 32b words)",
+   .next = NEXT(item_geneve_opt, NEXT_ENTRY(UNSIGNED), item_param),
+   .args = ARGS(ARGS_ENTRY_BOUNDED(
+   struct rte_flow_item_geneve_opt, option_len,
+   0, 31)),
+   },
+   [ITEM_GENEVE_OPT_DATA] = {
+   .name 

[dpdk-dev] [PATCH 5/8] net/mlx5: create GENEVE TLV option management

2020-12-27 Thread Shiri Kuzin
Currently, firmware supports only one TLV object per device
for matching on the GENEVE header option.

This patch adds simple TLV object management to the mlx5 PMD.
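
In outline, the management works as in this hedged sketch (simplified,
not the literal PMD code):

    /* Single shared TLV option object per device, guarded by a spinlock. */
    rte_spinlock_lock(&sh->geneve_tlv_opt_sl);
    if (sh->geneve_tlv_option_resource) {
        /* Reuse only if class/type/length match the existing object,
         * then bump the reference counter. */
        sh->geneve_tlv_option_resource->refcnt++;
    } else {
        /* First user: create the DevX object and store the resource. */
    }
    rte_spinlock_unlock(&sh->geneve_tlv_opt_sl);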

Signed-off-by: Shiri Kuzin 
---
 drivers/net/mlx5/mlx5.c |   2 +
 drivers/net/mlx5/mlx5.h |  13 +
 drivers/net/mlx5/mlx5_flow.h|   4 ++
 drivers/net/mlx5/mlx5_flow_dv.c | 108 
 4 files changed, 127 insertions(+)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 52a8a25..4ed3730 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -1013,6 +1013,7 @@ struct mlx5_dev_ctx_shared *
rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
/* Add context to the global device list. */
LIST_INSERT_HEAD(&mlx5_dev_ctx_list, sh, next);
+   rte_spinlock_init(&sh->geneve_tlv_opt_sl);
 exit:
pthread_mutex_unlock(&mlx5_dev_ctx_list_mutex);
return sh;
@@ -1108,6 +1109,7 @@ struct mlx5_dev_ctx_shared *
mlx5_glue->devx_free_uar(sh->devx_rx_uar);
if (sh->ctx)
claim_zero(mlx5_glue->close_device(sh->ctx));
+   MLX5_ASSERT(sh->geneve_tlv_option_resource == NULL);
pthread_mutex_destroy(&sh->txpp.mutex);
mlx5_free(sh);
return;
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 121d726..923f7cb 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -535,6 +535,16 @@ struct mlx5_aso_age_mng {
struct mlx5_aso_sq aso_sq; /* ASO queue objects. */
 };
 
+/* Management structure for geneve tlv option */
+struct mlx5_geneve_tlv_option_resource {
+   struct mlx5_devx_obj *obj; /* Pointer to the geneve tlv opt object. */
+   rte_be16_t option_class; /* geneve tlv opt class.*/
+   uint8_t option_type; /* geneve tlv opt type.*/
+   uint8_t length; /* geneve tlv opt length. */
+   uint32_t refcnt; /* geneve tlv object reference counter */
+};
+
+
 #define MLX5_AGE_EVENT_NEW 1
 #define MLX5_AGE_TRIGGER   2
 #define MLX5_AGE_SET(age_info, BIT) \
@@ -747,6 +757,9 @@ struct mlx5_dev_ctx_shared {
void *devx_rx_uar; /* DevX UAR for Rx. */
struct mlx5_aso_age_mng *aso_age_mng;
/* Management data for aging mechanism using ASO Flow Hit. */
+   struct mlx5_geneve_tlv_option_resource *geneve_tlv_option_resource;
+   /* Management structure for geneve tlv option */
+   rte_spinlock_t geneve_tlv_opt_sl; /* Lock for geneve tlv resource */
struct mlx5_dev_shared_port port[]; /* per device port data array. */
 };
 
diff --git a/drivers/net/mlx5/mlx5_flow.h b/drivers/net/mlx5/mlx5_flow.h
index d85dd19..d8a6688 100644
--- a/drivers/net/mlx5/mlx5_flow.h
+++ b/drivers/net/mlx5/mlx5_flow.h
@@ -1048,6 +1048,7 @@ struct rte_flow {
uint32_t counter; /**< Holds flow counter. */
uint32_t tunnel_id;  /**< Tunnel id */
uint32_t age; /**< Holds ASO age bit index. */
+   uint32_t geneve_tlv_option; /**< Holds Geneve TLV option id. > */
 } __rte_packed;
 
 /*
@@ -1505,4 +1506,7 @@ void flow_dv_dest_array_remove_cb(struct mlx5_cache_list 
*list,
  struct mlx5_cache_entry *entry);
 struct mlx5_aso_age_action *flow_aso_age_get_by_idx(struct rte_eth_dev *dev,
uint32_t age_idx);
+int flow_dev_geneve_tlv_option_resource_register(struct rte_eth_dev *dev,
+const struct rte_flow_item *item,
+struct rte_flow_error *error);
 #endif /* RTE_PMD_MLX5_FLOW_H_ */
diff --git a/drivers/net/mlx5/mlx5_flow_dv.c b/drivers/net/mlx5/mlx5_flow_dv.c
index 4f638e2..3dcb87a 100644
--- a/drivers/net/mlx5/mlx5_flow_dv.c
+++ b/drivers/net/mlx5/mlx5_flow_dv.c
@@ -7224,6 +7224,90 @@ struct mlx5_hlist_entry *
 }
 
 /**
+ * Create Geneve TLV option resource.
+ *
+ * @param[in, out] dev
+ *   Pointer to rte_eth_dev structure.
+ * @param[in] item
+ *   Pointer to the Geneve TLV option flow item.
+ * @param[out] error
+ *   Pointer to error structure.
+ *
+ * @return
+ *   0 on success, otherwise -errno and errno is set.
+ */
+
+int
+flow_dev_geneve_tlv_option_resource_register(struct rte_eth_dev *dev,
+const struct rte_flow_item *item,
+struct rte_flow_error *error)
+{
+   struct mlx5_priv *priv = dev->data->dev_private;
+   struct mlx5_dev_ctx_shared *sh = priv->sh;
+   struct mlx5_geneve_tlv_option_resource *geneve_opt_resource =
+   sh->geneve_tlv_option_resource;
+   struct mlx5_devx_obj *obj;
+   const struct rte_flow_item_geneve_opt *geneve_opt_v = item->spec;
+   int ret = 0;
+
+   if (!geneve_opt_v)
+   return -1;
+   rte_spinlock_lock(&sh->geneve_tlv_opt_sl);
+   if (geneve_opt_resource 

[dpdk-dev] [PATCH 4/8] common/mlx5: create GENEVE TLV option object with DevX

2020-12-27 Thread Shiri Kuzin
The TLV object is a special firmware-maintained entity used
to support matching on the GENEVE header extension option.

The TLV object is created with the DevX API and accepts
the option class, type and length fields.

The class, type and length fields are set using MLX5_SET
and the DevX object is created using the mlx5 glue function.
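
For illustration only, a hedged sketch of a possible caller of the new
command (ctx and the class/type/length values are placeholders):

    struct mlx5_devx_obj *opt_obj;

    /* Class is passed in network byte order, length in 32-bit words. */
    opt_obj = mlx5_devx_cmd_create_geneve_tlv_option(ctx, RTE_BE16(5), 0, 1);
    if (opt_obj == NULL)
        DRV_LOG(ERR, "GENEVE TLV option object creation failed: %d",
                rte_errno);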

Signed-off-by: Shiri Kuzin 
---
 drivers/common/mlx5/mlx5_devx_cmds.c | 54 
 drivers/common/mlx5/mlx5_devx_cmds.h |  5 
 drivers/common/mlx5/version.map  |  1 +
 3 files changed, 60 insertions(+)

diff --git a/drivers/common/mlx5/mlx5_devx_cmds.c 
b/drivers/common/mlx5/mlx5_devx_cmds.c
index a6d052d..b5808a9 100644
--- a/drivers/common/mlx5/mlx5_devx_cmds.c
+++ b/drivers/common/mlx5/mlx5_devx_cmds.c
@@ -2051,3 +2051,57 @@ struct mlx5_devx_obj *
flow_hit_aso_obj->id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id);
return flow_hit_aso_obj;
 }
+
+/**
+ * Create general object of type GENEVE TLV option using DevX API.
+ *
+ * @param[in] ctx
+ *   Context returned from mlx5 open_device() glue function.
+ * @param [in] class
+ *   TLV option variable value of class
+ * @param [in] type
+ *   TLV option variable value of type
+ * @param [in] len
+ *   TLV option variable value of len
+ *
+ * @return
+ *   The DevX object created, NULL otherwise and rte_errno is set.
+ */
+struct mlx5_devx_obj *
+mlx5_devx_cmd_create_geneve_tlv_option(void *ctx,
+   uint16_t class, uint8_t type, uint8_t len)
+{
+   uint32_t in[MLX5_ST_SZ_DW(create_geneve_tlv_option_in)] = {0};
+   uint32_t out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {0};
+   struct mlx5_devx_obj *geneve_tlv_opt_obj = mlx5_malloc(MLX5_MEM_ZERO,
+  sizeof(*geneve_tlv_opt_obj),
+  0, SOCKET_ID_ANY);
+
+   if (!geneve_tlv_opt_obj) {
+   DRV_LOG(ERR, "Failed to allocate geneve tlv option object.");
+   rte_errno = ENOMEM;
+   return NULL;
+   }
+   void *hdr = MLX5_ADDR_OF(create_geneve_tlv_option_in, in, hdr);
+   void *opt = MLX5_ADDR_OF(create_geneve_tlv_option_in, in,
+   geneve_tlv_opt);
+   MLX5_SET(general_obj_in_cmd_hdr, hdr, opcode,
+   MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
+   MLX5_SET(general_obj_in_cmd_hdr, hdr, obj_type,
+   MLX5_OBJ_TYPE_GENEVE_TLV_OPT);
+   MLX5_SET(geneve_tlv_option, opt, option_class,
+   rte_be_to_cpu_16(class));
+   MLX5_SET(geneve_tlv_option, opt, option_type, type);
+   MLX5_SET(geneve_tlv_option, opt, option_data_length, len);
+   geneve_tlv_opt_obj->obj = mlx5_glue->devx_obj_create(ctx, in,
+   sizeof(in), out, sizeof(out));
+   if (!geneve_tlv_opt_obj->obj) {
+   rte_errno = errno;
+   DRV_LOG(ERR, "Failed to create Geneve tlv option "
+   "Obj using DevX.");
+   mlx5_free(geneve_tlv_opt_obj);
+   return NULL;
+   }
+   geneve_tlv_opt_obj->id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id);
+   return geneve_tlv_opt_obj;
+}
diff --git a/drivers/common/mlx5/mlx5_devx_cmds.h 
b/drivers/common/mlx5/mlx5_devx_cmds.h
index 58e619f..73c1dfd 100644
--- a/drivers/common/mlx5/mlx5_devx_cmds.h
+++ b/drivers/common/mlx5/mlx5_devx_cmds.h
@@ -473,6 +473,11 @@ struct mlx5_devx_obj 
*mlx5_devx_cmd_create_flex_parser(void *ctx,
 int mlx5_devx_cmd_register_read(void *ctx, uint16_t reg_id,
uint32_t arg, uint32_t *data, uint32_t dw_cnt);
 
+__rte_internal
+struct mlx5_devx_obj *
+mlx5_devx_cmd_create_geneve_tlv_option(void *ctx,
+   uint16_t class, uint8_t type, uint8_t len);
+
 /**
  * Create virtio queue counters object DevX API.
  *
diff --git a/drivers/common/mlx5/version.map b/drivers/common/mlx5/version.map
index 17dd11f..3c403d2 100644
--- a/drivers/common/mlx5/version.map
+++ b/drivers/common/mlx5/version.map
@@ -22,6 +22,7 @@ INTERNAL {
mlx5_devx_cmd_create_virtio_q_counters;
mlx5_devx_cmd_create_virtq;
 mlx5_devx_cmd_create_flow_hit_aso_obj;
+   mlx5_devx_cmd_create_geneve_tlv_option;
mlx5_devx_cmd_destroy;
mlx5_devx_cmd_flow_counter_alloc;
mlx5_devx_cmd_flow_counter_query;
-- 
1.8.3.1



[dpdk-dev] [PATCH 6/8] net/mlx5: add GENEVE TLV option flow validation

2020-12-27 Thread Shiri Kuzin
This patch adds a validation routine for the GENEVE
header TLV option.

The GENEVE TLV option match must include all fields
with full masks, because the NIC does not support masking
on the option class, type and length.

The option data length must be non-zero and the provided
data pattern must not be all zeros, due to hardware
limitations.
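
For illustration only, a hedged example of a mask that satisfies the
full-mask requirement (values are hypothetical):

    /* Mask accepted by the validation below. */
    static const struct rte_flow_item_geneve_opt valid_mask = {
        .option_class = RTE_BE16(0xffff),
        .option_type = 0xff,
        .option_len = 0x1f,
    };
    /* A partial mask such as .option_class = RTE_BE16(0x00ff) would be
     * rejected with ENOTSUP by the routine added in this patch. */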

Signed-off-by: Shiri Kuzin 
---
 drivers/net/mlx5/mlx5_flow.c| 120 
 drivers/net/mlx5/mlx5_flow.h|   7 +++
 drivers/net/mlx5/mlx5_flow_dv.c |  10 +++-
 3 files changed, 136 insertions(+), 1 deletion(-)

diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index 82e24d7..eaf777b 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -2603,6 +2603,126 @@ struct mlx5_flow_tunnel_info {
 }
 
 /**
+ * Validate Geneve TLV option item.
+ *
+ * @param[in] item
+ *   Item specification.
+ * @param[in] last_item
+ *   Previous validated item in the pattern items.
+ * @param[in] dev
+ *   Pointer to the rte_eth_dev structure.
+ * @param[out] error
+ *   Pointer to error structure.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_flow_validate_item_geneve_opt(const struct rte_flow_item *item,
+  uint64_t last_item,
+  struct rte_eth_dev *dev,
+  struct rte_flow_error *error)
+{
+   struct mlx5_priv *priv = dev->data->dev_private;
+   struct mlx5_dev_ctx_shared *sh = priv->sh;
+   struct mlx5_geneve_tlv_option_resource *geneve_opt_resource;
+   struct mlx5_hca_attr *hca_attr = &priv->config.hca_attr;
+   uint8_t data_max_supported =
+   hca_attr->max_geneve_tlv_option_data_len * 4;
+   struct mlx5_dev_config *config = &priv->config;
+   const struct rte_flow_item_geneve_opt *spec = item->spec;
+   const struct rte_flow_item_geneve_opt *mask = item->mask;
+   unsigned int i;
+   unsigned int data_len;
+   const struct rte_flow_item_geneve_opt full_mask = {
+   .option_class = RTE_BE16(0x),
+   .option_type = 0xff,
+   .option_len = 0x1f,
+   };
+
+   if (!mask)
+   mask = &rte_flow_item_geneve_opt_mask;
+   if (!spec)
+   return rte_flow_error_set
+   (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM, item,
+   "Geneve TLV opt class/type/length must be specified");
+   if ((uint32_t)(spec->option_len) > MLX5_GENEVE_OPTLEN_MASK)
+   return rte_flow_error_set
+   (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM, item,
+   "Geneve TLV opt length exceeeds the limit (31)");
+   /* Check if class type and length masks are full. */
+   if (full_mask.option_class != mask->option_class ||
+   full_mask.option_type != mask->option_type ||
+   full_mask.option_len != (mask->option_len & full_mask.option_len))
+   return rte_flow_error_set
+   (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM, item,
+   "Geneve TLV opt class/type/length masks must be full");
+   /* Check if length is supported */
+   if ((uint32_t)(spec->option_len) >
+   config->hca_attr.max_geneve_tlv_option_data_len)
+   return rte_flow_error_set
+   (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM, item,
+   "Geneve TLV opt length not supported");
+   if (config->hca_attr.max_geneve_tlv_options > 1)
+   DRV_LOG(DEBUG,
+   "max_geneve_tlv_options supports more than 1 option");
+   /* Check GENEVE item preceding. */
+   if (!(last_item & MLX5_FLOW_LAYER_GENEVE))
+   return rte_flow_error_set
+   (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM, item,
+   "Geneve opt item must be preceded with Geneve item");
+   /* Check if length is 0 or data is 0. */
+   if (spec->data == NULL || spec->option_len == 0)
+   return rte_flow_error_set
+   (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM, item,
+   "Geneve TLV opt with zero data/length not supported");
+   /* Check not all data & mask are 0. */
+   data_len = spec->option_len * 4;
+   if (mask->data == NULL) {
+   for (i = 0; i < data_len; i++)
+   if (spec->data[i])
+   break;
+   if (i == data_len)
+   return rte_flow_error_set(error, ENOTSUP,
+   RTE_FLOW_ERROR_TYPE_ITEM, item,
+   "Can't match on Geneve option data 0");
+   } else {
+   for (i = 0; i < data_len; i++)
+   if (spec->data[i] & mask->data[i])
+   break;
+   if

[dpdk-dev] [PATCH 7/8] net/mlx5: add GENEVE TLV option flow translation

2020-12-27 Thread Shiri Kuzin
The GENEVE TLV option matching flows must be created
using a translation function.

This function checks whether a DevX object has already
been created for the matching, and either creates the
object or updates its reference counter.
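
In outline, the translation behaves as in this hedged sketch (simplified):

    /* Register (or reuse) the shared TLV option DevX object first. */
    ret = flow_dev_geneve_tlv_option_resource_register(dev, item, error);
    if (ret)
        return ret;     /* object creation or refcount update failed */
    /* Then the first data word of spec & mask is copied into the
     * misc_parameters_3 geneve_tlv_option_0_data match field. */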

Signed-off-by: Shiri Kuzin 
---
 drivers/net/mlx5/mlx5_flow_dv.c | 70 +
 1 file changed, 70 insertions(+)

diff --git a/drivers/net/mlx5/mlx5_flow_dv.c b/drivers/net/mlx5/mlx5_flow_dv.c
index 6db2789..f78622d 100644
--- a/drivers/net/mlx5/mlx5_flow_dv.c
+++ b/drivers/net/mlx5/mlx5_flow_dv.c
@@ -7316,6 +7316,65 @@ struct mlx5_hlist_entry *
 }
 
 /**
+ * Add Geneve TLV option item to matcher.
+ *
+ * @param[in, out] dev
+ *   Pointer to rte_eth_dev structure.
+ * @param[in, out] matcher
+ *   Flow matcher.
+ * @param[in, out] key
+ *   Flow matcher value.
+ * @param[in] item
+ *   Flow pattern to translate.
+ * @param[out] error
+ *   Pointer to error structure.
+ */
+static int
+flow_dv_translate_item_geneve_opt(struct rte_eth_dev *dev, void *matcher,
+ void *key, const struct rte_flow_item *item,
+ struct rte_flow_error *error)
+{
+   const struct rte_flow_item_geneve_opt *geneve_opt_m = item->mask;
+   const struct rte_flow_item_geneve_opt *geneve_opt_v = item->spec;
+   void *misc3_m = MLX5_ADDR_OF(fte_match_param, matcher,
+   misc_parameters_3);
+   void *misc3_v = MLX5_ADDR_OF(fte_match_param, key, misc_parameters_3);
+   rte_be32_t opt_data_key = 0, opt_data_mask = 0;
+   int ret = 0;
+
+   if (!geneve_opt_v)
+   return -1;
+   if (!geneve_opt_m)
+   geneve_opt_m = &rte_flow_item_geneve_opt_mask;
+   ret = flow_dev_geneve_tlv_option_resource_register(dev, item,
+  error);
+   if (ret) {
+   DRV_LOG(ERR, "Failed to create geneve_tlv_obj");
+   return ret;
+   }
+   /* Set the data. */
+   if (geneve_opt_v->data) {
+   memcpy(&opt_data_key, geneve_opt_v->data,
+   RTE_MIN((uint32_t)(geneve_opt_v->option_len * 4),
+   sizeof(opt_data_key)));
+   MLX5_ASSERT((uint32_t)(geneve_opt_v->option_len * 4) <=
+   sizeof(opt_data_key));
+   memcpy(&opt_data_mask, geneve_opt_m->data,
+   RTE_MIN((uint32_t)(geneve_opt_v->option_len * 4),
+   sizeof(opt_data_mask)));
+   MLX5_ASSERT((uint32_t)(geneve_opt_v->option_len * 4) <=
+   sizeof(opt_data_mask));
+   MLX5_SET(fte_match_set_misc3, misc3_m,
+   geneve_tlv_option_0_data,
+   rte_be_to_cpu_32(opt_data_mask));
+   MLX5_SET(fte_match_set_misc3, misc3_v,
+   geneve_tlv_option_0_data,
+   rte_be_to_cpu_32(opt_data_key & opt_data_mask));
+   }
+   return ret;
+}
+
+/**
  * Add MPLS item to matcher and to the value.
  *
  * @param[in, out] matcher
@@ -10559,6 +10618,17 @@ struct mlx5_cache_entry *
matcher.priority = MLX5_TUNNEL_PRIO_GET(rss_desc);
last_item = MLX5_FLOW_LAYER_GENEVE;
break;
+   case RTE_FLOW_ITEM_TYPE_GENEVE_OPT:
+   ret = flow_dv_translate_item_geneve_opt(dev, match_mask,
+ match_value,
+ items, error);
+   if (ret)
+   return rte_flow_error_set(error, -ret,
+   RTE_FLOW_ERROR_TYPE_ITEM, NULL,
+   "cannot create GENEVE TLV option");
+   flow->geneve_tlv_option = 1;
+   last_item = MLX5_FLOW_LAYER_GENEVE_OPT;
+   break;
case RTE_FLOW_ITEM_TYPE_MPLS:
flow_dv_translate_item_mpls(match_mask, match_value,
items, last_item, tunnel);
-- 
1.8.3.1



[dpdk-dev] [PATCH 8/8] doc: update GENEVE TLV option support

2020-12-27 Thread Shiri Kuzin
GENEVE TLV option support was added to the mlx5 PMD.

The documentation is updated with the supported
features and limitations.

Signed-off-by: Shiri Kuzin 
---
 doc/guides/nics/mlx5.rst | 18 +-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst
index 3bda0f8..9700fe5 100644
--- a/doc/guides/nics/mlx5.rst
+++ b/doc/guides/nics/mlx5.rst
@@ -98,6 +98,7 @@ Features
 - Hardware LRO.
 - Hairpin.
 - Multiple-thread flow insertion.
+- Matching on Geneve TLV option header with raw encap/decap action.
 
 Limitations
 ---
@@ -175,7 +176,18 @@ Limitations
  - OAM
  - protocol type
  - options length
-   Currently, the only supported options length value is 0.
+
- Match on Geneve TLV option is supported on the following fields:
+ - Class
+ - Type
+ - Length
+ - Data
+
+  Only one Class/Type/Length Geneve TLV option is supported per shared device.
+  Class/Type/Length fields must be specified as well as masks.
+  Class/Type/Length specified masks must be full.
+  Matching Geneve TLV option without specifying data is not supported.
+  Matching Geneve TLV option with data & mask == 0 is not supported.
 
 - VF: flow rules created on VF devices can only match traffic targeted at the
   configured MAC addresses (see ``rte_eth_dev_mac_addr_add()``).
@@ -1022,6 +1034,10 @@ Below are some firmware configurations listed.
or
FLEX_PARSER_PROFILE_ENABLE=1
 
+- enable Geneve TLV option flow matching::
+
+   FLEX_PARSER_PROFILE_ENABLE=0
+
 - enable GTP flow matching::
 
FLEX_PARSER_PROFILE_ENABLE=3
-- 
1.8.3.1



Re: [dpdk-dev] [PATCH 1/8] lib/librte_ethdev: introduce GENEVE header TLV option item

2020-12-27 Thread Stephen Hemminger
On Sun, 27 Dec 2020 18:06:16 +0200
Shiri Kuzin  wrote:

> +#ifdef PEDANTIC
> +#pragma GCC diagnostic ignored "-Wpedantic"
> +#endif

Please do not introduce pragmas for pedantic in standard headers.
It just clutters the code unnecessarily. The rest of DPDK is not guaranteed
to be free of pedantic warnings, so starting now is not worth the mess.


[dpdk-dev] [PATCH v2] mlx5: split multi-threaded flows per OS

2020-12-27 Thread Tal Shnaiderman
The multi-threaded flows feature uses the pthread function pthread_key_create,
but on Windows the destructor option of that function is unimplemented.

To resolve this, Windows will implement a destruction mechanism to clean up
the mlx5_flow_workspace object for each terminated thread.

The Linux flow will keep the current behavior.
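
A hedged sketch of the per-thread usage pattern enabled by the generic TLS
helpers from the dependent EAL patch (allocation shown as a placeholder):

    /* Look up the calling thread's workspace; create it on first use. */
    struct mlx5_flow_workspace *wks = mlx5_flow_os_get_specific_workspace();

    if (!wks) {
        wks = calloc(1, sizeof(*wks));      /* placeholder allocation */
        if (wks && mlx5_flow_os_set_specific_workspace(wks))
            DRV_LOG(ERR, "Failed to store flow workspace in TLS.");
    }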

Signed-off-by: Tal Shnaiderman 
Acked-by: Matan Azrad 
---
Depends-on: patch-85737 ("eal: add generic thread-local-storage functions")
v2: fix style issues
---
 drivers/net/mlx5/linux/meson.build  |   1 +
 drivers/net/mlx5/linux/mlx5_flow_os.c   |  39 +++
 drivers/net/mlx5/mlx5.c |   8 ++
 drivers/net/mlx5/mlx5_flow.c|  29 +-
 drivers/net/mlx5/mlx5_flow.h|  10 ++
 drivers/net/mlx5/windows/mlx5_flow_os.c | 179 
 6 files changed, 242 insertions(+), 24 deletions(-)
 create mode 100644 drivers/net/mlx5/linux/mlx5_flow_os.c

diff --git a/drivers/net/mlx5/linux/meson.build 
b/drivers/net/mlx5/linux/meson.build
index 6c4402169e..8412edce78 100644
--- a/drivers/net/mlx5/linux/meson.build
+++ b/drivers/net/mlx5/linux/meson.build
@@ -9,5 +9,6 @@ sources += files(
'mlx5_verbs.c',
'mlx5_mp_os.c',
'mlx5_vlan_os.c',
+   'mlx5_flow_os.c',
 )
 
diff --git a/drivers/net/mlx5/linux/mlx5_flow_os.c 
b/drivers/net/mlx5/linux/mlx5_flow_os.c
new file mode 100644
index 00..8cf997a718
--- /dev/null
+++ b/drivers/net/mlx5/linux/mlx5_flow_os.c
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2020 Mellanox Technologies, Ltd
+ */
+
+#include "mlx5_flow_os.h"
+
+#include 
+
+/* Key of thread specific flow workspace data. */
+static rte_tls_key key_workspace;
+
+int
+mlx5_flow_os_init_workspace_once(void)
+{
+   if (rte_thread_tls_create_key(&key_workspace, flow_release_workspace)) {
+   DRV_LOG(ERR, "Can't create flow workspace data thread key.");
+   return -ENOMEM;
+   }
+   return 0;
+}
+
+void *
+mlx5_flow_os_get_specific_workspace(void)
+{
+   return rte_thread_tls_get_value(key_workspace);
+}
+
+int
+mlx5_flow_os_set_specific_workspace(struct mlx5_flow_workspace *data)
+{
+   return rte_thread_tls_set_value(key_workspace, data);
+}
+
+void
+mlx5_flow_os_release_workspace(void)
+{
+   rte_thread_tls_delete_key(key_workspace);
+}
+
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 2aa269b13e..0fdcd0fe8d 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -1004,6 +1004,11 @@ mlx5_alloc_shared_dev_ctx(const struct 
mlx5_dev_spawn_data *spawn,
err = rte_errno;
goto error;
}
+   if (LIST_EMPTY(&mlx5_dev_ctx_list)) {
+   err = mlx5_flow_os_init_workspace_once();
+   if (err)
+   goto error;
+   }
mlx5_flow_aging_init(sh);
mlx5_flow_counters_mng_init(sh);
mlx5_flow_ipool_create(sh, config);
@@ -1079,6 +1084,9 @@ mlx5_free_shared_dev_ctx(struct mlx5_dev_ctx_shared *sh)
mlx5_mr_release_cache(&sh->share_cache);
/* Remove context from the global device list. */
LIST_REMOVE(sh, next);
+   /* Release flow workspaces objects on the last device. */
+   if (LIST_EMPTY(&mlx5_dev_ctx_list))
+   mlx5_flow_os_release_workspace();
pthread_mutex_unlock(&mlx5_dev_ctx_list_mutex);
/*
 *  Ensure there is no async event handler installed.
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index f110c6b714..a2a294eac2 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -696,11 +696,6 @@ static struct mlx5_flow_tunnel_info tunnels_info[] = {
},
 };
 
-/* Key of thread specific flow workspace data. */
-static pthread_key_t key_workspace;
-
-/* Thread specific flow workspace data once initialization data. */
-static pthread_once_t key_workspace_init;
 
 
 /**
@@ -5698,7 +5693,7 @@ mlx5_flow_start_default(struct rte_eth_dev *dev)
 /**
  * Release key of thread specific flow workspace data.
  */
-static void
+void
 flow_release_workspace(void *data)
 {
struct mlx5_flow_workspace *wks = data;
@@ -5712,16 +5707,6 @@ flow_release_workspace(void *data)
}
 }
 
-/**
- * Initialize key of thread specific flow workspace data.
- */
-static void
-flow_alloc_workspace(void)
-{
-   if (pthread_key_create(&key_workspace, flow_release_workspace))
-   DRV_LOG(ERR, "Can't create flow workspace data thread key.");
-}
-
 /**
  * Get thread specific current flow workspace.
  *
@@ -5732,7 +5717,7 @@ mlx5_flow_get_thread_workspace(void)
 {
struct mlx5_flow_workspace *data;
 
-   data = pthread_getspecific(key_workspace);
+   data = mlx5_flow_os_get_specific_workspace();
MLX5_ASSERT(data && data->inuse);
if (!data || !data->inuse)
DRV_LOG(ERR, "flow workspace not initialized.");
@@ -5780,11 +5765,7 @@ mlx5_flow_push_thread_workspace(vo

[dpdk-dev] [PATCH v6] net/iavf: fix invalid RSS combinations rule can be created

2020-12-27 Thread Murphy Yang
Currently, when using the 'flow' command to create a rule that combines
several RSS types, the rule is created successfully even if the RSS type
combination is invalid or unsupported.

Here list some invalid RSS combinations:
 - ETH_RSS_IPV4 | ETH_RSS_NONFRAG_IPV4_TCP
 - ETH_RSS_IPV6 | ETH_RSS_NONFRAG_IPV6_TCP

Here list some currently unsupported RSS combinations:
 - ETH_RSS_GTPU | ETH_RSS_IPV4
 - ETH_RSS_GTPU | ETH_RSS_IPV4 | ETH_RSS_NONFRAG_IPV4_UDP
 - ETH_RSS_GTPU | ETH_RSS_IPV4 | ETH_RSS_NONFRAG_IPV4_TCP
 - ETH_RSS_GTPU | ETH_RSS_IPV6
 - ETH_RSS_GTPU | ETH_RSS_IPV6 | ETH_RSS_NONFRAG_IPV6_UDP
 - ETH_RSS_GTPU | ETH_RSS_IPV6 | ETH_RSS_NONFRAG_IPV6_TCP

For invalid RSS combinations, this patch adds them to the
'invalid_rss_comb' array for validity checking; if the combination
check fails, the rule will not be created.

For unsupported RSS combinations, this patch adds them to the
'unsupported_rss_comb' array for validity checking; if the combination
check fails, the rule will not be created.
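
As a worked example of the check added below (a sketch, not the full
driver context):

    /* rss_type requests GTP-U together with plain IPv4 hashing. */
    uint64_t rss_type = ETH_RSS_GTPU | ETH_RSS_IPV4;
    uint64_t comb = ETH_RSS_GTPU | ETH_RSS_IPV4;   /* unsupported entry */

    /* Both bits survive the AND, popcount is 2 (> 1), so the rule
     * is rejected as unsupported. */
    if (__builtin_popcountll(rss_type & comb) > 1)
        return true;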

Fixes: 91f27b2e39ab ("net/iavf: refactor RSS")

Signed-off-by: Murphy Yang 
---
v6:
- add unsupported RSS combinations array.
v5:
- remove 'ETH_RSS_GTPU' from input set mask.
v4:
- use 'ETH_RSS_XXX' replace 'IAVF_RSS_TYPE_INNER_XXX'
v3:
- update the comments.
v2:
- add invalid RSS combinations.
 drivers/net/iavf/iavf_hash.c | 18 ++
 1 file changed, 18 insertions(+)

diff --git a/drivers/net/iavf/iavf_hash.c b/drivers/net/iavf/iavf_hash.c
index c4c73e6644..0061eb6652 100644
--- a/drivers/net/iavf/iavf_hash.c
+++ b/drivers/net/iavf/iavf_hash.c
@@ -806,12 +806,23 @@ static void iavf_refine_proto_hdrs(struct 
virtchnl_proto_hdrs *proto_hdrs,
 
 static uint64_t invalid_rss_comb[] = {
ETH_RSS_IPV4 | ETH_RSS_NONFRAG_IPV4_UDP,
+   ETH_RSS_IPV4 | ETH_RSS_NONFRAG_IPV4_TCP,
ETH_RSS_IPV6 | ETH_RSS_NONFRAG_IPV6_UDP,
+   ETH_RSS_IPV6 | ETH_RSS_NONFRAG_IPV6_TCP,
RTE_ETH_RSS_L3_PRE32 | RTE_ETH_RSS_L3_PRE40 |
RTE_ETH_RSS_L3_PRE48 | RTE_ETH_RSS_L3_PRE56 |
RTE_ETH_RSS_L3_PRE96
 };
 
+static uint64_t unsupported_rss_comb[] = {
+   ETH_RSS_GTPU | ETH_RSS_IPV4,
+   ETH_RSS_GTPU | ETH_RSS_IPV4 | ETH_RSS_NONFRAG_IPV4_UDP,
+   ETH_RSS_GTPU | ETH_RSS_IPV4 | ETH_RSS_NONFRAG_IPV4_TCP,
+   ETH_RSS_GTPU | ETH_RSS_IPV6,
+   ETH_RSS_GTPU | ETH_RSS_IPV6 | ETH_RSS_NONFRAG_IPV6_UDP,
+   ETH_RSS_GTPU | ETH_RSS_IPV6 | ETH_RSS_NONFRAG_IPV6_TCP
+};
+
 struct rss_attr_type {
uint64_t attr;
uint64_t type;
@@ -875,6 +886,13 @@ iavf_any_invalid_rss_type(enum rte_eth_hash_function 
rss_func,
return true;
}
 
+   /* check unsupported rss combination */
+   for (i = 0; i < RTE_DIM(unsupported_rss_comb); i++) {
+   if (__builtin_popcountll(rss_type &
+   unsupported_rss_comb[i]) > 1)
+   return true;
+   }
+
/* check invalid RSS attribute */
for (i = 0; i < RTE_DIM(rss_attr_to_valid_type); i++) {
struct rss_attr_type *rat = &rss_attr_to_valid_type[i];
-- 
2.17.1



Re: [dpdk-dev] [PATCH v4 1/2] examples/vhost: add ioat ring space count and check

2020-12-27 Thread Hu, Jiayu
Hi Cheng,

> -Original Message-
> From: Jiang, Cheng1 
> Sent: Friday, December 25, 2020 4:07 PM
> To: maxime.coque...@redhat.com; Xia, Chenbo 
> Cc: dev@dpdk.org; Hu, Jiayu ; Yang, YvonneX
> ; Jiang, Cheng1 
> Subject: [PATCH v4 1/2] examples/vhost: add ioat ring space count and check
> 
> Add ioat ring space count and check, if ioat ring space is not enough
> for the next async vhost packet enqueue, then just return to prevent
> enqueue failure.
> 
> Signed-off-by: Cheng Jiang 
> ---
>  examples/vhost/ioat.c | 15 +++
>  1 file changed, 7 insertions(+), 8 deletions(-)
> 
> diff --git a/examples/vhost/ioat.c b/examples/vhost/ioat.c
> index 71d8a1f1f..b0b04aa45 100644
> --- a/examples/vhost/ioat.c
> +++ b/examples/vhost/ioat.c
> @@ -17,6 +17,7 @@ struct packet_tracker {
>   unsigned short next_read;
>   unsigned short next_write;
>   unsigned short last_remain;
> + unsigned short ioat_space;
>  };
> 
>  struct packet_tracker cb_tracker[MAX_VHOST_DEVICE];
> @@ -113,7 +114,7 @@ open_ioat(const char *value)
>   goto out;
>   }
>   rte_rawdev_start(dev_id);
> -
> + cb_tracker[dev_id].ioat_space = IOAT_RING_SIZE;
>   dma_info->nr++;
>   i++;
>   }
> @@ -140,13 +141,9 @@ ioat_transfer_data_cb(int vid, uint16_t queue_id,
>   src = descs[i_desc].src;
>   dst = descs[i_desc].dst;
>   i_seg = 0;
> + if (cb_tracker[dev_id].ioat_space < src->nr_segs)
> + break;
>   while (i_seg < src->nr_segs) {
> - /*
> -  * TODO: Assuming that the ring space of the
> -  * IOAT device is large enough, so there is no
> -  * error here, and the actual error handling
> -  * will be added later.
> -  */
>   rte_ioat_enqueue_copy(dev_id,
>   (uintptr_t)(src->iov[i_seg].iov_base)
>   + src->offset,
> @@ -158,7 +155,8 @@ ioat_transfer_data_cb(int vid, uint16_t queue_id,
>   i_seg++;
>   }
>   write &= mask;
> - cb_tracker[dev_id].size_track[write] = i_seg;
> + cb_tracker[dev_id].size_track[write] = src->nr_segs;
> + cb_tracker[dev_id].ioat_space -= src->nr_segs;
>   write++;
>   }
>   } else {
> @@ -186,6 +184,7 @@ ioat_check_completed_copies_cb(int vid, uint16_t
> queue_id,
>   int dev_id = dma_bind[vid].dmas[queue_id * 2
>   + VIRTIO_RXQ].dev_id;
>   n_seg = rte_ioat_completed_ops(dev_id, 255, dump, dump);
> + cb_tracker[dev_id].ioat_space += n_seg;

rte_ioat_completed_ops() may fail. In this case, its return value is -1, which
will cause n_seg to become 65534.
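
One possible guard (a sketch only, not part of this patch):

    /* Check the return value before updating the ring space counter. */
    int n = rte_ioat_completed_ops(dev_id, 255, dump, dump);

    if (n < 0)
        n = 0;          /* or log the error and bail out */
    cb_tracker[dev_id].ioat_space += n;
    n_seg = n;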

Thanks,
Jiayu

>   n_seg += cb_tracker[dev_id].last_remain;
>   if (!n_seg)
>   return 0;
> --
> 2.29.2



[dpdk-dev] [PATCH] common/sfc_efx/base: fix MPORT-related byte order handling

2020-12-27 Thread Ivan Malov
MPORT values derived by helper functions are little-endian.
At the same time, MCDIs which consume these values perform
one more host-order to little-endian conversion internally.

Fix the helper functions to return host-order MPORT values.

Fixes: 370ed675a952 ("common/sfc_efx/base: support setting PPORT in match spec")
Fixes: bb024542fffd ("common/sfc_efx/base: add API for adding action drop")
Fixes: 097058033f03 ("common/sfc_efx/base: add API to get mport of PF/VF")
Cc: sta...@dpdk.org

Reported-by: Andy Moreton 
Reviewed-by: Andy Moreton 
Reviewed-by: Andrew Rybchenko 
Signed-off-by: Ivan Malov 
---
 drivers/common/sfc_efx/base/efx_mae.c | 24 +---
 1 file changed, 21 insertions(+), 3 deletions(-)

diff --git a/drivers/common/sfc_efx/base/efx_mae.c 
b/drivers/common/sfc_efx/base/efx_mae.c
index a54d5f6e6..22f29d454 100644
--- a/drivers/common/sfc_efx/base/efx_mae.c
+++ b/drivers/common/sfc_efx/base/efx_mae.c
@@ -593,7 +593,13 @@ efx_mae_mport_by_phy_port(
MAE_MPORT_SELECTOR_PPORT_ID, phy_port);
 
memset(mportp, 0, sizeof (*mportp));
-   mportp->sel = dword.ed_u32[0];
+   /*
+* The constructed DWORD is little-endian,
+* but the resulting value is meant to be
+* passed to MCDIs, where it will undergo
+* host-order to little endian conversion.
+*/
+   mportp->sel = EFX_DWORD_FIELD(dword, EFX_DWORD_0);
 
return (0);
 
@@ -630,7 +636,13 @@ efx_mae_mport_by_pcie_function(
MAE_MPORT_SELECTOR_FUNC_VF_ID, vf);
 
memset(mportp, 0, sizeof (*mportp));
-   mportp->sel = dword.ed_u32[0];
+   /*
+* The constructed DWORD is little-endian,
+* but the resulting value is meant to be
+* passed to MCDIs, where it will undergo
+* host-order to little endian conversion.
+*/
+   mportp->sel = EFX_DWORD_FIELD(dword, EFX_DWORD_0);
 
return (0);
 
@@ -1319,7 +1331,13 @@ efx_mae_action_set_populate_drop(
EFX_POPULATE_DWORD_1(dword,
MAE_MPORT_SELECTOR_FLAT, MAE_MPORT_SELECTOR_NULL);
 
-   mport.sel = dword.ed_u32[0];
+   /*
+* The constructed DWORD is little-endian,
+* but the resulting value is meant to be
+* passed to MCDIs, where it will undergo
+* host-order to little endian conversion.
+*/
+   mport.sel = EFX_DWORD_FIELD(dword, EFX_DWORD_0);
 
arg = (const uint8_t *)&mport.sel;
 
-- 
2.20.1



Re: [dpdk-dev] [PATCH v4 2/2] examples/vhost: refactor vhost data path

2020-12-27 Thread Hu, Jiayu
Hi Cheng,

Some comments are inline.

Thanks,
Jiayu
> -Original Message-
> From: Jiang, Cheng1 
> Sent: Friday, December 25, 2020 4:07 PM
> To: maxime.coque...@redhat.com; Xia, Chenbo 
> Cc: dev@dpdk.org; Hu, Jiayu ; Yang, YvonneX
> ; Jiang, Cheng1 
> Subject: [PATCH v4 2/2] examples/vhost: refactor vhost data path
> 
> Change the vm2vm data path to batch enqueue for better performance.
> Support latest async vhost API, refactor vhost async data path,
> replase rte_atomicNN_xxx to atomic_XXX and clean some codes.

Typo: replase -> replace

> 
> Signed-off-by: Cheng Jiang 
> ---
>  examples/vhost/main.c | 202 +++---
>  examples/vhost/main.h |   7 +-
>  2 files changed, 154 insertions(+), 55 deletions(-)
> 
> diff --git a/examples/vhost/main.c b/examples/vhost/main.c
> index 8d8c3038b..3ea12a474 100644
> --- a/examples/vhost/main.c
> +++ b/examples/vhost/main.c
> @@ -179,9 +179,18 @@ struct mbuf_table {
>   struct rte_mbuf *m_table[MAX_PKT_BURST];
>  };
> 
> +struct vhost_bufftable {
> + uint32_t len;
> + uint64_t pre_tsc;
> + struct rte_mbuf *m_table[MAX_PKT_BURST];
> +};
> +
>  /* TX queue for each data core. */
>  struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
> 
> +/* TX queue for each vhost device. */

Every lcore maintains a TX buffer for every vhost device,
which is used to batch pkts to enqueue for higher performance.
I suggest updating the description of vhost_txbuff above,
as it is not very clear.
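
For clarity, the per-lcore per-device buffer lookup in the patch amounts
to this sketch:

    /* Index of the TX buffer owned by this lcore for this vhost device. */
    uint64_t idx = rte_lcore_id() * MAX_VHOST_DEVICE + vdev->vid;
    struct vhost_bufftable *txb = vhost_txbuff[idx];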

> +struct vhost_bufftable *vhost_txbuff[RTE_MAX_LCORE *
> MAX_VHOST_DEVICE];
> +
>  #define MBUF_TABLE_DRAIN_TSC ((rte_get_tsc_hz() + US_PER_S - 1) \
>/ US_PER_S * BURST_TX_DRAIN_US)
>  #define VLAN_HLEN   4
> @@ -804,39 +813,114 @@ unlink_vmdq(struct vhost_dev *vdev)
>   }
>  }
> 
> +static inline void
> +free_pkts(struct rte_mbuf **pkts, uint16_t n)
> +{
> + while (n--)
> + rte_pktmbuf_free(pkts[n]);
> +}
> +
>  static __rte_always_inline void
> -virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
> +complete_async_pkts(struct vhost_dev *vdev)
> +{
> + struct rte_mbuf *p_cpl[MAX_PKT_BURST];
> + uint16_t complete_count;
> +
> + complete_count = rte_vhost_poll_enqueue_completed(vdev->vid,
> + VIRTIO_RXQ, p_cpl,
> MAX_PKT_BURST);
> + if (complete_count) {
> + atomic_fetch_sub(&vdev->nr_async_pkts, complete_count);
> + free_pkts(p_cpl, complete_count);
> + }
> +}
> +
> +static __rte_always_inline void
> +sync_virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
>   struct rte_mbuf *m)
>  {
>   uint16_t ret;
> - struct rte_mbuf *m_cpl[1];
> 
>   if (builtin_net_driver) {
>   ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
> - } else if (async_vhost_driver) {
> - ret = rte_vhost_submit_enqueue_burst(dst_vdev->vid,
> VIRTIO_RXQ,
> - &m, 1);
> -
> - if (likely(ret))
> - dst_vdev->nr_async_pkts++;
> -
> - while (likely(dst_vdev->nr_async_pkts)) {
> - if (rte_vhost_poll_enqueue_completed(dst_vdev-
> >vid,
> - VIRTIO_RXQ, m_cpl, 1))
> - dst_vdev->nr_async_pkts--;
> - }
>   } else {
>   ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ,
> &m, 1);
>   }
> 
>   if (enable_stats) {
> - rte_atomic64_inc(&dst_vdev->stats.rx_total_atomic);
> - rte_atomic64_add(&dst_vdev->stats.rx_atomic, ret);
> + atomic_fetch_add(&dst_vdev->stats.rx_total_atomic, 1);
> + atomic_fetch_add(&dst_vdev->stats.rx_atomic, ret);
>   src_vdev->stats.tx_total++;
>   src_vdev->stats.tx += ret;
>   }
>  }
> 
> +static __rte_always_inline void
> +drain_vhost(struct vhost_dev *vdev)
> +{
> + uint16_t ret;
> + uint64_t queue_id = rte_lcore_id() * MAX_VHOST_DEVICE + vdev-
> >vid;
> + uint16_t nr_xmit = vhost_txbuff[queue_id]->len;
> + struct rte_mbuf **m = vhost_txbuff[queue_id]->m_table;

"queue_id" is not a very good name, as it's not the queue id of vhost device,
but a buffer index which holds pkts to enqueue.

> +
> + if (builtin_net_driver) {
> + ret = vs_enqueue_pkts(vdev, VIRTIO_RXQ, m, nr_xmit);
> + } else if (async_vhost_driver) {
> + uint32_t cpu_cpl_nr = 0;
> + uint16_t enqueue_fail = 0;
> + struct rte_mbuf *m_cpu_cpl[nr_xmit];
> +
> + complete_async_pkts(vdev);
> + ret = rte_vhost_submit_enqueue_burst(vdev->vid,
> VIRTIO_RXQ,
> + m, nr_xmit, m_cpu_cpl, &cpu_cpl_nr);
> + atomic_fetch_add(&vdev->nr_async_pkts, ret - cpu_cpl_nr);
> +
> + if (cpu_cpl_nr)
> + free_pkts(m_cpu_cpl, cpu_cpl_nr);
> +
> +

[dpdk-dev] [PATCH v3 2/5] net/iavf: support Ethernet CRC strip disable

2020-12-27 Thread Haiyue Wang
The VF will check the PF's CRC strip capability first, then set the
'CRC strip disable' value in the queue configuration according to the
RX CRC offload setting.

Signed-off-by: Haiyue Wang 
---
 drivers/net/iavf/iavf_ethdev.c | 3 +++
 drivers/net/iavf/iavf_rxtx.c   | 6 +-
 drivers/net/iavf/iavf_vchnl.c  | 3 ++-
 3 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/drivers/net/iavf/iavf_ethdev.c b/drivers/net/iavf/iavf_ethdev.c
index d2fa16825..75361b73b 100644
--- a/drivers/net/iavf/iavf_ethdev.c
+++ b/drivers/net/iavf/iavf_ethdev.c
@@ -798,6 +798,9 @@ iavf_dev_info_get(struct rte_eth_dev *dev, struct 
rte_eth_dev_info *dev_info)
DEV_TX_OFFLOAD_MULTI_SEGS |
DEV_TX_OFFLOAD_MBUF_FAST_FREE;
 
+   if (vf->vf_res->vf_cap_flags & VIRTCHNL_VF_OFFLOAD_CRC)
+   dev_info->rx_offload_capa |= DEV_RX_OFFLOAD_KEEP_CRC;
+
dev_info->default_rxconf = (struct rte_eth_rxconf) {
.rx_free_thresh = IAVF_DEFAULT_RX_FREE_THRESH,
.rx_drop_en = 0,
diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index 21d508b3f..d53d7b984 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -550,11 +550,15 @@ iavf_dev_rx_queue_setup(struct rte_eth_dev *dev, uint16_t 
queue_idx,
rxq->rx_free_thresh = rx_free_thresh;
rxq->queue_id = queue_idx;
rxq->port_id = dev->data->port_id;
-   rxq->crc_len = 0; /* crc stripping by default */
rxq->rx_deferred_start = rx_conf->rx_deferred_start;
rxq->rx_hdr_len = 0;
rxq->vsi = vsi;
 
+   if (dev->data->dev_conf.rxmode.offloads & DEV_RX_OFFLOAD_KEEP_CRC)
+   rxq->crc_len = RTE_ETHER_CRC_LEN;
+   else
+   rxq->crc_len = 0;
+
len = rte_pktmbuf_data_room_size(rxq->mp) - RTE_PKTMBUF_HEADROOM;
rxq->rx_buf_len = RTE_ALIGN(len, (1 << IAVF_RXQ_CTX_DBUFF_SHIFT));
 
diff --git a/drivers/net/iavf/iavf_vchnl.c b/drivers/net/iavf/iavf_vchnl.c
index 25d5cdaf5..c33194cdc 100644
--- a/drivers/net/iavf/iavf_vchnl.c
+++ b/drivers/net/iavf/iavf_vchnl.c
@@ -458,6 +458,7 @@ iavf_get_vf_resource(struct iavf_adapter *adapter)
VIRTCHNL_VF_OFFLOAD_FDIR_PF |
VIRTCHNL_VF_OFFLOAD_ADV_RSS_PF |
VIRTCHNL_VF_OFFLOAD_REQ_QUEUES |
+   VIRTCHNL_VF_OFFLOAD_CRC |
VIRTCHNL_VF_LARGE_NUM_QPAIRS;
 
	args.in_args = (uint8_t *)&caps;
@@ -853,7 +854,7 @@ iavf_configure_queues(struct iavf_adapter *adapter,
vc_qp->rxq.ring_len = rxq[i]->nb_rx_desc;
vc_qp->rxq.dma_ring_addr = rxq[i]->rx_ring_phys_addr;
vc_qp->rxq.databuffer_size = rxq[i]->rx_buf_len;
-
+   vc_qp->rxq.crc_disable = rxq[i]->crc_len != 0 ? 1 : 0;
 #ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC
if (vf->vf_res->vf_cap_flags &
VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC &&
-- 
2.29.2



[dpdk-dev] [PATCH v3 1/5] common/iavf: new VLAN opcode

2020-12-27 Thread Haiyue Wang
Add new VLAN opcode support.

Signed-off-by: Haiyue Wang 
---
 drivers/common/iavf/virtchnl.h | 259 +
 1 file changed, 259 insertions(+)

diff --git a/drivers/common/iavf/virtchnl.h b/drivers/common/iavf/virtchnl.h
index fcbaa31fa..13788e46b 100644
--- a/drivers/common/iavf/virtchnl.h
+++ b/drivers/common/iavf/virtchnl.h
@@ -129,6 +129,7 @@ enum virtchnl_ops {
VIRTCHNL_OP_ADD_CLOUD_FILTER = 32,
VIRTCHNL_OP_DEL_CLOUD_FILTER = 33,
/* opcodes 34, 35, 36, 37 and 38 are reserved */
+   VIRTCHNL_OP_DCF_VLAN_OFFLOAD = 38,
VIRTCHNL_OP_DCF_CMD_DESC = 39,
VIRTCHNL_OP_DCF_CMD_BUFF = 40,
VIRTCHNL_OP_DCF_DISABLE = 41,
@@ -141,6 +142,11 @@ enum virtchnl_ops {
VIRTCHNL_OP_DEL_FDIR_FILTER = 48,
VIRTCHNL_OP_QUERY_FDIR_FILTER = 49,
VIRTCHNL_OP_GET_MAX_RSS_QREGION = 50,
+   VIRTCHNL_OP_GET_OFFLOAD_VLAN_V2_CAPS = 51,
+   VIRTCHNL_OP_ADD_VLAN_V2 = 52,
+   VIRTCHNL_OP_DEL_VLAN_V2 = 53,
+   VIRTCHNL_OP_ENABLE_VLAN_STRIPPING_V2 = 54,
+   VIRTCHNL_OP_DISABLE_VLAN_STRIPPING_V2 = 55,
VIRTCHNL_OP_ENABLE_QUEUES_V2 = 107,
VIRTCHNL_OP_DISABLE_QUEUES_V2 = 108,
VIRTCHNL_OP_MAP_QUEUE_VECTOR = 111,
@@ -251,6 +257,7 @@ VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_vsi_resource);
 #define VIRTCHNL_VF_OFFLOAD_CRC		0x0080
/* 0X0100 is reserved */
 #define VIRTCHNL_VF_LARGE_NUM_QPAIRS   0x0200
+#define VIRTCHNL_VF_OFFLOAD_VLAN_V2		0x8000
 #define VIRTCHNL_VF_OFFLOAD_VLAN   0x0001
 #define VIRTCHNL_VF_OFFLOAD_RX_POLLING 0x0002
 #define VIRTCHNL_VF_OFFLOAD_RSS_PCTYPE_V2  0x0004
@@ -536,6 +543,202 @@ struct virtchnl_vlan_filter_list {
 
 VIRTCHNL_CHECK_STRUCT_LEN(6, virtchnl_vlan_filter_list);
 
+/* This enum is used for all of the VIRTCHNL_VF_OFFLOAD_VLAN_V2_CAPS related
+ * structures and opcodes.
+ *
+ * VIRTCHNL_VLAN_UNSUPPORTED - This field is not supported and if a VF driver
+ * populates it the PF should return VIRTCHNL_STATUS_ERR_NOT_SUPPORTED.
+ *
+ * VIRTCHNL_VLAN_ETHERTYPE_8100 - This field supports 0x8100 ethertype.
+ * VIRTCHNL_VLAN_ETHERTYPE_88A8 - This field supports 0x88A8 ethertype.
+ * VIRTCHNL_VLAN_ETHERTYPE_9100 - This field supports 0x9100 ethertype.
+ *
+ * VIRTCHNL_VLAN_ETHERTYPE_AND - Used when multiple ethertypes can be supported
+ * by the PF concurrently. For example, if the PF can support
+ * VIRTCHNL_VLAN_ETHERTYPE_8100 AND VIRTCHNL_VLAN_ETHERTYPE_88A8 filters it
+ * would OR the following in the virtchnl_vlan_filtering_caps.outer field:
+ *
+ * VIRTCHNL_VLAN_ETHERTYPE_8100 |
+ * VIRTCHNL_VLAN_ETHERTYPE_88A8 |
+ * VIRTCHNL_VLAN_ETHERTYPE_AND;
+ *
+ * The VF would interpret this as VLAN filtering can be supported on both 
0x8100
+ * and 0x88A8 VLAN ethertypes.
+ *
+ * VIRTCHNL_ETHERTYPE_XOR - Used when only a single ethertype can be supported
+ * by the PF concurrently. For example if the PF can support
+ * VIRTCHNL_VLAN_ETHERTYPE_8100 XOR VIRTCHNL_VLAN_ETHERTYPE_88A8 stripping
+ * offload it would OR the following in the
+ * virtchnl_vlan_offload_caps.outer_stripping field:
+ *
+ * VIRTCHNL_VLAN_ETHERTYPE_8100 |
+ * VIRTCHNL_VLAN_ETHERTYPE_88A8 |
+ * VIRTCHNL_VLAN_ETHERTYPE_XOR;
+ *
+ * The VF would interpret this as VLAN stripping can be supported on either
+ * 0x8100 or 0x88a8 VLAN ethertypes. So when requesting VLAN stripping via
+ * VIRTCHNL_OP_ENABLE_VLAN_STRIPPING_V2 the specified ethertype will override
+ * the previously set value.
+ *
+ * VIRTCHNL_VLAN_PRIO - This field supports VLAN priority bits. This is used 
for
+ * VLAN filtering if the underlying PF supports it.
+ *
+ * VIRTCHNL_VLAN_TOGGLE_ALLOWED - This field is used to say whether a
+ * certain VLAN capability can be toggled. For example if the underlying PF/CP
+ * allows the VF to toggle VLAN filtering, stripping, and/or insertion it 
should
+ * set this bit along with the supported ethertypes.
+ */
+enum virtchnl_vlan_support {
+   VIRTCHNL_VLAN_UNSUPPORTED = 0,
+   VIRTCHNL_VLAN_ETHERTYPE_8100 =  0x0001,
+   VIRTCHNL_VLAN_ETHERTYPE_88A8 =  0x0002,
+   VIRTCHNL_VLAN_ETHERTYPE_9100 =  0x0004,
+   VIRTCHNL_VLAN_PRIO =0x0100,
+   VIRTCHNL_VLAN_FILTER_MASK = 0x1000,
+   VIRTCHNL_VLAN_ETHERTYPE_AND =   0x2000,
+   VIRTCHNL_VLAN_ETHERTYPE_XOR =   0x4000,
+   VIRTCHNL_VLAN_TOGGLE =  0x8000,
+};
+
+/* The PF populates these fields based on the supported VLAN filtering. If a
+ * field is VIRTCHNL_VLAN_UNSUPPORTED then it's not supported and the PF will
+ * reject any VIRTCHNL_OP_ADD_VLAN_V2 or VIRTCHNL_OP_DEL_VLAN_V2 messages using
+ * the unsupported fields.
+ *
+ * Also, a VF is only allowed to toggle its VLAN filtering setting if the
+ * VIRTCHNL_VFLAN_TOGGLE_ALLOWED bit is set.
+ *
+ * The max_filters field tells the VF how many VLAN filters it's allowed to 
have
+ * at any one time. If it exceeds this amount and tries to add an

[dpdk-dev] [PATCH v3 3/5] net/ice: enable QinQ filter for switch

2020-12-27 Thread Haiyue Wang
Enable double VLAN support for the QinQ switch filter.

Signed-off-by: Wei Zhao 
Signed-off-by: Haiyue Wang 
---
 drivers/net/ice/ice_generic_flow.c  |   8 +++
 drivers/net/ice/ice_generic_flow.h  |   1 +
 drivers/net/ice/ice_switch_filter.c | 104 +---
 3 files changed, 102 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ice/ice_generic_flow.c 
b/drivers/net/ice/ice_generic_flow.c
index 1429cbc3b..1712d3b2e 100644
--- a/drivers/net/ice/ice_generic_flow.c
+++ b/drivers/net/ice/ice_generic_flow.c
@@ -1455,6 +1455,14 @@ enum rte_flow_item_type pattern_eth_qinq_pppoes[] = {
RTE_FLOW_ITEM_TYPE_PPPOES,
RTE_FLOW_ITEM_TYPE_END,
 };
+enum rte_flow_item_type pattern_eth_qinq_pppoes_proto[] = {
+   RTE_FLOW_ITEM_TYPE_ETH,
+   RTE_FLOW_ITEM_TYPE_VLAN,
+   RTE_FLOW_ITEM_TYPE_VLAN,
+   RTE_FLOW_ITEM_TYPE_PPPOES,
+   RTE_FLOW_ITEM_TYPE_PPPOE_PROTO_ID,
+   RTE_FLOW_ITEM_TYPE_END,
+};
 enum rte_flow_item_type pattern_eth_pppoes_ipv4[] = {
RTE_FLOW_ITEM_TYPE_ETH,
RTE_FLOW_ITEM_TYPE_PPPOES,
diff --git a/drivers/net/ice/ice_generic_flow.h 
b/drivers/net/ice/ice_generic_flow.h
index 434d2f425..dc45d8dc6 100644
--- a/drivers/net/ice/ice_generic_flow.h
+++ b/drivers/net/ice/ice_generic_flow.h
@@ -426,6 +426,7 @@ extern enum rte_flow_item_type pattern_eth_pppoes_proto[];
 extern enum rte_flow_item_type pattern_eth_vlan_pppoes[];
 extern enum rte_flow_item_type pattern_eth_vlan_pppoes_proto[];
 extern enum rte_flow_item_type pattern_eth_qinq_pppoes[];
+extern enum rte_flow_item_type pattern_eth_qinq_pppoes_proto[];
 extern enum rte_flow_item_type pattern_eth_pppoes_ipv4[];
 extern enum rte_flow_item_type pattern_eth_vlan_pppoes_ipv4[];
 extern enum rte_flow_item_type pattern_eth_qinq_pppoes_ipv4[];
diff --git a/drivers/net/ice/ice_switch_filter.c b/drivers/net/ice/ice_switch_filter.c
index 8cba6eb7b..43c755e30 100644
--- a/drivers/net/ice/ice_switch_filter.c
+++ b/drivers/net/ice/ice_switch_filter.c
@@ -35,11 +35,15 @@
 #define ICE_SW_INSET_ETHER ( \
ICE_INSET_DMAC | ICE_INSET_SMAC | ICE_INSET_ETHERTYPE)
 #define ICE_SW_INSET_MAC_VLAN ( \
-   ICE_INSET_DMAC | ICE_INSET_SMAC | ICE_INSET_ETHERTYPE | \
-   ICE_INSET_VLAN_OUTER)
+   ICE_INSET_DMAC | ICE_INSET_SMAC | ICE_INSET_ETHERTYPE | \
+   ICE_INSET_VLAN_INNER)
+#define ICE_SW_INSET_MAC_QINQ  ( \
+   ICE_SW_INSET_MAC_VLAN | ICE_INSET_VLAN_OUTER)
 #define ICE_SW_INSET_MAC_IPV4 ( \
ICE_INSET_DMAC | ICE_INSET_IPV4_DST | ICE_INSET_IPV4_SRC | \
ICE_INSET_IPV4_PROTO | ICE_INSET_IPV4_TTL | ICE_INSET_IPV4_TOS)
+#define ICE_SW_INSET_MAC_QINQ_IPV4 ( \
+   ICE_SW_INSET_MAC_QINQ | ICE_SW_INSET_MAC_IPV4)
 #define ICE_SW_INSET_MAC_IPV4_TCP ( \
ICE_INSET_DMAC | ICE_INSET_IPV4_DST | ICE_INSET_IPV4_SRC | \
ICE_INSET_IPV4_TTL | ICE_INSET_IPV4_TOS | \
@@ -52,6 +56,8 @@
ICE_INSET_DMAC | ICE_INSET_IPV6_DST | ICE_INSET_IPV6_SRC | \
ICE_INSET_IPV6_TC | ICE_INSET_IPV6_HOP_LIMIT | \
ICE_INSET_IPV6_NEXT_HDR)
+#define ICE_SW_INSET_MAC_QINQ_IPV6 ( \
+   ICE_SW_INSET_MAC_QINQ | ICE_SW_INSET_MAC_IPV6)
 #define ICE_SW_INSET_MAC_IPV6_TCP ( \
ICE_INSET_DMAC | ICE_INSET_IPV6_DST | ICE_INSET_IPV6_SRC | \
ICE_INSET_IPV6_HOP_LIMIT | ICE_INSET_IPV6_TC | \
@@ -182,6 +188,8 @@ ice_pattern_match_item ice_switch_pattern_dist_comms[] = {
ICE_SW_INSET_ETHER, ICE_INSET_NONE},
{pattern_ethertype_vlan,
ICE_SW_INSET_MAC_VLAN, ICE_INSET_NONE},
+   {pattern_ethertype_qinq,
+   ICE_SW_INSET_MAC_QINQ, ICE_INSET_NONE},
{pattern_eth_arp,
ICE_INSET_NONE, ICE_INSET_NONE},
{pattern_eth_ipv4,
@@ -262,6 +270,18 @@ ice_pattern_match_item ice_switch_pattern_dist_comms[] = {
ICE_INSET_NONE, ICE_INSET_NONE},
{pattern_eth_ipv6_pfcp,
ICE_INSET_NONE, ICE_INSET_NONE},
+   {pattern_eth_qinq_ipv4,
+   ICE_SW_INSET_MAC_QINQ_IPV4, ICE_INSET_NONE},
+   {pattern_eth_qinq_ipv6,
+   ICE_SW_INSET_MAC_QINQ_IPV6, ICE_INSET_NONE},
+   {pattern_eth_qinq_pppoes,
+   ICE_SW_INSET_MAC_PPPOE, ICE_INSET_NONE},
+   {pattern_eth_qinq_pppoes_proto,
+   ICE_SW_INSET_MAC_PPPOE_PROTO, ICE_INSET_NONE},
+   {pattern_eth_qinq_pppoes_ipv4,
+   ICE_SW_INSET_MAC_PPPOE_IPV4, ICE_INSET_NONE},
+   {pattern_eth_qinq_pppoes_ipv6,
+   ICE_SW_INSET_MAC_PPPOE_IPV6, ICE_INSET_NONE},
 };
 
 static struct
@@ -304,6 +324,8 @@ ice_pattern_match_item ice_switch_pattern_perm_comms[] = {
ICE_SW_INSET_ETHER, ICE_INSET_NONE},
{pattern_ethertype_vlan,
ICE_SW_INSET_MAC_VLAN, ICE_INSET_NONE},
+   {pattern_ethertype_qinq,
+   ICE_SW_INSET_MAC_QINQ, ICE_INSET_NONE},
{pattern_eth_arp,

[dpdk-dev] [PATCH v3 0/5] Add AVF & DCF VLAN feaure

2020-12-27 Thread Haiyue Wang
Add new VLAN feature, which has rich settings.

v3: code refactor for handing QinQ according to the VC response.

Haiyue Wang (5):
  common/iavf: new VLAN opcode
  net/iavf: support Ethernet CRC strip disable
  net/ice: enable QinQ filter for switch
  net/ice: add the DCF VLAN handling
  net/iavf: implement new VLAN capability handling

 drivers/common/iavf/virtchnl.h   | 259 +
 drivers/net/iavf/iavf.h  |  10 +
 drivers/net/iavf/iavf_ethdev.c   | 110 +++
 drivers/net/iavf/iavf_rxtx.c |   6 +-
 drivers/net/iavf/iavf_vchnl.c| 144 -
 drivers/net/ice/ice_dcf.c|   1 +
 drivers/net/ice/ice_dcf_ethdev.c |  91 +-
 drivers/net/ice/ice_dcf_ethdev.h |  20 ++
 drivers/net/ice/ice_dcf_vf_representor.c | 356 +++
 drivers/net/ice/ice_generic_flow.c   |   8 +
 drivers/net/ice/ice_generic_flow.h   |   1 +
 drivers/net/ice/ice_switch_filter.c  | 104 ++-
 drivers/net/ice/meson.build  |   1 +
 13 files changed, 1091 insertions(+), 20 deletions(-)
 create mode 100644 drivers/net/ice/ice_dcf_vf_representor.c

-- 
2.29.2



[dpdk-dev] [PATCH v3 5/5] net/iavf: implement new VLAN capability handling

2020-12-27 Thread Haiyue Wang
The new VLAN virtchnl opcodes introduce new settings, such as filtering and
stripping with different TPIDs.
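
One illustrative testpmd sequence to exercise the new TPID handling (command
spellings per the testpmd user guide; values are placeholders):

    testpmd> vlan set outer tpid 0x88a8 0
    testpmd> vlan set strip on 0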

Signed-off-by: Qiming Yang 
Signed-off-by: Haiyue Wang 
---
 drivers/net/iavf/iavf.h|  10 +++
 drivers/net/iavf/iavf_ethdev.c | 107 +
 drivers/net/iavf/iavf_vchnl.c  | 141 +
 3 files changed, 258 insertions(+)

diff --git a/drivers/net/iavf/iavf.h b/drivers/net/iavf/iavf.h
index 9754273b2..c5d53bd9c 100644
--- a/drivers/net/iavf/iavf.h
+++ b/drivers/net/iavf/iavf.h
@@ -139,6 +139,7 @@ struct iavf_info {
struct virtchnl_version_info virtchnl_version;
struct virtchnl_vf_resource *vf_res; /* VF resource */
struct virtchnl_vsi_resource *vsi_res; /* LAN VSI */
+   struct virtchnl_vlan_caps vlan_v2_caps;
uint64_t supported_rxdid;
uint8_t *proto_xtr; /* proto xtr type for all queues */
volatile enum virtchnl_ops pend_cmd; /* pending command not finished */
@@ -173,6 +174,10 @@ struct iavf_info {
struct iavf_fdir_info fdir; /* flow director info */
/* indicate large VF support enabled or not */
bool lv_enabled;
+
+   /* used to set the VLAN Ethernet type for virtchnl VLAN V2 */
+   uint16_t outer_vlan_tpid;
+   uint16_t inner_vlan_tpid;
 };
 
 #define IAVF_MAX_PKT_TYPE 1024
@@ -297,6 +302,8 @@ int iavf_get_vf_resource(struct iavf_adapter *adapter);
 void iavf_handle_virtchnl_msg(struct rte_eth_dev *dev);
 int iavf_enable_vlan_strip(struct iavf_adapter *adapter);
 int iavf_disable_vlan_strip(struct iavf_adapter *adapter);
+int iavf_config_vlan_strip_v2(struct iavf_adapter *adapter, uint16_t tpid,
+ bool enable);
 int iavf_switch_queue(struct iavf_adapter *adapter, uint16_t qid,
 bool rx, bool on);
 int iavf_switch_queue_lv(struct iavf_adapter *adapter, uint16_t qid,
@@ -310,6 +317,7 @@ int iavf_configure_rss_key(struct iavf_adapter *adapter);
 int iavf_configure_queues(struct iavf_adapter *adapter,
uint16_t num_queue_pairs, uint16_t index);
 int iavf_get_supported_rxdid(struct iavf_adapter *adapter);
+int iavf_get_vlan_offload_caps_v2(struct iavf_adapter *adapter);
 int iavf_config_irq_map(struct iavf_adapter *adapter);
 int iavf_config_irq_map_lv(struct iavf_adapter *adapter, uint16_t num,
uint16_t index);
@@ -323,6 +331,8 @@ int iavf_config_promisc(struct iavf_adapter *adapter, bool enable_unicast,
 int iavf_add_del_eth_addr(struct iavf_adapter *adapter,
 struct rte_ether_addr *addr, bool add);
 int iavf_add_del_vlan(struct iavf_adapter *adapter, uint16_t vlanid, bool add);
+int iavf_add_del_vlan_v2(struct iavf_adapter *adapter, uint16_t tpid,
+uint16_t vlanid, bool add);
 int iavf_fdir_add(struct iavf_adapter *adapter, struct iavf_fdir_conf *filter);
 int iavf_fdir_del(struct iavf_adapter *adapter, struct iavf_fdir_conf *filter);
 int iavf_fdir_check(struct iavf_adapter *adapter,
diff --git a/drivers/net/iavf/iavf_ethdev.c b/drivers/net/iavf/iavf_ethdev.c
index 75361b73b..d6771c0d9 100644
--- a/drivers/net/iavf/iavf_ethdev.c
+++ b/drivers/net/iavf/iavf_ethdev.c
@@ -100,6 +100,8 @@ static void iavf_dev_del_mac_addr(struct rte_eth_dev *dev, uint32_t index);
 static int iavf_dev_vlan_filter_set(struct rte_eth_dev *dev,
   uint16_t vlan_id, int on);
 static int iavf_dev_vlan_offload_set(struct rte_eth_dev *dev, int mask);
+static int iavf_dev_vlan_tpid_set(struct rte_eth_dev *dev,
+ enum rte_vlan_type vlan_type, uint16_t tpid);
 static int iavf_dev_rss_reta_update(struct rte_eth_dev *dev,
   struct rte_eth_rss_reta_entry64 *reta_conf,
   uint16_t reta_size);
@@ -176,6 +178,7 @@ static const struct eth_dev_ops iavf_eth_dev_ops = {
.mac_addr_remove= iavf_dev_del_mac_addr,
.set_mc_addr_list   = iavf_set_mc_addr_list,
.vlan_filter_set= iavf_dev_vlan_filter_set,
+   .vlan_tpid_set  = iavf_dev_vlan_tpid_set,
.vlan_offload_set   = iavf_dev_vlan_offload_set,
.rx_queue_start = iavf_dev_rx_queue_start,
.rx_queue_stop  = iavf_dev_rx_queue_stop,
@@ -326,6 +329,18 @@ iavf_queues_req_reset(struct rte_eth_dev *dev, uint16_t num)
return 0;
 }
 
+static inline uint16_t
+iavf_curr_vlan_tpid(struct rte_eth_dev *dev)
+{
+   struct iavf_adapter *adapter =
+   IAVF_DEV_PRIVATE_TO_ADAPTER(dev->data->dev_private);
+   struct iavf_info *vf = IAVF_DEV_PRIVATE_TO_VF(adapter);
+   bool qinq = !!(dev->data->dev_conf.rxmode.offloads &
+  DEV_RX_OFFLOAD_VLAN_EXTEND);
+
+   return qinq ? vf->outer_vlan_tpid : vf->inner_vlan_tpid;
+}
+
 static int
 iavf_dev_configure(struct rte_eth_dev *dev)
 {
@@ -387,6 +402,12 @@ iavf_dev_configure(struct rt

[dpdk-dev] [PATCH v3 4/5] net/ice: add the DCF VLAN handling

2020-12-27 Thread Haiyue Wang
Add the DCF port representor infrastructure for the VFs of the DCF-attached
PF. The standard ethdev APIs, such as VLAN configuration, can then be used to
configure the VFs.
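
With representors requested via devargs, a DCF port and its VF representors
might be probed along these lines (sketch only; the PCI address and
representor list are placeholders, see the ice DCF documentation for the
exact syntax):

    dpdk-testpmd -a 0000:18:01.0,cap=dcf,representor=[0-1] -- -i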

Signed-off-by: Qiming Yang 
Signed-off-by: Haiyue Wang 
---
 drivers/net/ice/ice_dcf.c|   1 +
 drivers/net/ice/ice_dcf_ethdev.c |  91 +-
 drivers/net/ice/ice_dcf_ethdev.h |  20 ++
 drivers/net/ice/ice_dcf_vf_representor.c | 356 +++
 drivers/net/ice/meson.build  |   1 +
 5 files changed, 462 insertions(+), 7 deletions(-)
 create mode 100644 drivers/net/ice/ice_dcf_vf_representor.c

diff --git a/drivers/net/ice/ice_dcf.c b/drivers/net/ice/ice_dcf.c
index 44dbd3bb8..4a9af3292 100644
--- a/drivers/net/ice/ice_dcf.c
+++ b/drivers/net/ice/ice_dcf.c
@@ -234,6 +234,7 @@ ice_dcf_get_vf_resource(struct ice_dcf_hw *hw)
 
caps = VIRTCHNL_VF_OFFLOAD_WB_ON_ITR | VIRTCHNL_VF_OFFLOAD_RX_POLLING |
   VIRTCHNL_VF_CAP_ADV_LINK_SPEED | VIRTCHNL_VF_CAP_DCF |
+  VIRTCHNL_VF_OFFLOAD_VLAN_V2 |
   VF_BASE_MODE_OFFLOADS | VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC;
 
err = ice_dcf_send_cmd_req_no_irq(hw, VIRTCHNL_OP_GET_VF_RESOURCES,
diff --git a/drivers/net/ice/ice_dcf_ethdev.c b/drivers/net/ice/ice_dcf_ethdev.c
index b0b2ecb0d..a9e78064d 100644
--- a/drivers/net/ice/ice_dcf_ethdev.c
+++ b/drivers/net/ice/ice_dcf_ethdev.c
@@ -970,20 +970,97 @@ ice_dcf_cap_selected(struct rte_devargs *devargs)
return ret;
 }
 
-static int eth_ice_dcf_pci_probe(__rte_unused struct rte_pci_driver *pci_drv,
-struct rte_pci_device *pci_dev)
+static int
+eth_ice_dcf_pci_probe(__rte_unused struct rte_pci_driver *pci_drv,
+ struct rte_pci_device *pci_dev)
 {
+   struct rte_eth_devargs eth_da = { .nb_representor_ports = 0 };
+   struct ice_dcf_vf_repr_param repr_param;
+   char repr_name[RTE_ETH_NAME_MAX_LEN];
+   struct ice_dcf_adapter *dcf_adapter;
+   struct rte_eth_dev *dcf_ethdev;
+   uint16_t dcf_vsi_id;
+   int i, ret;
+
if (!ice_dcf_cap_selected(pci_dev->device.devargs))
return 1;
 
-   return rte_eth_dev_pci_generic_probe(pci_dev,
-sizeof(struct ice_dcf_adapter),
-ice_dcf_dev_init);
+   ret = rte_eth_devargs_parse(pci_dev->device.devargs->args, ð_da);
+   if (ret)
+   return ret;
+
+   ret = rte_eth_dev_pci_generic_probe(pci_dev,
+   sizeof(struct ice_dcf_adapter),
+   ice_dcf_dev_init);
+   if (ret || !eth_da.nb_representor_ports)
+   return ret;
+
+   dcf_ethdev = rte_eth_dev_allocated(pci_dev->device.name);
+   if (dcf_ethdev == NULL)
+   return -ENODEV;
+
+   dcf_adapter = dcf_ethdev->data->dev_private;
+
+   if (eth_da.nb_representor_ports > dcf_adapter->real_hw.num_vfs ||
+   eth_da.nb_representor_ports >= RTE_MAX_ETHPORTS) {
+   PMD_DRV_LOG(ERR, "the number of port representors is too large: %u",
+   eth_da.nb_representor_ports);
+   return -EINVAL;
+   }
+
+   dcf_vsi_id = dcf_adapter->real_hw.vsi_id | VIRTCHNL_DCF_VF_VSI_VALID;
+
+   repr_param.adapter = dcf_adapter;
+   repr_param.switch_domain_id = 0;
+
+   for (i = 0; i < eth_da.nb_representor_ports; i++) {
+   uint16_t vf_id = eth_da.representor_ports[i];
+
+   if (vf_id >= dcf_adapter->real_hw.num_vfs) {
+   PMD_DRV_LOG(ERR, "VF ID %u is out of range (0 ~ %u)",
+   vf_id, dcf_adapter->real_hw.num_vfs - 1);
+   ret = -EINVAL;
+   break;
+   }
+
+   if (dcf_adapter->real_hw.vf_vsi_map[vf_id] == dcf_vsi_id) {
+   PMD_DRV_LOG(ERR, "VF ID %u is DCF's ID.\n", vf_id);
+   ret = -EINVAL;
+   break;
+   }
+
+   repr_param.vf_id = vf_id;
+   snprintf(repr_name, sizeof(repr_name), "net_%s_representor_%u",
+pci_dev->device.name, vf_id);
+   ret = rte_eth_dev_create(&pci_dev->device, repr_name,
+sizeof(struct ice_dcf_vf_repr),
+NULL, NULL, ice_dcf_vf_repr_init,
+&repr_param);
+   if (ret) {
+   PMD_DRV_LOG(ERR, "failed to create DCF VF representor %s",
+   repr_name);
+   break;
+   }
+   }
+
+   return ret;
 }
 
-static int eth_ice_dcf_pci_remove(struct rte_pci_device *pci_dev)
+static int
+eth_ice_dcf_pci_remove(struct rte_pci_device *pci_dev)
 {
-   return rte_eth_dev_pci_generic_remove(pci_dev, ice_dcf_dev_uninit);
+   struct

[dpdk-dev] [PATCH] net/i40e: fix flex payload rule conflict issue

2020-12-27 Thread beilei . xing
From: Beilei Xing 

With the following commands, the second flow can't
be created successfully.

1. flow create 0 ingress pattern eth / ipv4 / udp /
   raw relative is 1 pattern is 0102030405 / end
   actions drop / end
2. flow destroy 0 rule 0
3. flow create 0 ingress pattern eth / ipv4 / udp /
   raw relative is 1 pattern is 010203040506 / end
   actions drop / end

The root cause is that a flag for flex pit isn't reset.

Reported-by: Chenmin Sun
Signed-off-by: Beilei Xing 
---
 drivers/net/i40e/i40e_flow.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/net/i40e/i40e_flow.c b/drivers/net/i40e/i40e_flow.c
index b09ff6590d..3a68274a23 100644
--- a/drivers/net/i40e/i40e_flow.c
+++ b/drivers/net/i40e/i40e_flow.c
@@ -5284,6 +5284,7 @@ i40e_flow_destroy(struct rte_eth_dev *dev,
enum rte_filter_type filter_type = flow->filter_type;
struct i40e_fdir_info *fdir_info = &pf->fdir;
int ret = 0;
+   int i;
 
switch (filter_type) {
case RTE_ETH_FILTER_ETHERTYPE:
@@ -5299,9 +5300,10 @@ i40e_flow_destroy(struct rte_eth_dev *dev,
&((struct i40e_fdir_filter *)flow->rule)->fdir,
0);
 
-   /* If the last flow is destroyed, disable fdir. */
if (!ret && TAILQ_EMPTY(&pf->fdir.fdir_list)) {
i40e_fdir_rx_proc_enable(dev, 0);
+   for (i = 0; i < I40E_MAX_FLXPLD_LAYER; i++)
+   pf->fdir.flex_pit_flag[i] = 0;
}
break;
case RTE_ETH_FILTER_HASH:
@@ -5515,6 +5517,9 @@ i40e_flow_flush_fdir_filter(struct i40e_pf *pf)
pf->fdir.flex_mask_flag[pctype] = 0;
}
 
+   for (i = 0; i < I40E_MAX_FLXPLD_LAYER; i++)
+   pf->fdir.flex_pit_flag[i] = 0;
+
/* Disable FDIR processing as all FDIR rules are now flushed */
i40e_fdir_rx_proc_enable(dev, 0);
}
-- 
2.26.2



[dpdk-dev] [PATCH v2] net/i40e: fix flex payload rule conflict issue

2020-12-27 Thread beilei . xing
From: Beilei Xing 

With the following commands, the second flow can't
be created successfully.

1. flow create 0 ingress pattern eth / ipv4 / udp /
   raw relative is 1 pattern is 0102030405 / end
   actions drop / end
2. flow destroy 0 rule 0
3. flow create 0 ingress pattern eth / ipv4 / udp /
   raw relative is 1 pattern is 010203040506 / end
   actions drop / end

The root cause is that a flag for flex pit isn't reset.

Fixes: 6ced3dd72f5f ("net/i40e: support flexible payload parsing for FDIR")
Cc: sta...@dpdk.org

Reported-by: Chenmin Sun
Signed-off-by: Beilei Xing 
---

v2 changes:
 - Add fix line.
 - Refine comments.

 drivers/net/i40e/i40e_flow.c | 10 +-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/drivers/net/i40e/i40e_flow.c b/drivers/net/i40e/i40e_flow.c
index b09ff6590d..65e0b69356 100644
--- a/drivers/net/i40e/i40e_flow.c
+++ b/drivers/net/i40e/i40e_flow.c
@@ -5284,6 +5284,7 @@ i40e_flow_destroy(struct rte_eth_dev *dev,
enum rte_filter_type filter_type = flow->filter_type;
struct i40e_fdir_info *fdir_info = &pf->fdir;
int ret = 0;
+   int i;
 
switch (filter_type) {
case RTE_ETH_FILTER_ETHERTYPE:
@@ -5299,9 +5300,13 @@ i40e_flow_destroy(struct rte_eth_dev *dev,
&((struct i40e_fdir_filter *)flow->rule)->fdir,
0);
 
-   /* If the last flow is destroyed, disable fdir. */
+   /* When the last flow is destroyed. */
if (!ret && TAILQ_EMPTY(&pf->fdir.fdir_list)) {
+   /* Disable FDIR processing. */
i40e_fdir_rx_proc_enable(dev, 0);
+   /* Reset the flex_pit_flag. */
+   for (i = 0; i < I40E_MAX_FLXPLD_LAYER; i++)
+   pf->fdir.flex_pit_flag[i] = 0;
}
break;
case RTE_ETH_FILTER_HASH:
@@ -5515,6 +5520,9 @@ i40e_flow_flush_fdir_filter(struct i40e_pf *pf)
pf->fdir.flex_mask_flag[pctype] = 0;
}
 
+   for (i = 0; i < I40E_MAX_FLXPLD_LAYER; i++)
+   pf->fdir.flex_pit_flag[i] = 0;
+
/* Disable FDIR processing as all FDIR rules are now flushed */
i40e_fdir_rx_proc_enable(dev, 0);
}
-- 
2.26.2



[dpdk-dev] [PATCH v2 1/3] doc: fix testpmd command for i40e RSS flow

2020-12-27 Thread Zhang,Alvin
From: Alvin Zhang 

The command here does not create a queue region, but only sets the
lookup table, so the description in the doc is not exact.

Signed-off-by: Alvin Zhang 

Fixes: feaae285b342 ("net/i40e: support hash configuration in RSS flow")
Cc: sta...@dpdk.org
---

V2: Split the patch into a three-patch series and delete two unused
functions

---
 doc/guides/nics/i40e.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/guides/nics/i40e.rst b/doc/guides/nics/i40e.rst
index 4e5c4679b..64f20e7da 100644
--- a/doc/guides/nics/i40e.rst
+++ b/doc/guides/nics/i40e.rst
@@ -562,9 +562,9 @@ Generic flow API
 - ``RSS Flow``
 
   RSS Flow supports to set hash input set, hash function, enable hash
-  and configure queue region.
+  and configure queues.
   For example:
-  Configure queue region as queue 0, 1, 2, 3.
+  Configure queues as queue 0, 1, 2, 3.
 
   .. code-block:: console
 
-- 
2.21.0.windows.1



[dpdk-dev] [PATCH v2 2/3] net/i40e: fix return value

2020-12-27 Thread Zhang,Alvin
From: Alvin Zhang 

The API should return a system error status, but it returned the
hardware error status, which is confusing for the caller.
This patch adds a check on the hardware execution status and returns -EIO
in case of hardware execution failure.
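
A caller-side sketch of the intended behaviour (hypothetical snippet, not
part of the patch):

    ret = i40e_set_rss_lut(vsi, lut, lut_size);
    if (ret == -EINVAL)         /* bad arguments (NULL vsi or lut) */
        PMD_DRV_LOG(ERR, "invalid RSS LUT arguments");
    else if (ret == -EIO)       /* AdminQ/hardware rejected the update */
        PMD_DRV_LOG(ERR, "hardware rejected the RSS LUT update");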

Signed-off-by: Alvin Zhang 

Fixes: 1d4b2b4966bb ("net/i40e: fix VF overwrite PF RSS LUT for X722")
Fixes: d0a349409bd7 ("i40e: support AQ based RSS config")
Cc: sta...@dpdk.org
---
 drivers/net/i40e/i40e_ethdev.c | 33 -
 1 file changed, 20 insertions(+), 13 deletions(-)

diff --git a/drivers/net/i40e/i40e_ethdev.c b/drivers/net/i40e/i40e_ethdev.c
index f54769c29..20340084b 100644
--- a/drivers/net/i40e/i40e_ethdev.c
+++ b/drivers/net/i40e/i40e_ethdev.c
@@ -4426,7 +4426,6 @@ i40e_set_rss_lut(struct i40e_vsi *vsi, uint8_t *lut, uint16_t lut_size)
 {
struct i40e_pf *pf;
struct i40e_hw *hw;
-   int ret;
 
if (!vsi || !lut)
return -EINVAL;
@@ -4435,12 +4434,16 @@ i40e_set_rss_lut(struct i40e_vsi *vsi, uint8_t *lut, uint16_t lut_size)
hw = I40E_VSI_TO_HW(vsi);
 
if (pf->flags & I40E_FLAG_RSS_AQ_CAPABLE) {
-   ret = i40e_aq_set_rss_lut(hw, vsi->vsi_id,
- vsi->type != I40E_VSI_SRIOV,
- lut, lut_size);
-   if (ret) {
-   PMD_DRV_LOG(ERR, "Failed to set RSS lookup table");
-   return ret;
+   enum i40e_status_code status;
+
+   status = i40e_aq_set_rss_lut(hw, vsi->vsi_id,
+vsi->type != I40E_VSI_SRIOV,
+lut, lut_size);
+   if (status) {
+   PMD_DRV_LOG(ERR,
+   "Failed to update RSS lookup table, error 
status: %d",
+   status);
+   return -EIO;
}
} else {
uint32_t *lut_dw = (uint32_t *)lut;
@@ -7591,7 +7594,6 @@ i40e_set_rss_key(struct i40e_vsi *vsi, uint8_t *key, uint8_t key_len)
uint16_t key_idx = (vsi->type == I40E_VSI_SRIOV) ?
   I40E_VFQF_HKEY_MAX_INDEX :
   I40E_PFQF_HKEY_MAX_INDEX;
-   int ret = 0;
 
if (!key || key_len == 0) {
PMD_DRV_LOG(DEBUG, "No key to be configured");
@@ -7604,11 +7606,16 @@ i40e_set_rss_key(struct i40e_vsi *vsi, uint8_t *key, uint8_t key_len)
 
if (pf->flags & I40E_FLAG_RSS_AQ_CAPABLE) {
struct i40e_aqc_get_set_rss_key_data *key_dw =
-   (struct i40e_aqc_get_set_rss_key_data *)key;
+   (struct i40e_aqc_get_set_rss_key_data *)key;
+   enum i40e_status_code status =
+   i40e_aq_set_rss_key(hw, vsi->vsi_id, key_dw);
 
-   ret = i40e_aq_set_rss_key(hw, vsi->vsi_id, key_dw);
-   if (ret)
-   PMD_INIT_LOG(ERR, "Failed to configure RSS key via AQ");
+   if (status) {
+   PMD_DRV_LOG(ERR,
+   "Failed to configure RSS key via AQ, error 
status: %d",
+   status);
+   return -EIO;
+   }
} else {
uint32_t *hash_key = (uint32_t *)key;
uint16_t i;
@@ -7628,7 +7635,7 @@ i40e_set_rss_key(struct i40e_vsi *vsi, uint8_t *key, uint8_t key_len)
I40E_WRITE_FLUSH(hw);
}
 
-   return ret;
+   return 0;
 }
 
 static int
-- 
2.21.0.windows.1



[dpdk-dev] [PATCH v2 3/3] net/i40e: refactor RSS flow

2020-12-27 Thread Zhang,Alvin
From: Alvin Zhang 

1. Delete the original code.
2. Add two tables (one maps flow pattern and RSS type to PCTYPE,
   the other maps RSS type to input set); see the sketch after this list.
3. Parse the RSS pattern and RSS type to get the PCTYPE.
4. Parse the RSS action to get queues, RSS function and hash field.
5. Create and destroy RSS filters.
6. Create new files for the hash flows.
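
Editorial sketch of the idea behind the tables in item 2 (names and layout
are illustrative, not the patch's actual definitions):

    struct hash_map_entry {
        uint32_t pattern_id;   /* hypothetical pattern identifier      */
        uint64_t rss_types;    /* ETH_RSS_* bits requested by the user */
        uint32_t pctype;       /* resolved I40E_FILTER_PCTYPE_* value  */
    };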

Signed-off-by: Alvin Zhang 
---
 drivers/net/i40e/i40e_ethdev.c |  905 -
 drivers/net/i40e/i40e_ethdev.h |   53 +-
 drivers/net/i40e/i40e_flow.c   |  617 +-
 drivers/net/i40e/i40e_hash.c   | 1380 
 drivers/net/i40e/i40e_hash.h   |   34 +
 drivers/net/i40e/meson.build   |1 +
 6 files changed, 1622 insertions(+), 1368 deletions(-)
 create mode 100644 drivers/net/i40e/i40e_hash.c
 create mode 100644 drivers/net/i40e/i40e_hash.h

diff --git a/drivers/net/i40e/i40e_ethdev.c b/drivers/net/i40e/i40e_ethdev.c
index 20340084b..b8c2cf3b3 100644
--- a/drivers/net/i40e/i40e_ethdev.c
+++ b/drivers/net/i40e/i40e_ethdev.c
@@ -39,6 +39,7 @@
 #include "i40e_pf.h"
 #include "i40e_regs.h"
 #include "rte_pmd_i40e.h"
+#include "i40e_hash.h"
 
 #define ETH_I40E_FLOATING_VEB_ARG  "enable_floating_veb"
 #define ETH_I40E_FLOATING_VEB_LIST_ARG "floating_veb_list"
@@ -396,7 +397,6 @@ static void i40e_ethertype_filter_restore(struct i40e_pf *pf);
 static void i40e_tunnel_filter_restore(struct i40e_pf *pf);
 static void i40e_filter_restore(struct i40e_pf *pf);
 static void i40e_notify_all_vfs_link_status(struct rte_eth_dev *dev);
-static int i40e_pf_config_rss(struct i40e_pf *pf);
 
 static const char *const valid_keys[] = {
ETH_I40E_FLOATING_VEB_ARG,
@@ -1764,10 +1764,6 @@ eth_i40e_dev_init(struct rte_eth_dev *dev, void *init_params __rte_unused)
/* initialize queue region configuration */
i40e_init_queue_region_conf(dev);
 
-   /* initialize RSS configuration from rte_flow */
-   memset(&pf->rss_info, 0,
-   sizeof(struct i40e_rte_flow_rss_conf));
-
/* reset all stats of the device, including pf and main vsi */
i40e_dev_stats_reset(dev);
 
@@ -7576,7 +7572,7 @@ i40e_parse_hena(const struct i40e_adapter *adapter, uint64_t flags)
 }
 
 /* Disable RSS */
-static void
+void
 i40e_pf_disable_rss(struct i40e_pf *pf)
 {
struct i40e_hw *hw = I40E_PF_TO_HW(pf);
@@ -8789,7 +8785,7 @@ i40e_dev_udp_tunnel_port_del(struct rte_eth_dev *dev,
 }
 
 /* Calculate the maximum number of contiguous PF queues that are configured */
-static int
+int
 i40e_pf_calc_configured_queues_num(struct i40e_pf *pf)
 {
struct rte_eth_dev_data *data = pf->dev_data;
@@ -8808,19 +8804,72 @@ i40e_pf_calc_configured_queues_num(struct i40e_pf *pf)
return num;
 }
 
-/* Configure RSS */
-static int
-i40e_pf_config_rss(struct i40e_pf *pf)
+/* Reset the global configure of hash function and input sets */
+static void
+i40e_pf_global_rss_reset(struct i40e_pf *pf)
 {
-   enum rte_eth_rx_mq_mode mq_mode = pf->dev_data->dev_conf.rxmode.mq_mode;
struct i40e_hw *hw = I40E_PF_TO_HW(pf);
-   struct rte_eth_rss_conf rss_conf;
-   uint32_t i, lut = 0;
-   uint16_t j, num;
+   uint32_t reg, reg_val;
+   int i;
 
-   /*
-* If both VMDQ and RSS enabled, not all of PF queues are configured.
-* It's necessary to calculate the actual PF queues that are configured.
+   /* Reset global RSS function sets */
+   reg_val = i40e_read_rx_ctl(hw, I40E_GLQF_CTL);
+   if (!(reg_val & I40E_GLQF_CTL_HTOEP_MASK)) {
+   reg_val |= I40E_GLQF_CTL_HTOEP_MASK;
+   i40e_write_global_rx_ctl(hw, I40E_GLQF_CTL, reg_val);
+   }
+
+   for (i = 0; i <= I40E_FILTER_PCTYPE_L2_PAYLOAD; i++) {
+   uint64_t inset;
+   int j, pctype;
+
+   if (hw->mac.type == I40E_MAC_X722)
+   pctype = i40e_read_rx_ctl(hw, I40E_GLQF_FD_PCTYPES(i));
+   else
+   pctype = i;
+
+   /* Reset pctype insets */
+   inset = i40e_get_default_input_set(i);
+   if (inset) {
+   pf->hash_input_set[pctype] = inset;
+   inset = i40e_translate_input_set_reg(hw->mac.type,
+inset);
+
+   reg = I40E_GLQF_HASH_INSET(0, pctype);
+   i40e_check_write_global_reg(hw, reg, (uint32_t)inset);
+   reg = I40E_GLQF_HASH_INSET(1, pctype);
+   i40e_check_write_global_reg(hw, reg,
+   (uint32_t)(inset >> 32));
+
+   /* Clear unused mask registers of the pctype */
+   for (j = 0; j < I40E_INSET_MASK_NUM_REG; j++) {
+   reg = I40E_GLQF_HASH_MSK(j, pctype);
+   i40e_check_write_global_reg(hw, reg, 0);
+   }
+   }
+
+  

Re: [dpdk-dev] [PATCH v2] net/i40e: fix flex payload rule conflict issue

2020-12-27 Thread Guo, Jia
Hi, beilei

> -Original Message-
> From: Xing, Beilei 
> Sent: Tuesday, December 29, 2020 2:18 PM
> To: Guo, Jia ; dev@dpdk.org
> Cc: Xing, Beilei ; sta...@dpdk.org; Sun, Chenmin
> 
> Subject: [PATCH v2] net/i40e: fix flex payload rule conflict issue
> 
> From: Beilei Xing 
> 
> With the following commands, the second flow can't be created successfully.
> 
> 1. flow create 0 ingress pattern eth / ipv4 / udp /
>raw relative is 1 pattern is 0102030405 / end
>actions drop / end
> 2. flow destroy 0 rule 0
> 3. flow create 0 ingress pattern eth / ipv4 / udp /
>raw relative is 1 pattern is 010203040506 / end
>actions drop / end
> 
> The root cause is that a flag for flex pit isn't reset.
> 
> Fixes: 6ced3dd72f5f ("net/i40e: support flexible payload parsing for FDIR")
> Cc: sta...@dpdk.org
> 
> Reported-by: Chenmin Sun
> Signed-off-by: Beilei Xing 
> ---
> 
> v2 changes:
>  - Add fix line.
>  - Refine comments.
> 
>  drivers/net/i40e/i40e_flow.c | 10 +-
>  1 file changed, 9 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/net/i40e/i40e_flow.c b/drivers/net/i40e/i40e_flow.c
> index b09ff6590d..65e0b69356 100644
> --- a/drivers/net/i40e/i40e_flow.c
> +++ b/drivers/net/i40e/i40e_flow.c
> @@ -5284,6 +5284,7 @@ i40e_flow_destroy(struct rte_eth_dev *dev,
>   enum rte_filter_type filter_type = flow->filter_type;
>   struct i40e_fdir_info *fdir_info = &pf->fdir;
>   int ret = 0;
> + int i;
> 
>   switch (filter_type) {
>   case RTE_ETH_FILTER_ETHERTYPE:
> @@ -5299,9 +5300,13 @@ i40e_flow_destroy(struct rte_eth_dev *dev,
>   &((struct i40e_fdir_filter *)flow->rule)->fdir,
>   0);
> 
> - /* If the last flow is destroyed, disable fdir. */
> + /* When the last flow is destroyed. */
>   if (!ret && TAILQ_EMPTY(&pf->fdir.fdir_list)) {
> + /* Disable FDIR processing. */
>   i40e_fdir_rx_proc_enable(dev, 0);
> + /* Reset the flex_pit_flag. */
> + for (i = 0; i < I40E_MAX_FLXPLD_LAYER; i++)
> + pf->fdir.flex_pit_flag[i] = 0;

Why reset all flex_pit_flag entries only when the last flow is destroyed?
When another flow is destroyed, shouldn't the corresponding flex_pit_flag
that was set when that flow was added also be reset?
And I think resetting the flag should come before disabling FDIR.

>   }
>   break;
>   case RTE_ETH_FILTER_HASH:
> @@ -5515,6 +5520,9 @@ i40e_flow_flush_fdir_filter(struct i40e_pf *pf)
>   pf->fdir.flex_mask_flag[pctype] = 0;
>   }
> 
> + for (i = 0; i < I40E_MAX_FLXPLD_LAYER; i++)
> + pf->fdir.flex_pit_flag[i] = 0;
> +
>   /* Disable FDIR processing as all FDIR rules are now flushed
> */
>   i40e_fdir_rx_proc_enable(dev, 0);
>   }
> --
> 2.26.2



[dpdk-dev] [PATCH v5 0/2] examples/vhost: sample code refactor

2020-12-27 Thread Cheng Jiang
Refactor the vhost sample code. Add an IOAT ring space count and check
in the IOAT callback, optimize the vhost data path for batch enqueue, replace
rte_atomicNN_xxx with atomic_XXX and refactor the vhost async data path.
---
v5:
 * added freeing of the vhost enqueue buffers when a vhost device is destroyed
 * added an rte_ioat_completed_ops() failure handler
 * changed the behavior of the drain_vhost_table() function
 * changed some variable names
 * changed some variable definitions
 * added an rte_zmalloc() failure handler
 * added some comments
 * fixed some typos

v4:
 * improved code structure
 * improved vhost enqueue buffer memory allocation
 * cleaned up some code

v3:
 * added some variable initialization
 * cleaned up some code

v2:
 * optimized patch structure
 * optimized git log
 * replaced rte_atomicNN_xxx with atomic_XXX

Cheng Jiang (2):
  examples/vhost: add ioat ring space count and check
  examples/vhost: refactor vhost data path

 examples/vhost/ioat.c |  22 +++--
 examples/vhost/main.c | 214 --
 examples/vhost/main.h |   7 +-
 3 files changed, 178 insertions(+), 65 deletions(-)

--
2.29.2



[dpdk-dev] [PATCH v5 1/2] examples/vhost: add ioat ring space count and check

2020-12-27 Thread Cheng Jiang
Add an IOAT ring space count and check: if the IOAT ring does not have enough
space for the next async vhost packet enqueue, return early to prevent an
enqueue failure. Also add an rte_ioat_completed_ops() failure handler.
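
Conceptually the bookkeeping works like this (editorial sketch using the
names from the patch):

    /* reserve ring space when enqueueing a packet's segments */
    if (cb_tracker[dev_id].ioat_space < src->nr_segs)
        break;                               /* ring full: stop for now */
    cb_tracker[dev_id].ioat_space -= src->nr_segs;

    /* ... and give the space back once the copies complete */
    n_seg = rte_ioat_completed_ops(dev_id, 255, dump, dump);
    if (n_seg > 0)
        cb_tracker[dev_id].ioat_space += n_seg;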

Signed-off-by: Cheng Jiang 
---
 examples/vhost/ioat.c | 22 --
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/examples/vhost/ioat.c b/examples/vhost/ioat.c
index 71d8a1f1f..679d1e2f5 100644
--- a/examples/vhost/ioat.c
+++ b/examples/vhost/ioat.c
@@ -17,6 +17,7 @@ struct packet_tracker {
unsigned short next_read;
unsigned short next_write;
unsigned short last_remain;
+   unsigned short ioat_space;
 };
 
 struct packet_tracker cb_tracker[MAX_VHOST_DEVICE];
@@ -113,7 +114,7 @@ open_ioat(const char *value)
goto out;
}
rte_rawdev_start(dev_id);
-
+   cb_tracker[dev_id].ioat_space = IOAT_RING_SIZE;
dma_info->nr++;
i++;
}
@@ -140,13 +141,9 @@ ioat_transfer_data_cb(int vid, uint16_t queue_id,
src = descs[i_desc].src;
dst = descs[i_desc].dst;
i_seg = 0;
+   if (cb_tracker[dev_id].ioat_space < src->nr_segs)
+   break;
while (i_seg < src->nr_segs) {
-   /*
-* TODO: Assuming that the ring space of the
-* IOAT device is large enough, so there is no
-* error here, and the actual error handling
-* will be added later.
-*/
rte_ioat_enqueue_copy(dev_id,
(uintptr_t)(src->iov[i_seg].iov_base)
+ src->offset,
@@ -158,7 +155,8 @@ ioat_transfer_data_cb(int vid, uint16_t queue_id,
i_seg++;
}
write &= mask;
-   cb_tracker[dev_id].size_track[write] = i_seg;
+   cb_tracker[dev_id].size_track[write] = src->nr_segs;
+   cb_tracker[dev_id].ioat_space -= src->nr_segs;
write++;
}
} else {
@@ -178,17 +176,21 @@ ioat_check_completed_copies_cb(int vid, uint16_t queue_id,
 {
if (!opaque_data) {
uintptr_t dump[255];
-   unsigned short n_seg;
+   int n_seg;
unsigned short read, write;
unsigned short nb_packet = 0;
unsigned short mask = MAX_ENQUEUED_SIZE - 1;
unsigned short i;
+
int dev_id = dma_bind[vid].dmas[queue_id * 2
+ VIRTIO_RXQ].dev_id;
n_seg = rte_ioat_completed_ops(dev_id, 255, dump, dump);
-   n_seg += cb_tracker[dev_id].last_remain;
if (!n_seg)
return 0;
+
+   cb_tracker[dev_id].ioat_space += n_seg;
+   n_seg += cb_tracker[dev_id].last_remain;
+
read = cb_tracker[dev_id].next_read;
write = cb_tracker[dev_id].next_write;
for (i = 0; i < max_packets; i++) {
-- 
2.29.2



[dpdk-dev] [PATCH v5 2/2] examples/vhost: refactor vhost data path

2020-12-27 Thread Cheng Jiang
Change the vm2vm data path to batch enqueue for better performance.
Support the latest async vhost API, refactor the vhost async data path,
replace rte_atomicNN_xxx with atomic_XXX and clean up some code.
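
The batching scheme can be summarised as follows (editorial sketch; names
follow the patch):

    /* one TX buffer per (lcore, vhost device) pair */
    uint64_t buff_idx = rte_lcore_id() * MAX_VHOST_DEVICE + vdev->vid;
    struct vhost_bufftable *buf = vhost_txbuff[buff_idx];

    buf->m_table[buf->len++] = m;     /* stage the mbuf                      */
    if (buf->len == MAX_PKT_BURST)    /* flush on a full burst ...           */
        drain_vhost(vdev);            /* ... or when the drain timer expires */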

Signed-off-by: Cheng Jiang 
---
 examples/vhost/main.c | 214 --
 examples/vhost/main.h |   7 +-
 2 files changed, 166 insertions(+), 55 deletions(-)

diff --git a/examples/vhost/main.c b/examples/vhost/main.c
index 8d8c3038b..45976c93c 100644
--- a/examples/vhost/main.c
+++ b/examples/vhost/main.c
@@ -179,9 +179,22 @@ struct mbuf_table {
struct rte_mbuf *m_table[MAX_PKT_BURST];
 };
 
+struct vhost_bufftable {
+   uint32_t len;
+   uint64_t pre_tsc;
+   struct rte_mbuf *m_table[MAX_PKT_BURST];
+};
+
 /* TX queue for each data core. */
 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
 
+/*
+ * Vhost TX buffer for each data core.
+ * Every data core maintains a TX buffer for every vhost device,
+ * which is used for batch pkts enqueue for higher performance.
+ */
+struct vhost_bufftable *vhost_txbuff[RTE_MAX_LCORE * MAX_VHOST_DEVICE];
+
 #define MBUF_TABLE_DRAIN_TSC   ((rte_get_tsc_hz() + US_PER_S - 1) \
 / US_PER_S * BURST_TX_DRAIN_US)
 #define VLAN_HLEN   4
@@ -804,39 +817,112 @@ unlink_vmdq(struct vhost_dev *vdev)
}
 }
 
+static inline void
+free_pkts(struct rte_mbuf **pkts, uint16_t n)
+{
+   while (n--)
+   rte_pktmbuf_free(pkts[n]);
+}
+
 static __rte_always_inline void
-virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
+complete_async_pkts(struct vhost_dev *vdev)
+{
+   struct rte_mbuf *p_cpl[MAX_PKT_BURST];
+   uint16_t complete_count;
+
+   complete_count = rte_vhost_poll_enqueue_completed(vdev->vid,
+   VIRTIO_RXQ, p_cpl, MAX_PKT_BURST);
+   if (complete_count) {
+   atomic_fetch_sub(&vdev->nr_async_pkts, complete_count);
+   free_pkts(p_cpl, complete_count);
+   }
+}
+
+static __rte_always_inline void
+sync_virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
struct rte_mbuf *m)
 {
uint16_t ret;
-   struct rte_mbuf *m_cpl[1];
 
if (builtin_net_driver) {
ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
-   } else if (async_vhost_driver) {
-   ret = rte_vhost_submit_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ,
-   &m, 1);
-
-   if (likely(ret))
-   dst_vdev->nr_async_pkts++;
-
-   while (likely(dst_vdev->nr_async_pkts)) {
-   if (rte_vhost_poll_enqueue_completed(dst_vdev->vid,
-   VIRTIO_RXQ, m_cpl, 1))
-   dst_vdev->nr_async_pkts--;
-   }
} else {
ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
}
 
if (enable_stats) {
-   rte_atomic64_inc(&dst_vdev->stats.rx_total_atomic);
-   rte_atomic64_add(&dst_vdev->stats.rx_atomic, ret);
+   atomic_fetch_add(&dst_vdev->stats.rx_total_atomic, 1);
+   atomic_fetch_add(&dst_vdev->stats.rx_atomic, ret);
src_vdev->stats.tx_total++;
src_vdev->stats.tx += ret;
}
 }
 
+static __rte_always_inline void
+drain_vhost(struct vhost_dev *vdev)
+{
+   uint16_t ret;
+   uint64_t buff_idx = rte_lcore_id() * MAX_VHOST_DEVICE + vdev->vid;
+   uint16_t nr_xmit = vhost_txbuff[buff_idx]->len;
+   struct rte_mbuf **m = vhost_txbuff[buff_idx]->m_table;
+
+   if (builtin_net_driver) {
+   ret = vs_enqueue_pkts(vdev, VIRTIO_RXQ, m, nr_xmit);
+   } else if (async_vhost_driver) {
+   uint32_t cpu_cpl_nr = 0;
+   uint16_t enqueue_fail = 0;
+   struct rte_mbuf *m_cpu_cpl[nr_xmit];
+
+   complete_async_pkts(vdev);
+   ret = rte_vhost_submit_enqueue_burst(vdev->vid, VIRTIO_RXQ,
+   m, nr_xmit, m_cpu_cpl, &cpu_cpl_nr);
+   atomic_fetch_add(&vdev->nr_async_pkts, ret - cpu_cpl_nr);
+
+   if (cpu_cpl_nr)
+   free_pkts(m_cpu_cpl, cpu_cpl_nr);
+
+   enqueue_fail = nr_xmit - ret;
+   if (enqueue_fail)
+   free_pkts(&m[ret], nr_xmit - ret);
+   } else {
+   ret = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
+   m, nr_xmit);
+   }
+
+   if (enable_stats) {
+   atomic_fetch_add(&vdev->stats.rx_total_atomic, nr_xmit);
+   atomic_fetch_add(&vdev->stats.rx_atomic, ret);
+   }
+
+   if (!async_vhost_driver)
+   free_pkts(m, nr_xmit);
+}
+
+static __rte_always_inline void
+drain_vhost_table(void)
+{
+   uint16_t lcore_id = rte_lcore_id();
+   struct vhost_bufftable *v