[v1 0/4] Add support for modifying ECN in IPv4/IPv6 header
This patch set adds support for modifying the ECN field in the IPv4/IPv6 header, and also adds support for the modify_field action in meter. Jiawei Wang (1): ethdev: add IPv4/IPv6 ECN header rewrite action Sean Zhang (3): common/mlx5: add modify ECN capability check net/mlx5: add support to modify ECN field net/mlx5: add modify field support in meter app/test-pmd/cmdline_flow.c | 3 +- doc/guides/nics/mlx5.rst | 4 +- drivers/common/mlx5/mlx5_devx_cmds.c | 3 ++ drivers/common/mlx5/mlx5_devx_cmds.h | 1 + drivers/common/mlx5/mlx5_prm.h | 62 - drivers/net/mlx5/mlx5_flow.c | 5 +- drivers/net/mlx5/mlx5_flow.h | 2 + drivers/net/mlx5/mlx5_flow_dv.c | 69 ++-- drivers/net/mlx5/mlx5_flow_meter.c | 2 +- lib/ethdev/rte_flow.h| 2 + 10 files changed, 143 insertions(+), 10 deletions(-) -- 2.31.1
[v1 2/4] common/mlx5: add modify ECN capability check
Flag outer_ip_ecn in header modify capabilities properties layout is added in order to check if the firmware supports modification of ecn field. Signed-off-by: Sean Zhang --- drivers/common/mlx5/mlx5_devx_cmds.c | 3 ++ drivers/common/mlx5/mlx5_devx_cmds.h | 1 + drivers/common/mlx5/mlx5_prm.h | 62 +++- 3 files changed, 65 insertions(+), 1 deletion(-) diff --git a/drivers/common/mlx5/mlx5_devx_cmds.c b/drivers/common/mlx5/mlx5_devx_cmds.c index d02ac2a678..72296c1ca3 100644 --- a/drivers/common/mlx5/mlx5_devx_cmds.c +++ b/drivers/common/mlx5/mlx5_devx_cmds.c @@ -1047,6 +1047,9 @@ mlx5_devx_cmd_query_hca_attr(void *ctx, attr->flow.tunnel_header_2_3 = MLX5_GET (flow_table_nic_cap, hcattr, ft_field_support_2_nic_receive.tunnel_header_2_3); + attr->modify_outer_ip_ecn = MLX5_GET + (flow_table_nic_cap, hcattr, +ft_header_modify_nic_receive.outer_ip_ecn); attr->pkt_integrity_match = mlx5_devx_query_pkt_integrity_match(hcattr); attr->inner_ipv4_ihl = MLX5_GET (flow_table_nic_cap, hcattr, diff --git a/drivers/common/mlx5/mlx5_devx_cmds.h b/drivers/common/mlx5/mlx5_devx_cmds.h index 1bac18c59d..6e9e23f593 100644 --- a/drivers/common/mlx5/mlx5_devx_cmds.h +++ b/drivers/common/mlx5/mlx5_devx_cmds.h @@ -256,6 +256,7 @@ struct mlx5_hca_attr { uint32_t esw_mgr_vport_id_valid:1; /* E-Switch Mgr vport ID is valid. */ uint16_t esw_mgr_vport_id; /* E-Switch Mgr vport ID . */ uint16_t max_wqe_sz_sq; + uint32_t modify_outer_ip_ecn:1; }; /* LAG Context. */ diff --git a/drivers/common/mlx5/mlx5_prm.h b/drivers/common/mlx5/mlx5_prm.h index 44b18225f6..caaa4c7fb9 100644 --- a/drivers/common/mlx5/mlx5_prm.h +++ b/drivers/common/mlx5/mlx5_prm.h @@ -744,6 +744,7 @@ enum mlx5_modification_field { MLX5_MODI_OUT_TCP_ACK_NUM, MLX5_MODI_IN_TCP_ACK_NUM = 0x5C, MLX5_MODI_GTP_TEID = 0x6E, + MLX5_MODI_OUT_IP_ECN = 0x73, }; /* Total number of metadata reg_c's. 
*/ @@ -1884,6 +1885,62 @@ struct mlx5_ifc_roce_caps_bits { u8 reserved_at_20[0x7e0]; }; +struct mlx5_ifc_ft_fields_support_bits { + u8 outer_dmac[0x1]; + u8 outer_smac[0x1]; + u8 outer_ether_type[0x1]; + u8 reserved_at_3[0x1]; + u8 outer_first_prio[0x1]; + u8 outer_first_cfi[0x1]; + u8 outer_first_vid[0x1]; + u8 reserved_at_7[0x1]; + u8 outer_second_prio[0x1]; + u8 outer_second_cfi[0x1]; + u8 outer_second_vid[0x1]; + u8 reserved_at_b[0x1]; + u8 outer_sip[0x1]; + u8 outer_dip[0x1]; + u8 outer_frag[0x1]; + u8 outer_ip_protocol[0x1]; + u8 outer_ip_ecn[0x1]; + u8 outer_ip_dscp[0x1]; + u8 outer_udp_sport[0x1]; + u8 outer_udp_dport[0x1]; + u8 outer_tcp_sport[0x1]; + u8 outer_tcp_dport[0x1]; + u8 outer_tcp_flags[0x1]; + u8 outer_gre_protocol[0x1]; + u8 outer_gre_key[0x1]; + u8 outer_vxlan_vni[0x1]; + u8 reserved_at_1a[0x5]; + u8 source_eswitch_port[0x1]; + u8 inner_dmac[0x1]; + u8 inner_smac[0x1]; + u8 inner_ether_type[0x1]; + u8 reserved_at_23[0x1]; + u8 inner_first_prio[0x1]; + u8 inner_first_cfi[0x1]; + u8 inner_first_vid[0x1]; + u8 reserved_at_27[0x1]; + u8 inner_second_prio[0x1]; + u8 inner_second_cfi[0x1]; + u8 inner_second_vid[0x1]; + u8 reserved_at_2b[0x1]; + u8 inner_sip[0x1]; + u8 inner_dip[0x1]; + u8 inner_frag[0x1]; + u8 inner_ip_protocol[0x1]; + u8 inner_ip_ecn[0x1]; + u8 inner_ip_dscp[0x1]; + u8 inner_udp_sport[0x1]; + u8 inner_udp_dport[0x1]; + u8 inner_tcp_sport[0x1]; + u8 inner_tcp_dport[0x1]; + u8 inner_tcp_flags[0x1]; + u8 reserved_at_37[0x9]; + u8 reserved_at_40[0x40]; +}; + /* * Table 1872 - Flow Table Fields Supported 2 Format */ @@ -1923,7 +1980,10 @@ struct mlx5_ifc_flow_table_nic_cap_bits { flow_table_properties_nic_transmit_rdma; struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_transmit_sniffer; - u8 reserved_at_e00[0x600]; + u8 reserved_at_e00[0x200]; + struct mlx5_ifc_ft_fields_support_bits + ft_header_modify_nic_receive; + u8 reserved_at_1080[0x380]; struct mlx5_ifc_ft_fields_support_2_bits ft_field_support_2_nic_receive; }; -- 2.31.1
[v1 3/4] net/mlx5: add support to modify ECN field
This patch is to support modify ECN field in IPv4/IPv6 header. Signed-off-by: Sean Zhang --- drivers/net/mlx5/mlx5_flow_dv.c | 20 1 file changed, 20 insertions(+) diff --git a/drivers/net/mlx5/mlx5_flow_dv.c b/drivers/net/mlx5/mlx5_flow_dv.c index 1e9bd63635..e416eb5701 100644 --- a/drivers/net/mlx5/mlx5_flow_dv.c +++ b/drivers/net/mlx5/mlx5_flow_dv.c @@ -1449,6 +1449,9 @@ mlx5_flow_item_field_width(struct rte_eth_dev *dev, case RTE_FLOW_FIELD_POINTER: case RTE_FLOW_FIELD_VALUE: return inherit < 0 ? 0 : inherit; + case RTE_FLOW_FIELD_IPV4_ECN: + case RTE_FLOW_FIELD_IPV6_ECN: + return 2; default: MLX5_ASSERT(false); } @@ -1826,6 +1829,13 @@ mlx5_flow_field_id_to_modify_info (meta_count - width)) & meta_mask); } break; + case RTE_FLOW_FIELD_IPV4_ECN: + case RTE_FLOW_FIELD_IPV6_ECN: + info[idx] = (struct field_modify_info){1, 0, + MLX5_MODI_OUT_IP_ECN}; + if (mask) + mask[idx] = 0x3 >> (2 - width); + break; case RTE_FLOW_FIELD_POINTER: case RTE_FLOW_FIELD_VALUE: default: @@ -4825,6 +4835,7 @@ flow_dv_validate_action_modify_field(struct rte_eth_dev *dev, int ret = 0; struct mlx5_priv *priv = dev->data->dev_private; struct mlx5_sh_config *config = &priv->sh->config; + struct mlx5_hca_attr *hca_attr = &priv->sh->cdev->config.hca_attr; const struct rte_flow_action_modify_field *action_modify_field = action->conf; uint32_t dst_width = mlx5_flow_item_field_width(dev, @@ -4952,6 +4963,15 @@ flow_dv_validate_action_modify_field(struct rte_eth_dev *dev, RTE_FLOW_ERROR_TYPE_ACTION, action, "add and sub operations" " are not supported"); + if (action_modify_field->dst.field == RTE_FLOW_FIELD_IPV4_ECN || + action_modify_field->src.field == RTE_FLOW_FIELD_IPV4_ECN || + action_modify_field->dst.field == RTE_FLOW_FIELD_IPV6_ECN || + action_modify_field->src.field == RTE_FLOW_FIELD_IPV6_ECN) + if (!hca_attr->modify_outer_ip_ecn && + !attr->transfer && !attr->group) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, action, + "modifications of the ECN for current firmware is not supported"); return (action_modify_field->width / 32) + !!(action_modify_field->width % 32); } -- 2.31.1
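For reference on the arithmetic in the diff above: ECN is a 2-bit field (the two least-significant bits of the IPv4 TOS / IPv6 Traffic Class byte), which is why the width helper returns 2 and the modify mask is derived from 0x3. A minimal standalone C sketch of the mask computation, not driver code:

#include <stdint.h>
#include <stdio.h>

/* ECN is a 2-bit field; the driver derives the modify mask from 0x3. */
int
main(void)
{
	for (uint32_t width = 1; width <= 2; width++) {
		uint8_t mask = 0x3 >> (2 - width);
		/* width = 1 -> mask 0x1, width = 2 -> mask 0x3 */
		printf("width %u -> mask 0x%x\n", width, mask);
	}
	return 0;
}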
[v1 4/4] net/mlx5: add modify field support in meter
This patch introduces MODIFY_FIELD action support in meter. User can create meter policy with MODIFY_FIELD action in green/yellow action. For example: testpmd> add port meter policy 0 21 g_actions modify_field op set dst_type ipv4_ecn src_type value src_value 3 width 2 / ... Signed-off-by: Sean Zhang --- doc/guides/nics/mlx5.rst | 4 +-- drivers/net/mlx5/mlx5_flow.c | 5 ++- drivers/net/mlx5/mlx5_flow.h | 2 ++ drivers/net/mlx5/mlx5_flow_dv.c| 49 +++--- drivers/net/mlx5/mlx5_flow_meter.c | 2 +- 5 files changed, 54 insertions(+), 8 deletions(-) diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst index 4805d08a76..b6d51dc7c9 100644 --- a/doc/guides/nics/mlx5.rst +++ b/doc/guides/nics/mlx5.rst @@ -443,8 +443,8 @@ Limitations - yellow: NULL or END. - RED: DROP / END. - The only supported meter policy actions: - - green: QUEUE, RSS, PORT_ID, REPRESENTED_PORT, JUMP, DROP, MARK and SET_TAG. - - yellow: QUEUE, RSS, PORT_ID, REPRESENTED_PORT, JUMP, DROP, MARK and SET_TAG. + - green: QUEUE, RSS, PORT_ID, REPRESENTED_PORT, JUMP, DROP, MODIFY_FIELD, MARK and SET_TAG. + - yellow: QUEUE, RSS, PORT_ID, REPRESENTED_PORT, JUMP, DROP, MODIFY_FIELD, MARK and SET_TAG. - RED: must be DROP. - Policy actions of RSS for green and yellow should have the same configuration except queues. - Policy with RSS/queue action is not supported when ``dv_xmeta_en`` enabled. diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c index 78cb38d42b..52b5463648 100644 --- a/drivers/net/mlx5/mlx5_flow.c +++ b/drivers/net/mlx5/mlx5_flow.c @@ -7867,6 +7867,8 @@ mlx5_flow_destroy_mtr_acts(struct rte_eth_dev *dev, * Meter policy struct. * @param[in] action * Action specification used to create meter actions. + * @param[in] attr + * Flow rule attributes. * @param[out] error * Perform verbose error reporting if not NULL. Initialized in case of * error only. @@ -7878,12 +7880,13 @@ int mlx5_flow_create_mtr_acts(struct rte_eth_dev *dev, struct mlx5_flow_meter_policy *mtr_policy, const struct rte_flow_action *actions[RTE_COLORS], + struct rte_flow_attr *attr, struct rte_mtr_error *error) { const struct mlx5_flow_driver_ops *fops; fops = flow_get_drv_ops(MLX5_FLOW_TYPE_DV); - return fops->create_mtr_acts(dev, mtr_policy, actions, error); + return fops->create_mtr_acts(dev, mtr_policy, actions, attr, error); } /** diff --git a/drivers/net/mlx5/mlx5_flow.h b/drivers/net/mlx5/mlx5_flow.h index f56115dd11..ed9b9e4876 100644 --- a/drivers/net/mlx5/mlx5_flow.h +++ b/drivers/net/mlx5/mlx5_flow.h @@ -1359,6 +1359,7 @@ typedef int (*mlx5_flow_create_mtr_acts_t) (struct rte_eth_dev *dev, struct mlx5_flow_meter_policy *mtr_policy, const struct rte_flow_action *actions[RTE_COLORS], + struct rte_flow_attr *attr, struct rte_mtr_error *error); typedef void (*mlx5_flow_destroy_mtr_acts_t) (struct rte_eth_dev *dev, @@ -2017,6 +2018,7 @@ void mlx5_flow_destroy_mtr_acts(struct rte_eth_dev *dev, int mlx5_flow_create_mtr_acts(struct rte_eth_dev *dev, struct mlx5_flow_meter_policy *mtr_policy, const struct rte_flow_action *actions[RTE_COLORS], + struct rte_flow_attr *attr, struct rte_mtr_error *error); int mlx5_flow_create_policy_rules(struct rte_eth_dev *dev, struct mlx5_flow_meter_policy *mtr_policy); diff --git a/drivers/net/mlx5/mlx5_flow_dv.c b/drivers/net/mlx5/mlx5_flow_dv.c index e416eb5701..a01ba04c3b 100644 --- a/drivers/net/mlx5/mlx5_flow_dv.c +++ b/drivers/net/mlx5/mlx5_flow_dv.c @@ -15579,6 +15579,8 @@ flow_dv_destroy_mtr_policy_acts(struct rte_eth_dev *dev, * Meter policy struct. 
* @param[in] action * Action specification used to create meter actions. + * @param[in] attr + * Pointer to the flow attributes. * @param[out] error * Perform verbose error reporting if not NULL. Initialized in case of * error only. @@ -15590,6 +15592,7 @@ static int __flow_dv_create_domain_policy_acts(struct rte_eth_dev *dev, struct mlx5_flow_meter_policy *mtr_policy, const struct rte_flow_action *actions[RTE_COLORS], + struct rte_flow_attr *attr, enum mlx5_meter_domain domain, struct rte_mtr_error *error) { @@ -15886,6 +15889,28 @@ __flow_dv_create_domain_policy_acts(struct rte_eth_dev *dev, action_flags |= MLX5_FLOW_ACTION_JUMP; break; } + case RTE_FLOW_ACTION_TYPE_MODIFY_FIELD: + {
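For context, the testpmd command shown in this patch maps onto the rte_mtr API roughly as sketched below. This is not code from the patch: RTE_FLOW_FIELD_IPV4_ECN is the enumerator added in patch 1/4, and the way an immediate source value is stored in struct rte_flow_action_modify_data differs between DPDK releases, so that detail is only indicated by a comment.

#include <rte_flow.h>
#include <rte_mtr.h>

/* Sketch: the green action rewrites IPv4 ECN to 3 (CE). Yellow and red are
 * omitted for brevity; a real policy may need explicit actions there
 * (e.g. drop for red, depending on the PMD's limitations).
 */
static int
add_ecn_policy(uint16_t port_id, uint32_t policy_id, struct rte_mtr_error *err)
{
	struct rte_flow_action_modify_field set_ecn = {
		.operation = RTE_FLOW_MODIFY_SET,
		.dst = { .field = RTE_FLOW_FIELD_IPV4_ECN },
		.src = { .field = RTE_FLOW_FIELD_VALUE },
		.width = 2,
	};
	/* The immediate value 3 must be placed in set_ecn.src according to
	 * the rte_flow_action_modify_data layout of the DPDK version in use
	 * (e.g. a value[] byte array in recent releases). Omitted here.
	 */
	const struct rte_flow_action green[] = {
		{ .type = RTE_FLOW_ACTION_TYPE_MODIFY_FIELD, .conf = &set_ecn },
		{ .type = RTE_FLOW_ACTION_TYPE_END },
	};
	struct rte_mtr_meter_policy_params params = {
		.actions = { [RTE_COLOR_GREEN] = green },
	};

	return rte_mtr_meter_policy_add(port_id, policy_id, &params, err);
}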
[v1 1/4] ethdev: add IPv4/IPv6 ECN header rewrite action
From: Jiawei Wang This patch introduces the IPv4/IPv6 ECN modify field support, and adds the testpmd CLI commands support. Usage: modify_field op set dst_type ipv4_ecn src_type ... For example: flow create 0 ingress group 1 pattern eth / ipv4 / end actions modify_field op set dst_type ipv4_ecn src_type value src_value 0x03 width 2 / queue index 0 / end Signed-off-by: Jiawei Wang --- app/test-pmd/cmdline_flow.c | 3 ++- lib/ethdev/rte_flow.h | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/app/test-pmd/cmdline_flow.c b/app/test-pmd/cmdline_flow.c index fc4a6d9cca..3250add834 100644 --- a/app/test-pmd/cmdline_flow.c +++ b/app/test-pmd/cmdline_flow.c @@ -795,7 +795,8 @@ static const char *const modify_field_ids[] = { "tcp_seq_num", "tcp_ack_num", "tcp_flags", "udp_port_src", "udp_port_dst", "vxlan_vni", "geneve_vni", "gtp_teid", - "tag", "mark", "meta", "pointer", "value", NULL + "tag", "mark", "meta", "pointer", "value", + "ipv4_ecn", "ipv6_ecn", NULL }; /** Maximum number of subsequent tokens and arguments on the stack. */ diff --git a/lib/ethdev/rte_flow.h b/lib/ethdev/rte_flow.h index d8827dd184..1b56f23cba 100644 --- a/lib/ethdev/rte_flow.h +++ b/lib/ethdev/rte_flow.h @@ -3690,6 +3690,8 @@ enum rte_flow_field_id { RTE_FLOW_FIELD_META,/**< Metadata value. */ RTE_FLOW_FIELD_POINTER, /**< Memory pointer. */ RTE_FLOW_FIELD_VALUE, /**< Immediate value. */ + RTE_FLOW_FIELD_IPV4_ECN,/**< IPv4 ECN. */ + RTE_FLOW_FIELD_IPV6_ECN,/**< IPv6 ECN. */ }; /** -- 2.31.1
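The flow create command above corresponds roughly to the following rte_flow call. This is a hedged sketch rather than code from the patch: RTE_FLOW_FIELD_IPV4_ECN is the enumerator introduced here, and the placement of the immediate source value inside struct rte_flow_action_modify_data varies between DPDK releases, so it is only indicated by a comment.

#include <rte_flow.h>

/* Sketch: ingress group 1, match eth/ipv4, set the 2-bit IPv4 ECN field,
 * then steer to queue 0.
 */
static struct rte_flow *
create_set_ecn_flow(uint16_t port_id, struct rte_flow_error *error)
{
	struct rte_flow_attr attr = { .group = 1, .ingress = 1 };
	struct rte_flow_item pattern[] = {
		{ .type = RTE_FLOW_ITEM_TYPE_ETH },
		{ .type = RTE_FLOW_ITEM_TYPE_IPV4 },
		{ .type = RTE_FLOW_ITEM_TYPE_END },
	};
	struct rte_flow_action_modify_field set_ecn = {
		.operation = RTE_FLOW_MODIFY_SET,
		.dst = { .field = RTE_FLOW_FIELD_IPV4_ECN },
		.src = { .field = RTE_FLOW_FIELD_VALUE },
		/* The immediate value 0x03 goes into set_ecn.src as required by
		 * the rte_flow_action_modify_data layout of the release in use. */
		.width = 2,
	};
	struct rte_flow_action_queue queue = { .index = 0 };
	struct rte_flow_action actions[] = {
		{ .type = RTE_FLOW_ACTION_TYPE_MODIFY_FIELD, .conf = &set_ecn },
		{ .type = RTE_FLOW_ACTION_TYPE_QUEUE, .conf = &queue },
		{ .type = RTE_FLOW_ACTION_TYPE_END },
	};

	return rte_flow_create(port_id, &attr, pattern, actions, error);
}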
Re: [PATCH v2] ci: add Fedora 35 container in GHA
On Fri, Apr 1, 2022 at 8:03 PM David Marchand wrote: > > Build DPDK with Fedora 35 containers. > > GHA container support does not allow caching images and docker hub > seems to limit image pulls. > On the other hand, the Fedora project hub does not seem to limit them, > so prefer this hub. > Nevertheless, let's try to be good citizens and cache (once a day) a > prepared image for subsequent builds. We could move to a pipeline approach, with a first-stage job preparing the image and pushing it to the cache, and a second-stage job pulling the image from the cache and running the compilations. Food for thought... > > Differences with the Ubuntu GHA vm images: > - tasks are run as root in containers, no need for sudo, > - compiler must be explicitly installed, > - GHA artifacts can't contain a ':' in their name, and must be filtered, > -- David Marchand
[PATCH] net/iavf: fix iavf crashed on dev_stop when running in multi-process mode
In the multi process environment, the sub process operates on the shared memory and changes the function pointer of the main process, resulting in the failure to find the address of the function when main process releasing, resulting in crash. similar with commit<20b631efe785819eb77aabbf500b3352e5731bdb> Signed-off-by: Ke Zhang --- drivers/net/iavf/iavf_rxtx.c| 27 ++--- drivers/net/iavf/iavf_rxtx.h| 6 +++--- drivers/net/iavf/iavf_rxtx_vec_avx512.c | 4 ++-- drivers/net/iavf/iavf_rxtx_vec_sse.c| 8 4 files changed, 24 insertions(+), 21 deletions(-) diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c index 16e8d021f9..197c03cd31 100644 --- a/drivers/net/iavf/iavf_rxtx.c +++ b/drivers/net/iavf/iavf_rxtx.c @@ -362,6 +362,9 @@ release_txq_mbufs(struct iavf_tx_queue *txq) } } +const struct iavf_rxq_ops *iavf_rxq_release_mbufs_ops; +const struct iavf_txq_ops *iavf_txq_release_mbufs_ops; + static const struct iavf_rxq_ops def_rxq_ops = { .release_mbufs = release_rxq_mbufs, }; @@ -674,7 +677,7 @@ iavf_dev_rx_queue_setup(struct rte_eth_dev *dev, uint16_t queue_idx, rxq->q_set = true; dev->data->rx_queues[queue_idx] = rxq; rxq->qrx_tail = hw->hw_addr + IAVF_QRX_TAIL1(rxq->queue_id); - rxq->ops = &def_rxq_ops; + iavf_rxq_release_mbufs_ops = &def_rxq_ops; if (check_rx_bulk_allow(rxq) == true) { PMD_INIT_LOG(DEBUG, "Rx Burst Bulk Alloc Preconditions are " @@ -811,7 +814,7 @@ iavf_dev_tx_queue_setup(struct rte_eth_dev *dev, txq->q_set = true; dev->data->tx_queues[queue_idx] = txq; txq->qtx_tail = hw->hw_addr + IAVF_QTX_TAIL1(queue_idx); - txq->ops = &def_txq_ops; + iavf_txq_release_mbufs_ops = &def_txq_ops; if (check_tx_vec_allow(txq) == false) { struct iavf_adapter *ad = @@ -943,7 +946,7 @@ iavf_dev_rx_queue_stop(struct rte_eth_dev *dev, uint16_t rx_queue_id) } rxq = dev->data->rx_queues[rx_queue_id]; - rxq->ops->release_mbufs(rxq); + iavf_rxq_release_mbufs_ops->release_mbufs(rxq); reset_rx_queue(rxq); dev->data->rx_queue_state[rx_queue_id] = RTE_ETH_QUEUE_STATE_STOPPED; @@ -971,7 +974,7 @@ iavf_dev_tx_queue_stop(struct rte_eth_dev *dev, uint16_t tx_queue_id) } txq = dev->data->tx_queues[tx_queue_id]; - txq->ops->release_mbufs(txq); + iavf_txq_release_mbufs_ops->release_mbufs(txq); reset_tx_queue(txq); dev->data->tx_queue_state[tx_queue_id] = RTE_ETH_QUEUE_STATE_STOPPED; @@ -986,7 +989,7 @@ iavf_dev_rx_queue_release(struct rte_eth_dev *dev, uint16_t qid) if (!q) return; - q->ops->release_mbufs(q); + iavf_rxq_release_mbufs_ops->release_mbufs(q); rte_free(q->sw_ring); rte_memzone_free(q->mz); rte_free(q); @@ -1000,7 +1003,7 @@ iavf_dev_tx_queue_release(struct rte_eth_dev *dev, uint16_t qid) if (!q) return; - q->ops->release_mbufs(q); + iavf_txq_release_mbufs_ops->release_mbufs(q); rte_free(q->sw_ring); rte_memzone_free(q->mz); rte_free(q); @@ -1034,7 +1037,7 @@ iavf_stop_queues(struct rte_eth_dev *dev) txq = dev->data->tx_queues[i]; if (!txq) continue; - txq->ops->release_mbufs(txq); + iavf_txq_release_mbufs_ops->release_mbufs(txq); reset_tx_queue(txq); dev->data->tx_queue_state[i] = RTE_ETH_QUEUE_STATE_STOPPED; } @@ -1042,7 +1045,7 @@ iavf_stop_queues(struct rte_eth_dev *dev) rxq = dev->data->rx_queues[i]; if (!rxq) continue; - rxq->ops->release_mbufs(rxq); + iavf_rxq_release_mbufs_ops->release_mbufs(rxq); reset_rx_queue(rxq); dev->data->rx_queue_state[i] = RTE_ETH_QUEUE_STATE_STOPPED; } @@ -2825,7 +2828,7 @@ iavf_set_rx_function(struct rte_eth_dev *dev) for (i = 0; i < dev->data->nb_rx_queues; i++) { rxq = dev->data->rx_queues[i]; - (void)iavf_rxq_vec_setup(rxq); + 
(void)iavf_rxq_vec_setup(rxq, &iavf_rxq_release_mbufs_ops); } if (dev->data->scattered_rx) { @@ -3008,11 +3011,11 @@ iavf_set_tx_function(struct rte_eth_dev *dev) continue; #ifdef CC_AVX512_SUPPORT if (use_avx512) - iavf_txq_vec_setup_avx512(txq); + iavf_txq_vec_setup_avx512(&iavf_txq_release_mbufs_ops); else - iavf_txq_vec_setup(txq); + iavf_txq_vec_setup(&iavf_txq_release_mbufs_ops); #else - iavf_txq_vec_setup(txq); +
RE: [PATCH v2] eal: add seqlock
> From: Stephen Hemminger [mailto:step...@networkplumber.org] > Sent: Saturday, 2 April 2022 02.54 > > Semantics and naming should be the same as Linux kernel or you risk > having to reeducate too many people. Although I do see significant value in that point, I don't consider the Linux kernel API the definitive golden standard in all regards. If DPDK can do better, it should. However, if different naming/semantics does a better job for DPDK, then we should take care to avoid similar function names with different behavior than Linux, to reduce the risk of incorrect use by seasoned Linux kernel developers.
RE: [PATCH v3] eal: add seqlock
> From: Honnappa Nagarahalli [mailto:honnappa.nagaraha...@arm.com] > Sent: Saturday, 2 April 2022 02.22 > > > From: Mattias Rönnblom > > Sent: Friday, April 1, 2022 10:08 AM > > > > diff --git a/lib/eal/include/rte_seqlock.h > > b/lib/eal/include/rte_seqlock.h new file > Other lock implementations are in lib/eal/include/generic. I'm not sure why what goes where... e.g. rte_branch_prediction.h and rte_bitops.h are not in include/generic. But I agree that keeping lock implementations in the same location makes sense. Also, other lock implementations have their init() function in their header file, so you could consider getting rid of the C file. I don't care, just mentioning it. > > +/** > > + * The RTE seqlock type. > > + */ > > +typedef struct { > > + uint32_t sn; /**< A sequence number for the protected data. */ > > + rte_spinlock_t lock; /**< Spinlock used to serialize writers. */ > } > Suggest using ticket lock for the writer side. It should have low > overhead when there is a single writer, but provides better > functionality when there are multiple writers. A spinlock and a ticket lock have the same size, so there is no memory cost either. Unless using a ticket lock stands in the way for future extensions to the seqlock library, then it seems like a good idea. > > +__rte_experimental > > +static inline bool > > +rte_seqlock_read_tryunlock(const rte_seqlock_t *seqlock, uint32_t > > +*begin_sn) { Did anyone object to adding the __attribute__((warn_unused_result))? Otherwise, I think you should add it.
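For readers unfamiliar with the attribute being discussed: warn_unused_result makes the compiler complain when a caller ignores a return value, which is exactly the failure mode of a try-unlock style API. A generic, standalone sketch, not the proposed seqlock code:

#include <stdbool.h>

/* The attribute makes the compiler warn if a caller discards the result. */
__attribute__((warn_unused_result))
static inline bool
try_step(int *state)
{
	/* Succeeds only for the first caller; an arbitrary stand-in. */
	return __atomic_fetch_add(state, 1, __ATOMIC_RELAXED) == 0;
}

static void
user(int *state)
{
	if (!try_step(state)) {
		/* handle failure: retry or fall back */
	}
	/* Calling try_step(state); without using the result would warn. */
}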
[v4 1/3] ethdev: introduce protocol type based header split
From: Xuan Ding Header split consists of splitting a received packet into two separate regions based on the packet content. The split happens after the packet header and before the packet payload. Splitting is usually between the packet header that can be posted to a dedicated buffer and the packet payload that can be posted to a different buffer. Currently, Rx buffer split supports length and offset based packet split. Although header split is a subset of buffer split, configuring buffer split based on length is not suitable for NICs that do split based on header protocol types. Because tunneling makes the conversion from length to protocol type impossible. This patch extends the current buffer split to support protocol type and offset based header split. A new proto field is introduced in the rte_eth_rxseg_split structure reserved field to specify header protocol type. With Rx offload flag RTE_ETH_RX_OFFLOAD_HEADER_SPLIT enabled and protocol type configured, PMD will split the ingress packets into two separate regions. Currently, both inner and outer L2/L3/L4 level header split can be supported. For example, let's suppose we configured the Rx queue with the following segments: seg0 - pool0, off0=2B seg1 - pool1, off1=128B With header split type configured with RTE_ETH_RX_HEADER_SPLIT_UDP, the packet consists of MAC_IP_UDP_PAYLOAD will be split like following: seg0 - udp header @ RTE_PKTMBUF_HEADROOM + 2 in mbuf from pool0 seg1 - payload @ 128 in mbuf from pool1 The memory attributes for the split parts may differ either - for example the mempool0 and mempool1 belong to dpdk memory and external memory, respectively. Signed-off-by: Xuan Ding Signed-off-by: Yuan Wang Signed-off-by: Wenxuan Wu Reviewed-by: Qi Zhang --- lib/ethdev/rte_ethdev.c | 34 ++--- lib/ethdev/rte_ethdev.h | 48 +++-- 2 files changed, 72 insertions(+), 10 deletions(-) diff --git a/lib/ethdev/rte_ethdev.c b/lib/ethdev/rte_ethdev.c index 29a3d80466..29adcdc2f0 100644 --- a/lib/ethdev/rte_ethdev.c +++ b/lib/ethdev/rte_ethdev.c @@ -1661,6 +1661,7 @@ rte_eth_rx_queue_check_split(const struct rte_eth_rxseg_split *rx_seg, struct rte_mempool *mpl = rx_seg[seg_idx].mp; uint32_t length = rx_seg[seg_idx].length; uint32_t offset = rx_seg[seg_idx].offset; + uint16_t proto = rx_seg[seg_idx].proto; if (mpl == NULL) { RTE_ETHDEV_LOG(ERR, "null mempool pointer\n"); @@ -1694,13 +1695,29 @@ rte_eth_rx_queue_check_split(const struct rte_eth_rxseg_split *rx_seg, } offset += seg_idx != 0 ? 0 : RTE_PKTMBUF_HEADROOM; *mbp_buf_size = rte_pktmbuf_data_room_size(mpl); - length = length != 0 ? length : *mbp_buf_size; - if (*mbp_buf_size < length + offset) { - RTE_ETHDEV_LOG(ERR, - "%s mbuf_data_room_size %u < %u (segment length=%u + segment offset=%u)\n", - mpl->name, *mbp_buf_size, - length + offset, length, offset); - return -EINVAL; + if (proto == RTE_ETH_RX_HEADER_SPLIT_NONE) { + /* Check buffer split. */ + length = length != 0 ? length : *mbp_buf_size; + if (*mbp_buf_size < length + offset) { + RTE_ETHDEV_LOG(ERR, + "%s mbuf_data_room_size %u < %u (segment length=%u + segment offset=%u)\n", + mpl->name, *mbp_buf_size, + length + offset, length, offset); + return -EINVAL; + } + } else { + /* Check header split. 
*/ + if (length != 0) { + RTE_ETHDEV_LOG(ERR, "segment length should be set to zero in header split\n"); + return -EINVAL; + } + if (*mbp_buf_size < offset) { + RTE_ETHDEV_LOG(ERR, + "%s mbuf_data_room_size %u < %u segment offset)\n", + mpl->name, *mbp_buf_size, + offset); + return -EINVAL; + } } } return 0; @@ -1778,7 +1795,8 @@ rte_eth_rx_queue_setup(uint16_t port_id, uint16_t rx_queue_id, rx_seg = (const struct rte_eth_rxseg_split *)rx_conf->rx_seg; n_seg = rx_conf->rx_nseg; - if (rx_conf->offloads & RTE_ETH_RX_OFFLOAD_BUFFER_SPLIT) { + if (rx_conf->offloads & RTE_ETH_RX_OFFLOAD_BUFFER_SPLIT || +
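To make the proposed configuration concrete, an Rx queue set up for UDP header split along the lines of the seg0/seg1 example above could look like the sketch below. It uses the names proposed in this series (the new proto field, RTE_ETH_RX_HEADER_SPLIT_UDP and RTE_ETH_RX_OFFLOAD_HEADER_SPLIT); the series is still under review, so the final API may differ.

#include <rte_ethdev.h>

/* Sketch: split MAC/IP/UDP headers into hdr_pool, payload into pay_pool. */
static int
setup_header_split_queue(uint16_t port_id, uint16_t queue_id,
			 struct rte_mempool *hdr_pool,
			 struct rte_mempool *pay_pool,
			 unsigned int socket_id)
{
	union rte_eth_rxseg rx_seg[2] = {
		{ .split = { .mp = hdr_pool, .offset = 2,
			     .proto = RTE_ETH_RX_HEADER_SPLIT_UDP } },
		{ .split = { .mp = pay_pool, .offset = 128 } },
	};
	struct rte_eth_rxconf rxconf = {
		.offloads = RTE_ETH_RX_OFFLOAD_HEADER_SPLIT,
		.rx_seg = rx_seg,
		.rx_nseg = 2,
	};

	/* mb_pool is NULL: buffers come from the per-segment mempools. */
	return rte_eth_rx_queue_setup(port_id, queue_id, 1024, socket_id,
				      &rxconf, NULL);
}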
[v4 0/3] ethdev: introduce protocol type based header split
From: Xuan Ding Header split consists of splitting a received packet into two separate regions based on the packet content. It is useful in some scenarios, such as GPU acceleration. The splitting will help to enable true zero copy and hence improve the performance significantly. This patchset extends the current buffer split to support protocol based header split. When Rx queue is configured with header split feature, packets received will be directly splitted into two different mempools. v3->v4: * Use RTE_ETH_RX_HEADER_SPLIT_NONE instead of 0. v2->v3: * Fix a PMD bug. * Add rx queue header split check. * Revise the log and doc. v1->v2: * Add support for all header split protocol types. Xuan Ding (3): ethdev: introduce protocol type based header split app/testpmd: add header split configuration net/ice: support header split in Rx data path app/test-pmd/cmdline.c| 117 ++ app/test-pmd/testpmd.c| 6 +- app/test-pmd/testpmd.h| 2 + drivers/net/ice/ice_ethdev.c | 10 +- drivers/net/ice/ice_rxtx.c| 223 ++ drivers/net/ice/ice_rxtx.h| 16 ++ drivers/net/ice/ice_rxtx_vec_common.h | 3 + lib/ethdev/rte_ethdev.c | 34 +++- lib/ethdev/rte_ethdev.h | 48 +- 9 files changed, 417 insertions(+), 42 deletions(-) -- 2.17.1
[v4 2/3] app/testpmd: add header split configuration
From: Xuan Ding This patch adds header split configuration in testpmd. The header split feature is off by default. To enable header split, you need: 1. Configure Rx queue with rx_offload header split on. 2. Set the protocol type of header split. Command for set header split protocol type: testpmd> port config header_split mac|ipv4|ipv6|l3|tcp|udp|sctp| l4|inner_mac|inner_ipv4|inner_ipv6|inner_l3|inner_tcp| inner_udp|inner_sctp|inner_l4 Signed-off-by: Xuan Ding Signed-off-by: Yuan Wang Signed-off-by: Wenxuan Wu --- app/test-pmd/cmdline.c | 117 + app/test-pmd/testpmd.c | 6 ++- app/test-pmd/testpmd.h | 2 + 3 files changed, 124 insertions(+), 1 deletion(-) diff --git a/app/test-pmd/cmdline.c b/app/test-pmd/cmdline.c index 6ffea8e21a..abda81b4bc 100644 --- a/app/test-pmd/cmdline.c +++ b/app/test-pmd/cmdline.c @@ -866,6 +866,12 @@ static void cmd_help_long_parsed(void *parsed_result, " Enable or disable a per port Rx offloading" " on all Rx queues of a port\n\n" + "port config header_split mac|ipv4|ipv6|l3|tcp|udp|sctp|l4|" + "inner_mac|inner_ipv4|inner_ipv6|inner_l3|inner_tcp|" + "inner_udp|inner_sctp|inner_l4\n" + " Configure protocol for header split" + " on all Rx queues of a port\n\n" + "port (port_id) rxq (queue_id) rx_offload vlan_strip|" "ipv4_cksum|udp_cksum|tcp_cksum|tcp_lro|qinq_strip|" "outer_ipv4_cksum|macsec_strip|header_split|" @@ -16353,6 +16359,116 @@ cmdline_parse_inst_t cmd_config_per_port_rx_offload = { } }; +/* config a per port header split protocol */ +struct cmd_config_per_port_headersplit_protocol_result { + cmdline_fixed_string_t port; + cmdline_fixed_string_t config; + uint16_t port_id; + cmdline_fixed_string_t headersplit; + cmdline_fixed_string_t protocol; +}; + +cmdline_parse_token_string_t cmd_config_per_port_headersplit_protocol_result_port = + TOKEN_STRING_INITIALIZER + (struct cmd_config_per_port_headersplit_protocol_result, +port, "port"); +cmdline_parse_token_string_t cmd_config_per_port_headersplit_protocol_result_config = + TOKEN_STRING_INITIALIZER + (struct cmd_config_per_port_headersplit_protocol_result, +config, "config"); +cmdline_parse_token_num_t cmd_config_per_port_headersplit_protocol_result_port_id = + TOKEN_NUM_INITIALIZER + (struct cmd_config_per_port_headersplit_protocol_result, +port_id, RTE_UINT16); +cmdline_parse_token_string_t cmd_config_per_port_headersplit_protocol_result_headersplit = + TOKEN_STRING_INITIALIZER + (struct cmd_config_per_port_headersplit_protocol_result, +headersplit, "header_split"); +cmdline_parse_token_string_t cmd_config_per_port_headersplit_protocol_result_protocol = + TOKEN_STRING_INITIALIZER + (struct cmd_config_per_port_headersplit_protocol_result, +protocol, "mac#ipv4#ipv6#l3#tcp#udp#sctp#l4#" + "inner_mac#inner_ipv4#inner_ipv6#inner_l3#inner_tcp#" + "inner_udp#inner_sctp#inner_l4"); + +static void +cmd_config_per_port_headersplit_protocol_parsed(void *parsed_result, + __rte_unused struct cmdline *cl, + __rte_unused void *data) +{ + struct cmd_config_per_port_headersplit_protocol_result *res = parsed_result; + portid_t port_id = res->port_id; + struct rte_port *port = &ports[port_id]; + uint16_t protocol; + + if (port_id_is_invalid(port_id, ENABLED_WARN)) + return; + + if (port->port_status != RTE_PORT_STOPPED) { + fprintf(stderr, + "Error: Can't config offload when Port %d is not stopped\n", + port_id); + return; + } + + if (!strcmp(res->protocol, "mac")) + protocol = RTE_ETH_RX_HEADER_SPLIT_MAC; + else if (!strcmp(res->protocol, "ipv4")) + protocol = RTE_ETH_RX_HEADER_SPLIT_IPV4; + else if (!strcmp(res->protocol, 
"ipv6")) + protocol = RTE_ETH_RX_HEADER_SPLIT_IPV6; + else if (!strcmp(res->protocol, "l3")) + protocol = RTE_ETH_RX_HEADER_SPLIT_L3; + else if (!strcmp(res->protocol, "tcp")) + protocol = RTE_ETH_RX_HEADER_SPLIT_TCP; + else if (!strcmp(res->protocol, "udp")) + protocol = RTE_ETH_RX_HEADER_SPLIT_UDP; + else if (!strcmp(res->protocol, "sctp")) + protocol = RTE_ETH_RX_HEADER_SPLIT_SCTP; + else if (!strcmp(res->protocol, "l4")) + protocol = RTE_ETH_RX_HEADER_SPLIT_L4; + e
[v4 3/3] net/ice: support header split in Rx data path
From: Xuan Ding This patch adds support for header split in normal Rx data paths. When the Rx queue is configured with header split for specific protocol type, packets received will be directly splited into header and payload parts. And the two parts will be put into different mempools. Currently, header split is not supported in vectorized paths. Signed-off-by: Xuan Ding Signed-off-by: Yuan Wang Signed-off-by: Wenxuan Wu Reviewed-by: Qi Zhang --- drivers/net/ice/ice_ethdev.c | 10 +- drivers/net/ice/ice_rxtx.c| 223 ++ drivers/net/ice/ice_rxtx.h| 16 ++ drivers/net/ice/ice_rxtx_vec_common.h | 3 + 4 files changed, 221 insertions(+), 31 deletions(-) diff --git a/drivers/net/ice/ice_ethdev.c b/drivers/net/ice/ice_ethdev.c index 13adcf90ed..cb32265dbe 100644 --- a/drivers/net/ice/ice_ethdev.c +++ b/drivers/net/ice/ice_ethdev.c @@ -3713,7 +3713,8 @@ ice_dev_info_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info) RTE_ETH_RX_OFFLOAD_OUTER_IPV4_CKSUM | RTE_ETH_RX_OFFLOAD_VLAN_EXTEND | RTE_ETH_RX_OFFLOAD_RSS_HASH | - RTE_ETH_RX_OFFLOAD_TIMESTAMP; + RTE_ETH_RX_OFFLOAD_TIMESTAMP | + RTE_ETH_RX_OFFLOAD_HEADER_SPLIT; dev_info->tx_offload_capa |= RTE_ETH_TX_OFFLOAD_QINQ_INSERT | RTE_ETH_TX_OFFLOAD_IPV4_CKSUM | @@ -3725,7 +3726,7 @@ ice_dev_info_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info) dev_info->flow_type_rss_offloads |= ICE_RSS_OFFLOAD_ALL; } - dev_info->rx_queue_offload_capa = 0; + dev_info->rx_queue_offload_capa = RTE_ETH_RX_OFFLOAD_HEADER_SPLIT; dev_info->tx_queue_offload_capa = RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE; dev_info->reta_size = pf->hash_lut_size; @@ -3794,6 +3795,11 @@ ice_dev_info_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info) dev_info->default_rxportconf.ring_size = ICE_BUF_SIZE_MIN; dev_info->default_txportconf.ring_size = ICE_BUF_SIZE_MIN; + dev_info->rx_seg_capa.max_nseg = ICE_RX_MAX_NSEG; + dev_info->rx_seg_capa.multi_pools = 1; + dev_info->rx_seg_capa.offset_allowed = 0; + dev_info->rx_seg_capa.offset_align_log2 = 0; + return 0; } diff --git a/drivers/net/ice/ice_rxtx.c b/drivers/net/ice/ice_rxtx.c index 041f4bc91f..1f245c853b 100644 --- a/drivers/net/ice/ice_rxtx.c +++ b/drivers/net/ice/ice_rxtx.c @@ -282,7 +282,6 @@ ice_program_hw_rx_queue(struct ice_rx_queue *rxq) /* Set buffer size as the head split is disabled. 
*/ buf_size = (uint16_t)(rte_pktmbuf_data_room_size(rxq->mp) - RTE_PKTMBUF_HEADROOM); - rxq->rx_hdr_len = 0; rxq->rx_buf_len = RTE_ALIGN(buf_size, (1 << ICE_RLAN_CTX_DBUF_S)); rxq->max_pkt_len = RTE_MIN((uint32_t)ICE_SUPPORT_CHAIN_NUM * rxq->rx_buf_len, @@ -311,11 +310,54 @@ ice_program_hw_rx_queue(struct ice_rx_queue *rxq) memset(&rx_ctx, 0, sizeof(rx_ctx)); + if (rxq->offloads & RTE_ETH_RX_OFFLOAD_HEADER_SPLIT) { + switch (rxq->rxseg[0].proto) { + case RTE_ETH_RX_HEADER_SPLIT_MAC: + rx_ctx.dtype = ICE_RX_DTYPE_HEADER_SPLIT; + rx_ctx.hsplit_1 = ICE_RLAN_RX_HSPLIT_1_SPLIT_L2; + break; + case RTE_ETH_RX_HEADER_SPLIT_INNER_MAC: + rx_ctx.dtype = ICE_RX_DTYPE_HEADER_SPLIT; + rx_ctx.hsplit_0 = ICE_RLAN_RX_HSPLIT_0_SPLIT_L2; + break; + case RTE_ETH_RX_HEADER_SPLIT_IPV4: + case RTE_ETH_RX_HEADER_SPLIT_IPV6: + case RTE_ETH_RX_HEADER_SPLIT_L3: + case RTE_ETH_RX_HEADER_SPLIT_INNER_IPV4: + case RTE_ETH_RX_HEADER_SPLIT_INNER_IPV6: + case RTE_ETH_RX_HEADER_SPLIT_INNER_L3: + rx_ctx.dtype = ICE_RX_DTYPE_HEADER_SPLIT; + rx_ctx.hsplit_0 = ICE_RLAN_RX_HSPLIT_0_SPLIT_IP; + break; + case RTE_ETH_RX_HEADER_SPLIT_TCP: + case RTE_ETH_RX_HEADER_SPLIT_UDP: + case RTE_ETH_RX_HEADER_SPLIT_INNER_TCP: + case RTE_ETH_RX_HEADER_SPLIT_INNER_UDP: + rx_ctx.dtype = ICE_RX_DTYPE_HEADER_SPLIT; + rx_ctx.hsplit_0 = ICE_RLAN_RX_HSPLIT_0_SPLIT_TCP_UDP; + break; + case RTE_ETH_RX_HEADER_SPLIT_SCTP: + case RTE_ETH_RX_HEADER_SPLIT_INNER_SCTP: + rx_ctx.dtype = ICE_RX_DTYPE_HEADER_SPLIT; + rx_ctx.hsplit_0 = ICE_RLAN_RX_HSPLIT_0_SPLIT_SCTP; + break; + case RTE_ETH_RX_HEADER_SPLIT_NONE: + PMD_DRV_LOG(ERR, "Header split protocol must be confi
RE: [PATCH v2] examples/l3fwd: merge l3fwd-acl into l3fwd
Hi Sean, My comments inline. Thanks Konstantin > l3fwd-acl contains duplicate functions to l3fwd. > For this reason we merge l3fwd-acl code into l3fwd > with '--lookup acl' cmdline option to run ACL. > > Signed-off-by: Sean Morrissey > --- > V2: > * add doc changes > * minor code cleanup > --- > doc/guides/rel_notes/release_22_07.rst|4 + > doc/guides/sample_app_ug/index.rst|1 - > doc/guides/sample_app_ug/l3_forward.rst | 63 +- > .../sample_app_ug/l3_forward_access_ctrl.rst | 340 --- > examples/l3fwd-acl/Makefile | 51 - > examples/l3fwd-acl/main.c | 2272 - > examples/l3fwd-acl/meson.build| 13 - > examples/l3fwd/Makefile |2 +- > examples/l3fwd/l3fwd.h| 60 +- > examples/l3fwd/l3fwd_acl.c| 960 +++ > examples/l3fwd/l3fwd_acl.h| 118 + > examples/l3fwd/l3fwd_acl_scalar.h | 163 ++ > examples/l3fwd/l3fwd_route.h | 16 + > examples/l3fwd/main.c | 97 +- > examples/l3fwd/meson.build|3 +- > examples/meson.build |1 - > 16 files changed, 1466 insertions(+), 2698 deletions(-) > delete mode 100644 doc/guides/sample_app_ug/l3_forward_access_ctrl.rst > delete mode 100644 examples/l3fwd-acl/Makefile > delete mode 100644 examples/l3fwd-acl/main.c > delete mode 100644 examples/l3fwd-acl/meson.build > create mode 100644 examples/l3fwd/l3fwd_acl.c > create mode 100644 examples/l3fwd/l3fwd_acl.h > create mode 100644 examples/l3fwd/l3fwd_acl_scalar.h > ... > diff --git a/examples/l3fwd/l3fwd.h b/examples/l3fwd/l3fwd.h > index ad39496e64..6503cc2f4e 100644 > --- a/examples/l3fwd/l3fwd.h > +++ b/examples/l3fwd/l3fwd.h > @@ -7,6 +7,7 @@ > > #include > #include > +#include > > #define DO_RFC_1812_CHECKS > > @@ -61,6 +62,41 @@ > struct parm_cfg { > const char *rule_ipv4_name; > const char *rule_ipv6_name; > + enum rte_acl_classify_alg alg; > +}; > + > +static const struct { > + const char *name; > + enum rte_acl_classify_alg alg; > +} acl_alg[] = { > + { > + .name = "scalar", > + .alg = RTE_ACL_CLASSIFY_SCALAR, > + }, > + { > + .name = "sse", > + .alg = RTE_ACL_CLASSIFY_SSE, > + }, > + { > + .name = "avx2", > + .alg = RTE_ACL_CLASSIFY_AVX2, > + }, > + { > + .name = "neon", > + .alg = RTE_ACL_CLASSIFY_NEON, > + }, > + { > + .name = "altivec", > + .alg = RTE_ACL_CLASSIFY_ALTIVEC, > + }, > + { > + .name = "avx512x16", > + .alg = RTE_ACL_CLASSIFY_AVX512X16, > + }, > + { > + .name = "avx512x32", > + .alg = RTE_ACL_CLASSIFY_AVX512X32, > + }, > }; I probably wasn't clear enough in my previous comments. I think there is no need to move definition of acl_alg[] into .h and making it static. Instead I .h we can just do: extern struct acl_algorithms acl_alg[]; and keep actual definition in .c. > > struct mbuf_table { > @@ -80,6 +116,7 @@ struct lcore_conf { > uint16_t tx_port_id[RTE_MAX_ETHPORTS]; > uint16_t tx_queue_id[RTE_MAX_ETHPORTS]; > struct mbuf_table tx_mbufs[RTE_MAX_ETHPORTS]; > + struct rte_eth_dev_tx_buffer *tx_buffer[RTE_MAX_ETHPORTS]; > void *ipv4_lookup_struct; > void *ipv6_lookup_struct; > } __rte_cache_aligned; > @@ -193,7 +230,10 @@ is_valid_ipv4_pkt(struct rte_ipv4_hdr *pkt, uint32_t > link_len) > int > init_mem(uint16_t portid, unsigned int nb_mbuf); > > -/* Function pointers for LPM, EM or FIB functionality. */ > +/* Function pointers for ACL, LPM, EM or FIB functionality. 
*/ > +void > +setup_acl(const int socketid); > + > void > setup_lpm(const int socketid); > > @@ -203,12 +243,19 @@ setup_hash(const int socketid); > void > setup_fib(const int socketid); > > +int > +acl_check_ptype(int portid); > + > int > em_check_ptype(int portid); > > int > lpm_check_ptype(int portid); > > +uint16_t > +acl_cb_parse_ptype(uint16_t port, uint16_t queue, struct rte_mbuf *pkts[], > + uint16_t nb_pkts, uint16_t max_pkts, void *user_param); > + > uint16_t > em_cb_parse_ptype(uint16_t port, uint16_t queue, struct rte_mbuf *pkts[], > uint16_t nb_pkts, uint16_t max_pkts, void *user_param); > @@ -217,6 +264,9 @@ uint16_t > lpm_cb_parse_ptype(uint16_t port, uint16_t queue, struct rte_mbuf *pkts[], > uint16_t nb_pkts, uint16_t max_pkts, void *user_param); > > +int > +acl_main_loop(__rte_unused void *dummy); > + > int > em_main_loop(__rte_unused void *dummy); > > @@ -278,7 +328,13 @@ int > fib_event_main_loop_tx_q_burst_vector(__rte_unused void *dummy); > > > -/* Return ipv4/ipv6 fwd lookup struct for LPM, EM or FIB. */ > +/* Return ipv4/ipv6
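The extern suggestion in the comment above is the usual pattern for sharing one table across translation units without duplicating it as a static copy in every includer. A minimal sketch with hypothetical file placement (the struct tag and the const qualifier are illustrative, not taken from the patch):

#include <rte_acl.h>
#include <rte_common.h>

/* l3fwd_acl.h — declaration only, visible to every user of the table. */
struct acl_algorithm {
	const char *name;
	enum rte_acl_classify_alg alg;
};

extern const struct acl_algorithm acl_alg[];
extern const unsigned int acl_alg_count;

/* l3fwd_acl.c — single definition, no per-unit static copies. */
const struct acl_algorithm acl_alg[] = {
	{ .name = "scalar", .alg = RTE_ACL_CLASSIFY_SCALAR },
	{ .name = "sse",    .alg = RTE_ACL_CLASSIFY_SSE },
	{ .name = "avx2",   .alg = RTE_ACL_CLASSIFY_AVX2 },
};
const unsigned int acl_alg_count = RTE_DIM(acl_alg);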
Re: [PATCH v2] eal: add seqlock
On 4/2/22 12:25, Morten Brørup wrote: From: Stephen Hemminger [mailto:step...@networkplumber.org] Sent: Saturday, 2 April 2022 02.54 Semantics and naming should be the same as Linux kernel or you risk having to reeducate too many people. Although I do see significant value in that point, I don't consider the Linux kernel API the definitive golden standard in all regards. If DPDK can do better, it should. However, if different naming/semantics does a better job for DPDK, then we should take care to avoid similar function names with different behavior than Linux, to reduce the risk of incorrect use by seasoned Linux kernel developers. Couldn't have said it better myself.
Re: [PATCH v2] eal: add seqlock
On 4/2/22 02:50, Stephen Hemminger wrote: On Wed, 30 Mar 2022 16:26:02 +0200 Mattias Rönnblom wrote: + __atomic_store_n(&seqlock->sn, sn, __ATOMIC_RELAXED); + + /* __ATOMIC_RELEASE to prevent stores after (in program order) +* from happening before the sn store. +*/ + rte_atomic_thread_fence(__ATOMIC_RELEASE); Couldn't atomic store with __ATOMIC_RELEASE do same thing? No, store-release wouldn't prevent later stores from moving up. It only ensures that earlier loads and stores have completed before store-release completes. If later stores could move before a supposed store-release(seqlock->sn), readers could see inconsistent (torn) data with a valid sequence number. +static inline void +rte_seqlock_write_end(rte_seqlock_t *seqlock) +{ + uint32_t sn; + + sn = seqlock->sn + 1; + + /* synchronizes-with the load acquire in rte_seqlock_begin() */ + __atomic_store_n(&seqlock->sn, sn, __ATOMIC_RELEASE); + + rte_spinlock_unlock(&seqlock->lock); Atomic store is not necessary here, the atomic operation in spinlock_unlock wil assure theat the seqeuence number update is ordered correctly. Load-acquire(seqlock->sn) in rte_seqlock_begin() must be paired with store-release(seqlock->sn) in rte_seqlock_write_end() or there wouldn't exist any synchronize-with relationship. Readers don't access the spin lock so any writer-side updates to the spin lock don't mean anything to readers.
Re: [PATCH v3] eal: add seqlock
On 4/1/22 17:07, Mattias Rönnblom wrote: + +/** + * End a read-side critical section. + * + * A call to this function marks the end of a read-side critical + * section, for @p seqlock. The application must supply the sequence + * number produced by the corresponding rte_seqlock_read_lock() (or, + * in case of a retry, the rte_seqlock_tryunlock()) call. + * + * After this function has been called, the caller should not access + * the protected data. + * + * In case this function returns true, the just-read data was + * consistent and the set of atomic and non-atomic load operations + * performed between rte_seqlock_read_lock() and + * rte_seqlock_read_tryunlock() were atomic, as a whole. + * + * In case rte_seqlock_read_tryunlock() returns false, the data was + * modified as it was being read and may be inconsistent, and thus + * should be discarded. The @p begin_sn is updated with the + * now-current sequence number. + * + * @param seqlock + * A pointer to the seqlock. + * @param begin_sn + * The seqlock sequence number returned by + * rte_seqlock_read_lock() (potentially updated in subsequent + * rte_seqlock_read_tryunlock() calls) for this critical section. + * @return + * true or false, if the just-read seqlock-protected data was consistent + * or inconsistent, respectively, at the time it was read. + * + * @see rte_seqlock_read_lock() + */ +__rte_experimental +static inline bool +rte_seqlock_read_tryunlock(const rte_seqlock_t *seqlock, uint32_t *begin_sn) +{ + uint32_t end_sn; + + /* make sure the data loads happens before the sn load */ + rte_atomic_thread_fence(__ATOMIC_ACQUIRE); + + end_sn = __atomic_load_n(&seqlock->sn, __ATOMIC_RELAXED); Since we are reading and potentially returning the sequence number here (repeating the read of the protected data), we need to use load-acquire. I assume it is not expected that the user will call rte_seqlock_read_lock() again. Seeing this implementation, I might actually prefer the original implementation, I think it is cleaner. But I would like for the begin function also to wait for an even sequence number, the end function would only have to check for same sequence number, this might improve performance a little bit as readers won't perform one or several broken reads while a write is in progress. The function names are a different thing though. The writer side behaves much more like a lock with mutual exclusion so write_lock/write_unlock makes sense. + + if (unlikely(end_sn & 1 || *begin_sn != end_sn)) { + *begin_sn = end_sn; + return false; + } + + return true; +} +
RE: [PATCH v3] eal: add seqlock
> > +__rte_experimental > > +static inline bool > > +rte_seqlock_read_tryunlock(const rte_seqlock_t *seqlock, uint32_t > > +*begin_sn) { > > + uint32_t end_sn; > > + > > + /* make sure the data loads happens before the sn load */ > > + rte_atomic_thread_fence(__ATOMIC_ACQUIRE); > > + > > + end_sn = __atomic_load_n(&seqlock->sn, __ATOMIC_RELAXED); > > Since we are reading and potentially returning the sequence number here > (repeating the read of the protected data), we need to use load-acquire. > I assume it is not expected that the user will call > rte_seqlock_read_lock() again. Good point, we need a load-acquire (due to changes done in v3). > > Seeing this implementation, I might actually prefer the original > implementation, I think it is cleaner. But I would like for the begin function > also to wait for an even sequence number, the end function would only have > to check for same sequence number, this might improve performance a little > bit as readers won't perform one or several broken reads while a write is in > progress. The function names are a different thing though. I think we need to be optimizing for the case where there is no contention between readers and writers (as that happens most of the time). From this perspective, not checking for an even seq number in the begin function would reduce one 'if' statement. Going back to the earlier model is better as well, because of the load-acquire required in the 'rte_seqlock_read_tryunlock' function. The earlier model would not introduce the load-acquire for the no contention case. > > The writer side behaves much more like a lock with mutual exclusion so > write_lock/write_unlock makes sense. > > > + > > + if (unlikely(end_sn & 1 || *begin_sn != end_sn)) { > > + *begin_sn = end_sn; > > + return false; > > + } > > + > > + return true; > > +} > > +
RE: [PATCH v2] eal: add seqlock
> >> +static inline void > >> +rte_seqlock_write_end(rte_seqlock_t *seqlock) { > >> + uint32_t sn; > >> + > >> + sn = seqlock->sn + 1; > >> + > >> + /* synchronizes-with the load acquire in rte_seqlock_begin() > >> */ > >> + __atomic_store_n(&seqlock->sn, sn, __ATOMIC_RELEASE); > >> + > >> + rte_spinlock_unlock(&seqlock->lock); > > Atomic store is not necessary here, the atomic operation in > > spinlock_unlock wil assure theat the seqeuence number update is > > ordered correctly. > Load-acquire(seqlock->sn) in rte_seqlock_begin() must be paired with > store-release(seqlock->sn) in rte_seqlock_write_end() or there wouldn't exist > any synchronize-with relationship. Readers don't access the spin lock so any > writer-side updates to the spin lock don't mean anything to readers. Agree with this assessment. The store-release in spin-lock unlock does not synchronize with the readers.
RE: [PATCH v3] eal: add seqlock
> > > > +__rte_experimental > > > +static inline bool > > > +rte_seqlock_read_tryunlock(const rte_seqlock_t *seqlock, uint32_t > > > +*begin_sn) { > > Did anyone object to adding the __attribute__((warn_unused_result))? > > Otherwise, I think you should add it. +1
RE: [PATCH v3] eal: add seqlock
> From: Honnappa Nagarahalli [mailto:honnappa.nagaraha...@arm.com] > Sent: Saturday, 2 April 2022 21.31 > > > > > > +__rte_experimental > > > +static inline bool > > > +rte_seqlock_read_tryunlock(const rte_seqlock_t *seqlock, uint32_t > > > +*begin_sn) { > > > + uint32_t end_sn; > > > + > > > + /* make sure the data loads happens before the sn load */ > > > + rte_atomic_thread_fence(__ATOMIC_ACQUIRE); > > > + > > > + end_sn = __atomic_load_n(&seqlock->sn, __ATOMIC_RELAXED); > > > > Since we are reading and potentially returning the sequence number > here > > (repeating the read of the protected data), we need to use load- > acquire. > > I assume it is not expected that the user will call > > rte_seqlock_read_lock() again. > Good point, we need a load-acquire (due to changes done in v3). > > > > > Seeing this implementation, I might actually prefer the original > > implementation, I think it is cleaner. But I would like for the begin > function > > also to wait for an even sequence number, the end function would only > have > > to check for same sequence number, this might improve performance a > little > > bit as readers won't perform one or several broken reads while a > write is in > > progress. The function names are a different thing though. > I think we need to be optimizing for the case where there is no > contention between readers and writers (as that happens most of the > time). From this perspective, not checking for an even seq number in > the begin function would reduce one 'if' statement. I might be siding with Ola on this, but with a twist: The read_lock() should not wait, but test. (Or both variants could be available. Or all three, including the variant without checking for an even sequence number.) My argument for this is: The write operation could take a long time to complete, and while this goes on, it is good for the reading threads to know at entry of their critical read section that the read operation will fail, so they can take the alternative code path instead of proceeding into the critical read section. Otherwise, the reading threads have to waste time reading the protected data, only to discard them at the end. It's an optimization based on the assumption that reading the protected data has some small cost, because this small cost adds up if done many times during a longwinded write operation. And, although checking for the sequence number in read_trylock() adds an 'if' statement to it, that 'if' statement should be surrounded by likely() to reduce its cost in the case we are optimizing for, i.e. when no write operation is ongoing. This means that read_trylock() returns a boolean, and the sequence number is returned in an output parameter. Please note that it doesn't change the fact that read_tryunlock() can still fail, even though read_trylock() gave the go-ahead. I'm trying to highlight that while we all agree to optimize for the case of reading while no writing is ongoing, there might be opportunity for optimizing for the opposite case (i.e. trying to read while writing is ongoing) at the same time. I only hope it can be done with negligent performance cost for the primary case. I'll respectfully leave the hardcore implementation details and performance considerations to you experts in this area. :-)
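A sketch of the "test, don't wait" variant described above, using hypothetical names (no such function exists in the posted patch); it only illustrates the proposed shape: return a boolean immediately, hand the sequence number back through an output parameter, and keep the uncontended path cheap with a branch hint.

#include <stdbool.h>
#include <stdint.h>
#include <rte_branch_prediction.h>

/* Hypothetical illustration only; not the API from the patch. */
typedef struct {
	uint32_t sn;
} demo_seqlock_t;

static inline bool
demo_seqlock_read_trylock(const demo_seqlock_t *seqlock, uint32_t *begin_sn)
{
	uint32_t sn = __atomic_load_n(&seqlock->sn, __ATOMIC_ACQUIRE);

	/* An odd sn means a write is in progress: tell the caller up front,
	 * so it can take an alternative path instead of reading data that
	 * would be discarded anyway. */
	if (unlikely(sn & 1))
		return false;

	*begin_sn = sn;
	return true;
}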
RE: [PATCH v3] eal: add seqlock
> > > > > > +__rte_experimental > > > > +static inline bool > > > > +rte_seqlock_read_tryunlock(const rte_seqlock_t *seqlock, uint32_t > > > > +*begin_sn) { > > > > + uint32_t end_sn; > > > > + > > > > + /* make sure the data loads happens before the sn load */ > > > > + rte_atomic_thread_fence(__ATOMIC_ACQUIRE); > > > > + > > > > + end_sn = __atomic_load_n(&seqlock->sn, __ATOMIC_RELAXED); > > > > > > Since we are reading and potentially returning the sequence number > > here > > > (repeating the read of the protected data), we need to use load- > > acquire. > > > I assume it is not expected that the user will call > > > rte_seqlock_read_lock() again. > > Good point, we need a load-acquire (due to changes done in v3). > > > > > > > > Seeing this implementation, I might actually prefer the original > > > implementation, I think it is cleaner. But I would like for the > > > begin > > function > > > also to wait for an even sequence number, the end function would > > > only > > have > > > to check for same sequence number, this might improve performance a > > little > > > bit as readers won't perform one or several broken reads while a > > write is in > > > progress. The function names are a different thing though. > > I think we need to be optimizing for the case where there is no > > contention between readers and writers (as that happens most of the > > time). From this perspective, not checking for an even seq number in > > the begin function would reduce one 'if' statement. > > I might be siding with Ola on this, but with a twist: The read_lock() should > not > wait, but test. (Or both variants could be available. Or all three, including > the > variant without checking for an even sequence number.) > > My argument for this is: The write operation could take a long time to > complete, and while this goes on, it is good for the reading threads to know > at > entry of their critical read section that the read operation will fail, so > they can > take the alternative code path instead of proceeding into the critical read > section. Otherwise, the reading threads have to waste time reading the > protected data, only to discard them at the end. It's an optimization based on > the assumption that reading the protected data has some small cost, because > this small cost adds up if done many times during a longwinded write > operation. > > And, although checking for the sequence number in read_trylock() adds an 'if' > statement to it, that 'if' statement should be surrounded by likely() to > reduce > its cost in the case we are optimizing for, i.e. when no write operation is > ongoing. This 'if' statement can be part of the application code as well. This would allow for multiple models to exist. > > This means that read_trylock() returns a boolean, and the sequence number is > returned in an output parameter. > > Please note that it doesn't change the fact that read_tryunlock() can still > fail, > even though read_trylock() gave the go-ahead. > > I'm trying to highlight that while we all agree to optimize for the case of > reading while no writing is ongoing, there might be opportunity for optimizing > for the opposite case (i.e. trying to read while writing is ongoing) at the > same > time. > > I only hope it can be done with negligent performance cost for the primary > case. > > I'll respectfully leave the hardcore implementation details and performance > considerations to you experts in this area. :-)
RE: [PATCH] net/iavf: fix iavf crashed on dev_stop when running in multi-process mode
> -Original Message- > From: Ke Zhang > Sent: Saturday, April 2, 2022 5:51 PM > To: Li, Xiaoyun ; Wu, Jingjing ; > Xing, Beilei ; dev@dpdk.org > Cc: Zhang, Ke1X > Subject: [PATCH] net/iavf: fix iavf crashed on dev_stop when running in > multi-process mode > > In the multi process environment, the sub process operates on the shared > memory and changes the function pointer of the main process, resulting in > the failure to find the address of the function when main process releasing, > resulting in crash. > > similar with commit<20b631efe785819eb77aabbf500b3352e5731bdb> > > Signed-off-by: Ke Zhang > --- > drivers/net/iavf/iavf_rxtx.c| 27 ++--- > drivers/net/iavf/iavf_rxtx.h| 6 +++--- > drivers/net/iavf/iavf_rxtx_vec_avx512.c | 4 ++-- > drivers/net/iavf/iavf_rxtx_vec_sse.c| 8 > 4 files changed, 24 insertions(+), 21 deletions(-) > > diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c index > 16e8d021f9..197c03cd31 100644 > --- a/drivers/net/iavf/iavf_rxtx.c > +++ b/drivers/net/iavf/iavf_rxtx.c > @@ -362,6 +362,9 @@ release_txq_mbufs(struct iavf_tx_queue *txq) > } > } > > +const struct iavf_rxq_ops *iavf_rxq_release_mbufs_ops; const struct > +iavf_txq_ops *iavf_txq_release_mbufs_ops; > + > static const struct iavf_rxq_ops def_rxq_ops = { > .release_mbufs = release_rxq_mbufs, > }; > @@ -674,7 +677,7 @@ iavf_dev_rx_queue_setup(struct rte_eth_dev *dev, > uint16_t queue_idx, > rxq->q_set = true; > dev->data->rx_queues[queue_idx] = rxq; > rxq->qrx_tail = hw->hw_addr + IAVF_QRX_TAIL1(rxq->queue_id); > - rxq->ops = &def_rxq_ops; > + iavf_rxq_release_mbufs_ops = &def_rxq_ops; This is not correct. Now we replace per-queue ops with a global ops which is not expected. Please reference method of below patch commit 0ed16e01313e1f8930dc6a52b22159b20269d4e0 Author: Steve Yang Date: Mon Feb 28 09:48:59 2022 + net/iavf: fix function pointer in multi-process ... > > diff --git a/drivers/net/iavf/iavf_rxtx_vec_sse.c > b/drivers/net/iavf/iavf_rxtx_vec_sse.c > index 717a227b2c..a782bed2e0 100644 > --- a/drivers/net/iavf/iavf_rxtx_vec_sse.c > +++ b/drivers/net/iavf/iavf_rxtx_vec_sse.c > @@ -1219,16 +1219,16 @@ static const struct iavf_txq_ops sse_vec_txq_ops > = { }; > > int __rte_cold > -iavf_txq_vec_setup(struct iavf_tx_queue *txq) > +iavf_txq_vec_setup(const struct iavf_txq_ops **txq_ops) > { > - txq->ops = &sse_vec_txq_ops; > + *txq_ops = &sse_vec_txq_ops; > return 0; > } > > int __rte_cold > -iavf_rxq_vec_setup(struct iavf_rx_queue *rxq) > +iavf_rxq_vec_setup(struct iavf_rx_queue *rxq, const struct iavf_rxq_ops > +**rxq_ops) > { > - rxq->ops = &sse_vec_rxq_ops; > + *rxq_ops = &sse_vec_rxq_ops; > return iavf_rxq_vec_setup_default(rxq); } seems lots of redundant in iavf_rxtx_vec.sse.c Can we move iavf_r(t)xq_vec_setup | sse_vec_r(t)xq_ops into iavf_rxtx.c and delete iavf_r(t)x_queue_release_mbufs_sse)? Btw, We can keep this patch unchanged, a separated patch to refact the code is expected. > > -- > 2.25.1
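For readers following the thread: per-queue function pointers break in multi-process mode because the queue structures live in shared memory while code addresses are per-process, so a pointer stored by the secondary process is meaningless to the primary. The method the reviewer points to replaces the pointer with an enum kept in the queue plus a per-process lookup table. A generic, hedged sketch of that pattern (illustrative names, not the actual iavf code):

/* Generic sketch of per-process dispatch; not the actual iavf code. */
enum rel_mbufs_type {
	REL_MBUFS_DEFAULT = 0,
	REL_MBUFS_SSE_VEC,
	REL_MBUFS_AVX512_VEC,
};

struct demo_rx_queue {
	/* Lives in shared memory: store a plain enum, never a pointer. */
	enum rel_mbufs_type rel_mbufs_type;
};

static void release_mbufs_default(struct demo_rx_queue *q) { (void)q; }
static void release_mbufs_sse(struct demo_rx_queue *q) { (void)q; }
static void release_mbufs_avx512(struct demo_rx_queue *q) { (void)q; }

/* Per-process table: each process resolves its own function addresses. */
static void (*const release_mbufs_ops[])(struct demo_rx_queue *) = {
	[REL_MBUFS_DEFAULT]    = release_mbufs_default,
	[REL_MBUFS_SSE_VEC]    = release_mbufs_sse,
	[REL_MBUFS_AVX512_VEC] = release_mbufs_avx512,
};

static void
demo_release_mbufs(struct demo_rx_queue *q)
{
	release_mbufs_ops[q->rel_mbufs_type](q);
}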
Re: [PATCH v3] eal: add seqlock
On 2022-04-02 02:21, Honnappa Nagarahalli wrote: Hi Mattias, A few comments inline. -----Original Message----- From: Mattias Rönnblom Sent: Friday, April 1, 2022 10:08 AM To: dev@dpdk.org Cc: tho...@monjalon.net; David Marchand ; onar.ol...@ericsson.com; Honnappa Nagarahalli ; nd ; konstantin.anan...@intel.com; m...@smartsharesystems.com; step...@networkplumber.org; Mattias Rönnblom ; Ola Liljedahl Subject: [PATCH v3] eal: add seqlock A sequence lock (seqlock) is a synchronization primitive which allows for data-race-free, low-overhead, high-frequency reads, especially for data structures shared across many cores and which are updated relatively infrequently. A seqlock permits multiple parallel readers. The variant of seqlock implemented in this patch supports multiple writers as well. A spinlock is used for writer-writer serialization. To avoid resource reclamation and other issues, the data protected by a seqlock is best off being self-contained (i.e., no pointers [except to constant data]). One way to think about seqlocks is that they provide means to perform atomic operations on data objects larger than what the native atomic machine instructions allow for. DPDK seqlocks are not preemption safe on the writer side. A thread preemption affects performance, not correctness. A seqlock contains a sequence number, which can be thought of as the generation of the data it protects. A reader will 1. Load the sequence number (sn). 2. Load, in arbitrary order, the seqlock-protected data. 3. Load the sn again. 4. Check if the first and second sn are equal, and even numbered. If they are not, discard the loaded data, and restart from 1. The first three steps need to be ordered using suitable memory fences. A writer will 1. Take the spinlock, to serialize writer access. 2. Load the sn. 3. Store the original sn + 1 as the new sn. 4. Perform loads and stores to the seqlock-protected data. 5. Store the original sn + 2 as the new sn. 6. Release the spinlock. Proper memory fencing is required to make sure the first sn store, the data stores, and the second sn store appear to the reader in the mentioned order. The sn loads and stores must be atomic, but the data loads and stores need not be. The original seqlock design and implementation was done by Stephen Hemminger. This is an independent implementation, using C11 atomics. For more information on seqlocks, see https://en.wikipedia.org/wiki/Seqlock PATCH v3: * Renamed both read and write-side critical section begin/end functions to better match rwlock naming, per Ola Liljedahl's suggestion. * Added 'extern "C"' guards for C++ compatibility. * Refer to the main lcore as the main, and not the master. PATCH v2: * Skip instead of fail unit test in case too few lcores are available. * Use main lcore for testing, reducing the minimum number of lcores required to run the unit tests to four. * Consistently refer to sn field as the "sequence number" in the documentation. * Fixed spelling mistakes in documentation. Updates since RFC: * Added API documentation. * Added link to Wikipedia article in the commit message. * Changed seqlock sequence number field from uint64_t (which was overkill) to uint32_t. The sn type needs to be sufficiently large to assure no reader will read a sn, access the data, and then read the same sn, but the sn has been updated so many times during the read that it has wrapped. * Added RTE_SEQLOCK_INITIALIZER macro for static initialization. 
* Replaced the rte_seqlock struct + separate rte_seqlock_t typedef with an anonymous struct typedef:ed to rte_seqlock_t. Acked-by: Morten Brørup Reviewed-by: Ola Liljedahl Signed-off-by: Mattias Rönnblom --- app/test/meson.build | 2 + app/test/test_seqlock.c | 202 +++ lib/eal/common/meson.build| 1 + lib/eal/common/rte_seqlock.c | 12 ++ lib/eal/include/meson.build | 1 + lib/eal/include/rte_seqlock.h | 302 ++ lib/eal/version.map | 3 + 7 files changed, 523 insertions(+) create mode 100644 app/test/test_seqlock.c create mode 100644 lib/eal/common/rte_seqlock.c create mode 100644 lib/eal/include/rte_seqlock.h diff --git a/app/test/meson.build b/app/test/meson.build index 5fc1dd1b7b..5e418e8766 100644 --- a/app/test/meson.build +++ b/app/test/meson.build @@ -125,6 +125,7 @@ test_sources = files( 'test_rwlock.c', 'test_sched.c', 'test_security.c', +'test_seqlock.c', 'test_service_cores.c', 'test_spinlock.c', 'test_stack.c', @@ -214,6 +215,7 @@ fast_tests = [ ['rwlock_rde_wro_autotest', true], ['sched_autotest', true], ['security_autotest', false], +['seqlock_autotest', true], ['spinlock_autotest', true], ['stack_autotest', false], ['stack_lf_au
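A compressed sketch of the reader and writer protocols described in the cover letter above, written against the v3 API names; the 'shared' data layout is only an example, not part of the patch.

#include <stdint.h>

#include <rte_seqlock.h>

static struct {
	rte_seqlock_t lock;
	uint64_t a;
	uint64_t b;
} shared = { .lock = RTE_SEQLOCK_INITIALIZER };

/* Reader: steps 1-4, with the retry loop hiding the sn comparison. */
static void
read_pair(uint64_t *a, uint64_t *b)
{
	uint32_t sn;

	sn = rte_seqlock_read_lock(&shared.lock);
	do {
		*a = shared.a;	/* plain, non-atomic loads are fine */
		*b = shared.b;
	} while (!rte_seqlock_read_tryunlock(&shared.lock, &sn));
}

/* Writer: steps 1-6; the sn updates and fences live inside the API calls. */
static void
write_pair(uint64_t a, uint64_t b)
{
	rte_seqlock_write_lock(&shared.lock);	/* serializes writers */
	shared.a = a;
	shared.b = b;
	rte_seqlock_write_unlock(&shared.lock);
}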
Re: [PATCH v2] eal: add seqlock
On 2022-04-02 02:52, Stephen Hemminger wrote: On Thu, 31 Mar 2022 16:53:00 +0200 Ola Liljedahl wrote: From: Ola Liljedahl To: Mattias Rönnblom , "dev@dpdk.org" Cc: Thomas Monjalon , David Marchand , Onar Olsen , "honnappa.nagaraha...@arm.com" , "n...@arm.com" , "konstantin.anan...@intel.com" , "m...@smartsharesystems.com" , "step...@networkplumber.org" Subject: Re: [PATCH v2] eal: add seqlock Date: Thu, 31 Mar 2022 16:53:00 +0200 User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:91.0) Gecko/20100101 Thunderbird/91.7.0 (Thunderbird suddenly refuses to edit in plain text mode, hope the mail gets sent as text anyway) On 3/31/22 15:38, Mattias Rönnblom wrote: On 2022-03-31 11:04, Ola Liljedahl wrote: On 3/31/22 09:46, Mattias Rönnblom wrote: On 2022-03-30 16:26, Mattias Rönnblom wrote: Should the rte_seqlock_read_retry() be called rte_seqlock_read_end(), or some third alternative? I wanted to make clear it's not just a "release the lock" function. You could use the __attribute__((warn_unused_result)) annotation to make clear the return value cannot be ignored, although I'm not sure DPDK ever uses that attribute. We have to decide how to use the seqlock API from the application perspective. Your current proposal: do { sn = rte_seqlock_read_begin(&seqlock); //read protected data } while (rte_seqlock_read_retry(&seqlock, sn)); or perhaps sn = rte_seqlock_read_lock(&seqlock); do { //read protected data } while (!rte_seqlock_read_tryunlock(&seqlock, &sn)); Tryunlock should signal to the user that the unlock operation might not succeed and something needs to be repeated. I like that your proposal is consistent with the rwlock API, although I tend to think about a seqlock more like an arbitrary-size atomic load/store, where begin() is the beginning of the read transaction. I can see the evolution of an application where it starts to use plain spin locks, moves to reader/writer locks for better performance and eventually moves to seqlocks. The usage is the same, only the characteristics (including performance) differ. The semantics of seqlock in DPDK must be the same as what the Linux kernel does or you are asking for trouble. It is not a reader-writer lock in the traditional sense. Does "semantics" here include the naming of the functions? The overall semantics will be the same, except the kernel has a bunch of variants with different kinds of write-side synchronization, if I recall correctly. I'll try to summarize the options as I see them:
Option A (PATCH v3): rte_seqlock_read_lock() rte_seqlock_read_tryunlock() /* with built-in "read restart" */ rte_seqlock_write_lock() rte_seqlock_write_unlock()
Option B (Linux kernel-style naming): rte_seqlock_read_begin() rte_seqlock_read_end() rte_seqlock_write_begin() rte_seqlock_write_end()
Option C (a combination, acknowledging there's a lock on the writer side, but not on the read side): rte_seqlock_read_begin() rte_seqlock_read_retry() rte_seqlock_write_lock() rte_seqlock_write_unlock()
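For comparison, a reader under option C's naming could look like the sketch below; the function names are hypothetical (they follow option C, not the posted v3 patch), and only the spelling differs from option A, the retry-loop shape is identical.

#include <stdint.h>

#include <rte_seqlock.h>

struct counters {
	rte_seqlock_t lock;
	uint64_t rx_pkts;
	uint64_t tx_pkts;
};

static void
counters_read(const struct counters *shared, uint64_t *rx, uint64_t *tx)
{
	uint32_t sn;

	do {
		sn = rte_seqlock_read_begin(&shared->lock);
		*rx = shared->rx_pkts;	/* read protected data */
		*tx = shared->tx_pkts;
	} while (rte_seqlock_read_retry(&shared->lock, sn));
}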
Re: [PATCH v3] eal: add seqlock
I missed some of your comments. On 2022-04-02 02:21, Honnappa Nagarahalli wrote: + * Example usage: + * @code{.c} + * #define MAX_Y_LEN (16) + * // Application-defined example data structure, protected by a seqlock. + * struct config { + * rte_seqlock_t lock; + * int param_x; + * char param_y[MAX_Y_LEN]; + * }; + * + * // Accessor function for reading config fields. + * void + * config_read(const struct config *config, int *param_x, char +*param_y) + * { + * // Temporary variables, just to improve readability. I think the above comment is not necessary. It is beneficial to copy the protected data to keep the read side critical section small. The data here would be copied into the buffers supplied by config_read() anyways, so it's a copy regardless. + * int tentative_x; + * char tentative_y[MAX_Y_LEN]; + * uint32_t sn; + * + * sn = rte_seqlock_read_lock(&config->lock); + * do { + * // Loads may be atomic or non-atomic, as in this example. + * tentative_x = config->param_x; + * strcpy(tentative_y, config->param_y); + * } while (!rte_seqlock_read_tryunlock(&config->lock, &sn)); + * // An application could skip retrying, and try again later, if + * // progress is possible without the data. + * + * *param_x = tentative_x; + * strcpy(param_y, tentative_y); + * } + * + * // Accessor function for writing config fields. + * void + * config_update(struct config *config, int param_x, const char +*param_y) + * { + * rte_seqlock_write_lock(&config->lock); + * // Stores may be atomic or non-atomic, as in this example. + * config->param_x = param_x; + * strcpy(config->param_y, param_y); + * rte_seqlock_write_unlock(&config->lock); + * } + * @endcode + * + * @see + * https://en.wikipedia.org/wiki/Seqlock. + */ + +#include +#include + +#include +#include +#include + +/** + * The RTE seqlock type. + */ +typedef struct { + uint32_t sn; /**< A sequence number for the protected data. */ + rte_spinlock_t lock; /**< Spinlock used to serialize writers. */ } Suggest using ticket lock for the writer side. It should have low overhead when there is a single writer, but provides better functionality when there are multiple writers. Is a seqlock the synchronization primitive of choice for high-contention cases? I would say no, but I'm not sure what you would use instead.
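A sketch of the ticket-lock suggestion, to make it concrete: only the writer-serialization lock type changes, the sequence-number handling stays as in a conventional seqlock write side. This is not the posted patch; the type and function names are hypothetical, and the fencing shown follows the general seqlock pattern, not necessarily the exact v3 code.

#include <stdint.h>

#include <rte_atomic.h>
#include <rte_ticketlock.h>

typedef struct {
	uint32_t sn;		/* sequence number for the protected data */
	rte_ticketlock_t lock;	/* FIFO-fair serialization of writers */
} seqlock_ticket_t;

static inline void
seqlock_ticket_write_lock(seqlock_ticket_t *seqlock)
{
	uint32_t sn;

	rte_ticketlock_lock(&seqlock->lock);

	sn = seqlock->sn + 1;	/* becomes odd: a write is in progress */
	__atomic_store_n(&seqlock->sn, sn, __ATOMIC_RELAXED);

	/* Keep the (odd) sn store ahead of the caller's data stores. */
	rte_atomic_thread_fence(__ATOMIC_RELEASE);
}

static inline void
seqlock_ticket_write_unlock(seqlock_ticket_t *seqlock)
{
	uint32_t sn = seqlock->sn + 1;	/* becomes even: write completed */

	/* Release: data stores must be visible before the even sn store. */
	__atomic_store_n(&seqlock->sn, sn, __ATOMIC_RELEASE);

	rte_ticketlock_unlock(&seqlock->lock);
}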
Re: [PATCH v3] eal: add seqlock
On 2022-04-02 20:15, Ola Liljedahl wrote: On 4/1/22 17:07, Mattias Rönnblom wrote: + +/** + * End a read-side critical section. + * + * A call to this function marks the end of a read-side critical + * section, for @p seqlock. The application must supply the sequence + * number produced by the corresponding rte_seqlock_read_lock() (or, + * in case of a retry, the rte_seqlock_read_tryunlock()) call. + * + * After this function has been called, the caller should not access + * the protected data. + * + * In case this function returns true, the just-read data was + * consistent and the set of atomic and non-atomic load operations + * performed between rte_seqlock_read_lock() and + * rte_seqlock_read_tryunlock() were atomic, as a whole. + * + * In case rte_seqlock_read_tryunlock() returns false, the data was + * modified as it was being read and may be inconsistent, and thus + * should be discarded. The @p begin_sn is updated with the + * now-current sequence number. + * + * @param seqlock + * A pointer to the seqlock. + * @param begin_sn + * The seqlock sequence number returned by + * rte_seqlock_read_lock() (potentially updated in subsequent + * rte_seqlock_read_tryunlock() calls) for this critical section. + * @return + * true or false, if the just-read seqlock-protected data was consistent + * or inconsistent, respectively, at the time it was read. + * + * @see rte_seqlock_read_lock() + */ +__rte_experimental +static inline bool +rte_seqlock_read_tryunlock(const rte_seqlock_t *seqlock, uint32_t *begin_sn) +{ + uint32_t end_sn; + + /* make sure the data loads happen before the sn load */ + rte_atomic_thread_fence(__ATOMIC_ACQUIRE); + + end_sn = __atomic_load_n(&seqlock->sn, __ATOMIC_RELAXED); Since we are reading and potentially returning the sequence number here (repeating the read of the protected data), we need to use load-acquire. I assume it is not expected that the user will call rte_seqlock_read_lock() again. Good point. Seeing this implementation, I might actually prefer the original implementation, I think it is cleaner. Me too. But I would like for the begin function also to wait for an even sequence number, the end function would only have to check for the same sequence number, this might improve performance a little bit as readers won't perform one or several broken reads while a write is in progress. The function names are a different thing though. The odd sn should be a rare case, if the seqlock is used for relatively low frequency update scenarios, which is what I think it should be designed for. Waiting for an even sn in read_begin() would exclude the option for the caller to defer reading the new data to some later time, in case it's being written. That in turn would force even a single writer to make sure its thread is not preempted, or risk blocking all worker lcores attempting to read the protected data. You could complete the above API with a read_trybegin() function to address that issue, for those who care, but that would force some extra complexity on the user. The writer side behaves much more like a lock with mutual exclusion so write_lock/write_unlock makes sense. + + if (unlikely(end_sn & 1 || *begin_sn != end_sn)) { + *begin_sn = end_sn; + return false; + } + + return true; +} +
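For clarity, the load-acquire change agreed on above amounts to something like the sketch below (not the actual follow-up revision): the end-of-section sn load becomes a load-acquire, since the returned sn may be reused as the begin sn of a retried read-side critical section without another rte_seqlock_read_lock() call.

#include <stdbool.h>
#include <stdint.h>

#include <rte_atomic.h>
#include <rte_branch_prediction.h>
#include <rte_seqlock.h>

static inline bool
rte_seqlock_read_tryunlock(const rte_seqlock_t *seqlock, uint32_t *begin_sn)
{
	uint32_t end_sn;

	/* make sure the data loads happen before the sn load */
	rte_atomic_thread_fence(__ATOMIC_ACQUIRE);

	/* Load-acquire, since end_sn may become the begin sn of a retry. */
	end_sn = __atomic_load_n(&seqlock->sn, __ATOMIC_ACQUIRE);

	if (unlikely(end_sn & 1 || *begin_sn != end_sn)) {
		*begin_sn = end_sn;
		return false;
	}

	return true;
}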