On Mon, May 11, 2026 at 05:08:50PM +0200, Maxime Peim wrote:
> In eSwitch mode, the async (template) flow creation path automatically
> prepends implicit match items to scope flow rules to the correct
> representor port:
> - Ingress: REPRESENTED_PORT item matching dev->data->port_id
> - Egress: REG_C_0 TAG item matching the port's tx tag value
>
> The sync path (flow_hw_list_create) was missing this logic, causing all
> flow rules created via the non-template API to match traffic from all
> ports rather than being scoped to the specific representor.
>
> Add the same implicit item prepending to flow_hw_list_create, right
> after pattern validation and before any branching (sample/RSS/single/
> prefix), mirroring the behavior of flow_hw_pattern_template_create
> and flow_hw_get_rule_items. The ingress case prepends
> REPRESENTED_PORT with the current port_id; the egress case prepends
> MLX5_RTE_FLOW_ITEM_TYPE_TAG with REG_C_0 value/mask (skipped when
> user provides an explicit SQ item).
>
> Also fix a pre-existing bug where 'return split' on metadata split
> failure returned a negative int cast to uintptr_t, which callers
> would treat as a valid flow handle instead of an error.
>
> Fixes: e38776c36c8a ("net/mlx5: introduce HWS for non-template flow API")
> Fixes: 821a6a5cc495 ("net/mlx5: add metadata split for compatibility")
> Signed-off-by: Maxime Peim <[email protected]>
> ---
> v3:
> - Factor the implicit-item prepend logic out of
> flow_hw_pattern_template_create() into a new helper
> flow_hw_adjust_pattern() and reuse it from flow_hw_list_create(),
> instead of duplicating the prepend logic inline in the sync path.
> - Zero-initialize item_flags in both callers. The validator is
> read-modify-write on item_flags (reads MLX5_FLOW_LAYER_TUNNEL on
> the first iteration), so leaving it uninitialized was UB.
> - Call __flow_hw_pattern_validate() with nt_flow=true from the sync
> path (was effectively nt_flow=false via the wrapper), restoring the
> previous behavior that skips GENEVE_OPT TLV parser validation on
> the non-template path.
> - Document flow_hw_adjust_pattern(): the dual role of the nt_flow
> parameter (template spec-left-zero vs. sync spec-filled + validator
> flag), the three-way return, and the caller's ownership of
> *copied_items across every exit path.
> - Clarify the "omitting implicit REG_C_0 match" debug log now that
> the helper runs on both the template and sync paths.
> - Add Fixes: tags for the two original commits.
>
> v4:
> - Fix items in case splitted metadata are not needed.
>
> drivers/net/mlx5/mlx5_flow_hw.c | 194 ++++++++++++++++++++++----------
> 1 file changed, 132 insertions(+), 62 deletions(-)
>
> diff --git a/drivers/net/mlx5/mlx5_flow_hw.c b/drivers/net/mlx5/mlx5_flow_hw.c
> index bca5b2769e..6b3fcb43a7 100644
> --- a/drivers/net/mlx5/mlx5_flow_hw.c
> +++ b/drivers/net/mlx5/mlx5_flow_hw.c
> @@ -9255,33 +9255,40 @@ pattern_template_validate(struct rte_eth_dev *dev,
> return -ret;
> }
>
> -/**
> - * Create flow item template.
> +/*
> + * Validate the user-supplied items and, in eSwitch mode, prepend the
> implicit
> + * scoping item so the rule/template is bound to the current representor
> port:
> + * - ingress -> RTE_FLOW_ITEM_TYPE_REPRESENTED_PORT (dev->data->port_id)
> + * - egress -> MLX5_RTE_FLOW_ITEM_TYPE_TAG on REG_C_0 (tx vport tag),
> + * skipped when the user already supplied an SQ item.
> *
> - * @param[in] dev
> - * Pointer to the rte_eth_dev structure.
> - * @param[in] attr
> - * Pointer to the item template attributes.
> - * @param[in] items
> - * The template item pattern.
> - * @param[out] error
> - * Pointer to error structure.
> + * @param nt_flow
> + * Selects between the two call paths that share this helper:
> + * false -> pattern template creation (async API). The prepended item's
> + * spec is left zeroed so mlx5dr matches any value; the live
> + * port_id / tx-tag value is substituted later by
> + * flow_hw_get_rule_items() at rule-create time.
> + * true -> sync (non-template) flow creation. The prepended item's spec
> + * is filled immediately with the live values, and the flag is
> + * forwarded to __flow_hw_pattern_validate() so that validation
> + * paths gated on nt_flow (e.g. GENEVE_OPT TLV parser creation)
> + * take the non-template branch.
> *
> - * @return
> - * Item template pointer on success, NULL otherwise and rte_errno is set.
> + * Return / ownership:
> + * - NULL on validation or allocation failure (error populated).
> + * - `items` unchanged when no prepending is required; *copied_items ==
> NULL.
> + * - A newly-allocated array otherwise; also stored in *copied_items. The
> + * caller must mlx5_free(*copied_items) on every path (it is safe to call
> + * with NULL). Do not free the returned pointer directly.
> */
> -static struct rte_flow_pattern_template *
> -flow_hw_pattern_template_create(struct rte_eth_dev *dev,
> - const struct rte_flow_pattern_template_attr *attr,
> - const struct rte_flow_item items[],
> - bool external,
> - struct rte_flow_error *error)
> +static const struct rte_flow_item *
> +flow_hw_adjust_pattern(struct rte_eth_dev *dev, const struct
> rte_flow_pattern_template_attr *attr,
> + bool nt_flow, const struct rte_flow_item *items,
> uint64_t *item_flags,
> + uint64_t *nb_items, struct rte_flow_item **copied_items,
> + struct rte_flow_error *error)
It seems that there's an issue with dangling pointer in
flow_hw_adjust_pattern():
- flow_hw_adjust_pattern() defines TAG/REPRESENTED_PORT item spec/mask on
its local stack frame,
- If prepending is needed, then flow_hw_prepend_item() will copy only
the rte_flow_item struct. spec/mask is not copied.
- After flow_hw_adjust_pattern() finishes,
"copied_items" will have an item with spec/mask pointing to invalid stack
data.
This wasn't previously a problem in flow_hw_pattern_template_create(),
because items' spec/mask were only needed during the duration of that
function.
Could you please fix the above? It'll require extending amount of memory
allocated by flow_hw_prepend_item(), ideally everything should be done
in single allocation, so freeing copied_items would still require
single call to mlx5_free().
ASAN, with detect_stack_use_after_return enabled, report is below.
# repro steps
meson setup build -Dbuildtype=debug -Db_sanitize=address
-Denable_drivers=bus/auxiliary,common/mlx5,net/mlx5 -Denable_apps=test-pmd
meson compile -C build -j $(nproc --ignore 1)
env ASAN_OPTIONS=detect_stack_use_after_return=1
./build/app/dpdk-testpmd -a 08:00.0,dv_flow_en=2,representor=vf0-1 --
--flow-isolate-all -i
testpmd> flow create 0 group 0 priority 0 ingress pattern end actions
jump group 1 / end
# ASAN report
=================================================================
==59105==ERROR: AddressSanitizer: stack-use-after-return on address
0x71e744675730 at pc 0x5d70bed42c6d bp 0x7fff52519b40 sp 0x7fff52519b30
READ of size 2 at 0x71e744675730 thread T0
#0 0x5d70bed42c6c in flow_dv_translate_item_represented_port
../drivers/net/mlx5/mlx5_flow_dv.c:11059
#1 0x5d70bed5c9b1 in flow_dv_translate_items
../drivers/net/mlx5/mlx5_flow_dv.c:14301
#2 0x5d70bed60620 in flow_dv_translate_items_nta
../drivers/net/mlx5/mlx5_flow_dv.c:14646
#3 0x5d70bed60fb5 in mlx5_flow_dv_translate_items_hws_impl
../drivers/net/mlx5/mlx5_flow_dv.c:14714
#4 0x5d70c1288ea2 in mlx5_flow_hw_create_flow
../drivers/net/mlx5/mlx5_flow_hw.c:14134
#5 0x5d70c128e45c in flow_hw_list_create
../drivers/net/mlx5/mlx5_flow_hw.c:14394
#6 0x5d70beca71b9 in mlx5_flow_list_create
../drivers/net/mlx5/mlx5_flow.c:8068
#7 0x5d70beca6e33 in mlx5_flow_create
../drivers/net/mlx5/mlx5_flow.c:8044
#8 0x5d70be95670e in rte_flow_create ../lib/ethdev/rte_flow.c:426
#9 0x5d70bdcaf19a in port_flow_create ../app/test-pmd/config.c:3869
#10 0x5d70bdc3983d in cmd_flow_parsed
../app/test-pmd/cmdline_flow.c:13564
#11 0x5d70bdc3a376 in cmd_flow_cb
../app/test-pmd/cmdline_flow.c:13634
#12 0x5d70be8ad00a in __cmdline_parse
../lib/cmdline/cmdline_parse.c:296
#13 0x5d70be8ad0b8 in cmdline_parse
../lib/cmdline/cmdline_parse.c:305
#14 0x5d70be8a87ef in cmdline_valid_buffer
../lib/cmdline/cmdline.c:25
#15 0x5d70be8b405e in rdline_char_in
../lib/cmdline/cmdline_rdline.c:470
#16 0x5d70be8a9097 in cmdline_in ../lib/cmdline/cmdline.c:154
#17 0x5d70be8a932f in cmdline_interact ../lib/cmdline/cmdline.c:202
#18 0x5d70bdc12af6 in prompt ../app/test-pmd/cmdline.c:14586
#19 0x5d70bdd5d636 in main ../app/test-pmd/testpmd.c:4792
#20 0x71e746c2a1c9 in __libc_start_call_main
../sysdeps/nptl/libc_start_call_main.h:58
#21 0x71e746c2a28a in __libc_start_main_impl ../csu/libc-start.c:360
#22 0x5d70bdbc0944 in _start
(/auto/swgwork9/dsosnowski/code/nvidia/dpdk/build/app/dpdk-testpmd+0x358944)
(BuildId: 04ff6d26a7ff0a65637cb32f087bae007e00c965)
Address 0x71e744675730 is located in stack of thread T0 at offset 48 in
frame
#0 0x5d70c115c601 in flow_hw_adjust_pattern
../drivers/net/mlx5/mlx5_flow_hw.c:9289
This frame has 5 object(s):
[48, 50) 'port_spec' (line 9291) <== Memory access at offset 48 is
inside this variable
[64, 72) 'tag_v' (line 9296)
[96, 104) 'tag_m' (line 9300)
[128, 160) 'port' (line 9292)
[192, 224) 'tag' (line 9304)
HINT: this may be a false positive if your program uses some custom
stack unwind mechanism, swapcontext or vfork
(longjmp and C++ exceptions *are* supported)
SUMMARY: AddressSanitizer: stack-use-after-return
../drivers/net/mlx5/mlx5_flow_dv.c:11059 in
flow_dv_translate_item_represented_port
Shadow bytes around the buggy address:
0x71e744675480: f5 f5 f5 f5 00 00 00 00 00 00 00 00 00 00 00 00
0x71e744675500: f5 f5 f5 f5 f5 f5 f5 f5 f5 f5 f5 f5 f5 f5 f5 f5
0x71e744675580: f5 f5 f5 f5 f5 f5 f5 f5 00 00 00 00 00 00 00 00
0x71e744675600: f1 f1 f1 f1 f1 f1 02 f2 00 f2 f2 f2 00 f2 f2 f2
0x71e744675680: 00 f2 f2 f2 00 f3 f3 f3 00 00 00 00 00 00 00 00
=>0x71e744675700: f5 f5 f5 f5 f5 f5[f5]f5 f5 f5 f5 f5 f5 f5 f5 f5
0x71e744675780: f5 f5 f5 f5 f5 f5 f5 f5 f5 f5 f5 f5 f5 f5 f5 f5
0x71e744675800: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
0x71e744675880: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
0x71e744675900: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
0x71e744675980: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
Shadow byte legend (one shadow byte represents 8 application bytes):
Addressable: 00
Partially addressable: 01 02 03 04 05 06 07
Heap left redzone: fa
Freed heap region: fd
Stack left redzone: f1
Stack mid redzone: f2
Stack right redzone: f3
Stack after return: f5
Stack use after scope: f8
Global redzone: f9
Global init order: f6
Poisoned by user: f7
Container overflow: fc
Array cookie: ac
Intra object redzone: bb
ASan internal: fe
Left alloca redzone: ca
Right alloca redzone: cb
==59105==ABORTING
> {
> struct mlx5_priv *priv = dev->data->dev_private;
> - struct rte_flow_pattern_template *it;
> - struct rte_flow_item *copied_items = NULL;
> - const struct rte_flow_item *tmpl_items;
> - uint64_t orig_item_nb, item_flags = 0;
> + struct rte_flow_item_ethdev port_spec = {.port_id = dev->data->port_id};
> struct rte_flow_item port = {
> .type = RTE_FLOW_ITEM_TYPE_REPRESENTED_PORT,
> .mask = &rte_flow_item_ethdev_mask,
> @@ -9298,39 +9305,89 @@ flow_hw_pattern_template_create(struct rte_eth_dev
> *dev,
> .type = (enum rte_flow_item_type)MLX5_RTE_FLOW_ITEM_TYPE_TAG,
> .spec = &tag_v,
> .mask = &tag_m,
> - .last = NULL
> + .last = NULL,
> };
> - int it_items_size;
> - unsigned int i = 0;
> int rc;
>
> + if (!copied_items || !item_flags || !nb_items)
> + return NULL;
> +
> + if (nt_flow) {
> + port.spec = &port_spec;
> + tag_v.data = flow_hw_tx_tag_regc_value(dev);
> + }
> +
> + /*
> + * item_flags must be zero-initialized: __flow_hw_pattern_validate()
> + * OR-accumulates bits into it and reads it (MLX5_FLOW_LAYER_TUNNEL)
> + * on the first iteration.
> + */
> + *item_flags = 0;
> +
> /* Validate application items only */
> - rc = flow_hw_pattern_validate(dev, attr, items, &item_flags, error);
> + rc = __flow_hw_pattern_validate(dev, attr, items, item_flags, nt_flow,
> error);
> if (rc < 0)
> return NULL;
> - orig_item_nb = rc;
> - if (priv->sh->config.dv_esw_en &&
> - attr->ingress && !attr->egress && !attr->transfer) {
> - copied_items = flow_hw_prepend_item(items, orig_item_nb, &port,
> error);
> - if (!copied_items)
> + *nb_items = rc;
> +
> + if (priv->sh->config.dv_esw_en && attr->ingress && !attr->egress &&
> !attr->transfer) {
> + *copied_items = flow_hw_prepend_item(items, *nb_items, &port,
> error);
> + if (!*copied_items)
> return NULL;
> - tmpl_items = copied_items;
> - } else if (priv->sh->config.dv_esw_en &&
> - !attr->ingress && attr->egress && !attr->transfer) {
> - if (item_flags & MLX5_FLOW_ITEM_SQ) {
> - DRV_LOG(DEBUG, "Port %u omitting implicit REG_C_0 match
> for egress "
> - "pattern template", dev->data->port_id);
> - tmpl_items = items;
> - goto setup_pattern_template;
> + return *copied_items;
> + } else if (priv->sh->config.dv_esw_en && !attr->ingress && attr->egress
> &&
> + !attr->transfer) {
> + if (*item_flags & MLX5_FLOW_ITEM_SQ) {
> + DRV_LOG(DEBUG,
> + "Port %u: explicit SQ item present, omitting
> implicit "
> + "REG_C_0 match for egress pattern",
> + dev->data->port_id);
> + return items;
> }
> - copied_items = flow_hw_prepend_item(items, orig_item_nb, &tag,
> error);
> - if (!copied_items)
> + *copied_items = flow_hw_prepend_item(items, *nb_items, &tag,
> error);
> + if (!*copied_items)
> return NULL;
> - tmpl_items = copied_items;
> - } else {
> - tmpl_items = items;
> + return *copied_items;
> }
> -setup_pattern_template:
> + return items;
> +}
> +
> +/**
> + * Create flow item template.
> + *
> + * @param[in] dev
> + * Pointer to the rte_eth_dev structure.
> + * @param[in] attr
> + * Pointer to the item template attributes.
> + * @param[in] items
> + * The template item pattern.
> + * @param[out] error
> + * Pointer to error structure.
> + *
> + * @return
> + * Item template pointer on success, NULL otherwise and rte_errno is set.
> + */
> +static struct rte_flow_pattern_template *
> +flow_hw_pattern_template_create(struct rte_eth_dev *dev,
> + const struct rte_flow_pattern_template_attr *attr,
> + const struct rte_flow_item items[],
> + bool external,
> + struct rte_flow_error *error)
> +{
> + struct mlx5_priv *priv = dev->data->dev_private;
> + struct rte_flow_pattern_template *it;
> + struct rte_flow_item *copied_items = NULL;
> + const struct rte_flow_item *tmpl_items;
> + int it_items_size;
> + uint64_t orig_item_nb, item_flags;
> + unsigned int i = 0;
> + int rc;
> +
> + tmpl_items = flow_hw_adjust_pattern(dev, attr, false, items,
> &orig_item_nb, &item_flags,
> + &copied_items, error);
This is something I missed on v3, sorry about that.
flow_hw_adjust_pattern() takes:
- "item_flags" as 5th argument,
- "nb_items" as 6th argument.
But both call sites of flow_hw_adjust_pattern() pass them
in reverse order.
Please rearrange the call sites.
> + if (!tmpl_items)
> + return NULL;
> +
> it = mlx5_malloc(MLX5_MEM_ZERO, sizeof(*it), 0, SOCKET_ID_ANY);
> if (!it) {
> rte_flow_error_set(error, ENOMEM,
> @@ -14272,7 +14329,6 @@ static uintptr_t flow_hw_list_create(struct
> rte_eth_dev *dev,
> struct rte_flow_hw *prfx_flow = NULL;
> const struct rte_flow_action *qrss = NULL;
> const struct rte_flow_action *mark = NULL;
> - uint64_t item_flags = 0;
> uint64_t action_flags = mlx5_flow_hw_action_flags_get(actions, &qrss,
> &mark,
> &encap_idx,
> &actions_n, error);
> struct mlx5_flow_hw_split_resource resource = {
> @@ -14289,20 +14345,27 @@ static uintptr_t flow_hw_list_create(struct
> rte_eth_dev *dev,
> .egress = attr->egress,
> .transfer = attr->transfer,
> };
> -
> - /* Validate application items only */
> - ret = __flow_hw_pattern_validate(dev, &pattern_template_attr, items,
> - &item_flags, true, error);
> - if (ret < 0)
> - return 0;
> + struct rte_flow_item *copied_items = NULL;
> + const struct rte_flow_item *prepend_items;
> + uint64_t orig_item_nb, item_flags;
>
> RTE_SET_USED(encap_idx);
> if (!error)
> error = &shadow_error;
> +
> + prepend_items = flow_hw_adjust_pattern(dev, &pattern_template_attr,
> true, items,
> + &orig_item_nb, &item_flags,
> &copied_items, error);
> + if (!prepend_items)
> + return 0;
> +
> split = mlx5_flow_nta_split_metadata(dev, attr, actions, qrss,
> action_flags,
> actions_n, external, &resource,
> error);
> - if (split < 0)
> - return split;
> + if (split < 0) {
> + mlx5_free(copied_items);
> + return 0;
> + } else if (!split) {
> + resource.suffix.items = prepend_items;
> + }
>
> /* Update the metadata copy table - MLX5_FLOW_MREG_CP_TABLE_GROUP */
> if (((attr->ingress && attr->group != MLX5_FLOW_MREG_CP_TABLE_GROUP) ||
> @@ -14313,23 +14376,26 @@ static uintptr_t flow_hw_list_create(struct
> rte_eth_dev *dev,
> goto free;
> }
> if (action_flags & MLX5_FLOW_ACTION_SAMPLE) {
> - flow = mlx5_nta_sample_flow_list_create(dev, type, attr, items,
> actions,
> + flow = mlx5_nta_sample_flow_list_create(dev, type, attr,
> prepend_items, actions,
> item_flags,
> action_flags, error);
> - if (flow != NULL)
> + if (flow != NULL) {
> + mlx5_free(copied_items);
> return (uintptr_t)flow;
> + }
> goto free;
> }
> if (action_flags & MLX5_FLOW_ACTION_RSS) {
> const struct rte_flow_action_rss
> *rss_conf = mlx5_flow_nta_locate_rss(dev, actions,
> error);
> - flow = mlx5_flow_nta_handle_rss(dev, attr, items, actions,
> rss_conf,
> - item_flags, action_flags,
> external,
> - type, error);
> + flow = mlx5_flow_nta_handle_rss(dev, attr, prepend_items,
> actions, rss_conf,
> + item_flags, action_flags,
> external, type, error);
> if (flow) {
> flow->nt2hws->rix_mreg_copy = cpy_idx;
> cpy_idx = 0;
> - if (!split)
> + if (!split) {
> + mlx5_free(copied_items);
> return (uintptr_t)flow;
> + }
> goto prefix_flow;
> }
> goto free;
> @@ -14343,12 +14409,14 @@ static uintptr_t flow_hw_list_create(struct
> rte_eth_dev *dev,
> if (flow) {
> flow->nt2hws->rix_mreg_copy = cpy_idx;
> cpy_idx = 0;
> - if (!split)
> + if (!split) {
> + mlx5_free(copied_items);
> return (uintptr_t)flow;
> + }
> /* Fall Through to prefix flow creation. */
> }
> prefix_flow:
> - ret = mlx5_flow_hw_create_flow(dev, type, attr, items,
> resource.prefix.actions,
> + ret = mlx5_flow_hw_create_flow(dev, type, attr, prepend_items,
> resource.prefix.actions,
> item_flags, action_flags, external,
> &prfx_flow, error);
> if (ret)
> goto free;
> @@ -14357,6 +14425,7 @@ static uintptr_t flow_hw_list_create(struct
> rte_eth_dev *dev,
> flow->nt2hws->chaned_flow = 1;
> SLIST_INSERT_AFTER(prfx_flow, flow, nt2hws->next);
> mlx5_flow_nta_split_resource_free(dev, &resource);
> + mlx5_free(copied_items);
> return (uintptr_t)prfx_flow;
> }
> free:
> @@ -14368,6 +14437,7 @@ static uintptr_t flow_hw_list_create(struct
> rte_eth_dev *dev,
> mlx5_flow_nta_del_copy_action(dev, cpy_idx);
> if (split > 0)
> mlx5_flow_nta_split_resource_free(dev, &resource);
> + mlx5_free(copied_items);
> return 0;
> }
>