[PATCH v2] samples: bpf: fix style in bpf_load
This commit fixes style problem in samples/bpf/bpf_load.c Styles that have been changed are: - Magic string use of 'DEBUGFS' - Useless zero initialization of a global variable - Minor style fix with whitespace Signed-off-by: Daniel T. Lee --- Changes in v2: - Fix string concatenation from build-time to compile-time. samples/bpf/bpf_load.c | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c index eae7b635343d..1734ade04f7f 100644 --- a/samples/bpf/bpf_load.c +++ b/samples/bpf/bpf_load.c @@ -40,7 +40,7 @@ int prog_cnt; int prog_array_fd = -1; struct bpf_map_data map_data[MAX_MAPS]; -int map_data_count = 0; +int map_data_count; static int populate_prog_array(const char *event, int prog_fd) { @@ -65,7 +65,7 @@ static int write_kprobe_events(const char *val) else flags = O_WRONLY | O_APPEND; - fd = open("/sys/kernel/debug/tracing/kprobe_events", flags); + fd = open(DEBUGFS "kprobe_events", flags); ret = write(fd, val, strlen(val)); close(fd); @@ -490,8 +490,8 @@ static int load_elf_maps_section(struct bpf_map_data *maps, int maps_shndx, /* Verify no newer features were requested */ if (validate_zero) { - addr = (unsigned char*) def + map_sz_copy; - end = (unsigned char*) def + map_sz_elf; + addr = (unsigned char *) def + map_sz_copy; + end = (unsigned char *) def + map_sz_elf; for (; addr < end; addr++) { if (*addr != 0) { free(sym); -- 2.17.1
[RESEND][PATCH] Fix MACsec kernel panics, oopses and bugs
MACsec causes oopses followed by a kernel panic when attached directly or indirectly to a bridge. It causes erroneous checksum messages when attached to vxlan. When I did investigate I did find skb leaks, apparent skb mis-handling and superfluous code. The attached patch fixes all MACsec misbehaviour I could find. As I am no kernel developer somebody with sufficient kernel network knowledge should verify and correct the patch where necessary. Signed-off-by: Andreas Steinmetz --- linux.orig/drivers/net/macsec.c 2019-05-17 11:00:13.631121950 +0200 +++ linux/drivers/net/macsec.c 2019-05-17 18:41:41.333119772 +0200 @@ -911,6 +911,9 @@ static void macsec_decrypt_done(struct c macsec_extra_len(macsec_skb_cb(skb)->has_sci)); macsec_reset_skb(skb, macsec->secy.netdev); + /* FIXME: any better way to prevent calls to netdev_rx_csum_fault? */ + skb->csum_complete_sw = 1; + len = skb->len; if (gro_cells_receive(&macsec->gro_cells, skb) == NET_RX_SUCCESS) count_rx(dev, len); @@ -938,9 +941,6 @@ static struct sk_buff *macsec_decrypt(st u16 icv_len = secy->icv_len; macsec_skb_cb(skb)->valid = false; - skb = skb_share_check(skb, GFP_ATOMIC); - if (!skb) - return ERR_PTR(-ENOMEM); ret = skb_cow_data(skb, 0, &trailer); if (unlikely(ret < 0)) { @@ -972,11 +972,6 @@ static struct sk_buff *macsec_decrypt(st aead_request_set_crypt(req, sg, sg, len, iv); aead_request_set_ad(req, macsec_hdr_len(macsec_skb_cb(skb)->has_sci)); - skb = skb_unshare(skb, GFP_ATOMIC); - if (!skb) { - aead_request_free(req); - return ERR_PTR(-ENOMEM); - } } else { /* integrity only: all headers + data authenticated */ aead_request_set_crypt(req, sg, sg, icv_len, iv); @@ -1102,20 +1097,12 @@ static rx_handler_result_t macsec_handle return RX_HANDLER_PASS; } - skb = skb_unshare(skb, GFP_ATOMIC); - if (!skb) { - *pskb = NULL; - return RX_HANDLER_CONSUMED; - } - pulled_sci = pskb_may_pull(skb, macsec_extra_len(true)); if (!pulled_sci) { if (!pskb_may_pull(skb, macsec_extra_len(false))) goto drop_direct; } - hdr = macsec_ethhdr(skb); - /* Frames with a SecTAG that has the TCI E bit set but the C * bit clear are discarded, as this reserved encoding is used * to identify frames with a SecTAG that are not to be @@ -1130,6 +1117,12 @@ static rx_handler_result_t macsec_handle goto drop_direct; } + skb = skb_unshare(skb, GFP_ATOMIC); + if (!skb) + return RX_HANDLER_CONSUMED; + + hdr = macsec_ethhdr(skb); + /* ethernet header is part of crypto processing */ skb_push(skb, ETH_HLEN); @@ -1213,22 +1206,22 @@ static rx_handler_result_t macsec_handle /* Disabled && !changed text => skip validation */ if (hdr->tci_an & MACSEC_TCI_C || - secy->validate_frames != MACSEC_VALIDATE_DISABLED) + secy->validate_frames != MACSEC_VALIDATE_DISABLED) { skb = macsec_decrypt(skb, dev, rx_sa, sci, secy); - if (IS_ERR(skb)) { - /* the decrypt callback needs the reference */ - if (PTR_ERR(skb) != -EINPROGRESS) { - macsec_rxsa_put(rx_sa); - macsec_rxsc_put(rx_sc); + if (IS_ERR(skb)) { + /* the decrypt callback needs the reference */ + if (PTR_ERR(skb) != -EINPROGRESS) { + macsec_rxsa_put(rx_sa); + macsec_rxsc_put(rx_sc); + } + rcu_read_unlock(); + return RX_HANDLER_CONSUMED; } - rcu_read_unlock(); - *pskb = NULL; - return RX_HANDLER_CONSUMED; - } - if (!macsec_post_decrypt(skb, secy, pn)) - goto drop; + if (!macsec_post_decrypt(skb, secy, pn)) + goto drop; + } deliver: macsec_finalize_skb(skb, secy->icv_len, @@ -1239,6 +1232,9 @@ deliver: macsec_rxsa_put(rx_sa); macsec_rxsc_put(rx_sc); + /* FIXME: any better way to prevent calls to netdev_rx_csum_fault? */ + skb->csum_complete_sw = 1; + ret = gro_cells_receive(&macsec->gro_cells, skb); if (ret == NET_RX_SUCCESS) count_rx(dev, skb->len); @@ -1247,7 +1243,6 @@ deliver: rcu_read_unlock(); - *pskb = NULL; return RX_HANDLER_CONSUMED; drop: @@ -1257,7 +1252,6 @@ drop_nosa: rcu_read_unlock(); drop_direct: kfree_skb(skb); -
Re: [PATCH bpf] selftests: bpf: add zero extend checks for ALU32 and/or/xor
On Thu, 23 May 2019 at 08:39, Y Song wrote: > > On Wed, May 22, 2019 at 1:46 PM Björn Töpel wrote: > > > > On Wed, 22 May 2019 at 20:13, Y Song wrote: > > > > > > On Wed, May 22, 2019 at 2:25 AM Björn Töpel wrote: > > > > > > > > Add three tests to test_verifier/basic_instr that make sure that the > > > > high 32-bits of the destination register is cleared after an ALU32 > > > > and/or/xor. > > > > > > > > Signed-off-by: Björn Töpel > > > > > > I think the patch intends for bpf-next, right? The patch itself looks > > > good to me. > > > Acked-by: Yonghong Song > > > > > > > Thank you. Actually, it was intended for the bpf tree, as a test > > follow up for this [1] fix. > Then maybe you want to add a Fixes tag and resubmit? Hmm, I thought that adding tests were OK for non-next. Should the Fixes: tag for the test reflex the corresponding fixed code (in this case the RV JIT)? > > > > > > Cheers, > > Björn > > > > [1] > > https://lore.kernel.org/bpf/caj+hfnifkxkz8df7glbuqwa6+t6awrrrk6ow6m1nayetjd+...@mail.gmail.com/ > > > > > > --- > > > > .../selftests/bpf/verifier/basic_instr.c | 39 +++ > > > > 1 file changed, 39 insertions(+) > > > > > > > > diff --git a/tools/testing/selftests/bpf/verifier/basic_instr.c > > > > b/tools/testing/selftests/bpf/verifier/basic_instr.c > > > > index ed91a7b9a456..4d844089938e 100644 > > > > --- a/tools/testing/selftests/bpf/verifier/basic_instr.c > > > > +++ b/tools/testing/selftests/bpf/verifier/basic_instr.c > > > > @@ -132,3 +132,42 @@ > > > > .prog_type = BPF_PROG_TYPE_SCHED_CLS, > > > > .result = ACCEPT, > > > > }, > > > > +{ > > > > + "and32 reg zero extend check", > > > > + .insns = { > > > > + BPF_MOV64_IMM(BPF_REG_0, -1), > > > > + BPF_MOV64_IMM(BPF_REG_2, -2), > > > > + BPF_ALU32_REG(BPF_AND, BPF_REG_0, BPF_REG_2), > > > > + BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, 32), > > > > + BPF_EXIT_INSN(), > > > > + }, > > > > + .prog_type = BPF_PROG_TYPE_SCHED_CLS, > > > > + .result = ACCEPT, > > > > + .retval = 0, > > > > +}, > > > > +{ > > > > + "or32 reg zero extend check", > > > > + .insns = { > > > > + BPF_MOV64_IMM(BPF_REG_0, -1), > > > > + BPF_MOV64_IMM(BPF_REG_2, -2), > > > > + BPF_ALU32_REG(BPF_OR, BPF_REG_0, BPF_REG_2), > > > > + BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, 32), > > > > + BPF_EXIT_INSN(), > > > > + }, > > > > + .prog_type = BPF_PROG_TYPE_SCHED_CLS, > > > > + .result = ACCEPT, > > > > + .retval = 0, > > > > +}, > > > > +{ > > > > + "xor32 reg zero extend check", > > > > + .insns = { > > > > + BPF_MOV64_IMM(BPF_REG_0, -1), > > > > + BPF_MOV64_IMM(BPF_REG_2, 0), > > > > + BPF_ALU32_REG(BPF_XOR, BPF_REG_0, BPF_REG_2), > > > > + BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, 32), > > > > + BPF_EXIT_INSN(), > > > > + }, > > > > + .prog_type = BPF_PROG_TYPE_SCHED_CLS, > > > > + .result = ACCEPT, > > > > + .retval = 0, > > > > +}, > > > > -- > > > > 2.20.1 > > > >
Re: [PATCH net-next] selftests: pmtu: Simplify cleanup and namespace names
Hi David, On Wed, 22 May 2019 12:11:06 -0700 David Ahern wrote: > From: David Ahern > > The point of the pause-on-fail argument is to leave the setup as is after > a test fails to allow a user to debug why it failed. Move the cleanup > after posting the result to the user to make it so. > > Random names for the namespaces are not user friendly when trying to > debug a failure. Make them simpler and more direct for the tests. Run > cleanup at the beginning to ensure they are cleaned up if they already > exist. The reasons for picking per-instance unique names were: - one can run multiple instances of the script in parallel. I couldn't trigger any bug this way *so far*, though - cleanup might fail because of e.g. device reference count leaks (this happened quite frequently in the past), which are anyway visible in kernel logs. Unique names avoid the need to reboot Sure, it's a trade-off with usability, and I also see the value of having fixed names, so I'm fine with this too. I just wanted to make sure you considered these points. By the way, the comment to nsname() (that I would keep, it's still somewhat convenient) is now inconsistent. -- Stefano
Re: [PATCH net,stable 1/1] net: fec: fix the clk mismatch in failed_reset path
Hi Andy, On Thu, May 23, 2019 at 01:55:28AM +, Andy Duan wrote: > Fix the clk mismatch in the error path "failed_reset" because > below error path will disable clk_ahb and clk_ipg directly, it > should use pm_runtime_put_noidle() instead of pm_runtime_put() > to avoid to call runtime resume callback. > > Reported-by: Baruch Siach > Signed-off-by: Fugang Duan Tested-by: Baruch Siach Tested on SolidRun Hummingboard Pulse. Thanks. But please avoid sending patched in base64 encoded emails. Plaintext is much easier when dealing with 'git am'. baruch > --- > drivers/net/ethernet/freescale/fec_main.c | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/drivers/net/ethernet/freescale/fec_main.c > b/drivers/net/ethernet/freescale/fec_main.c > index f63eb2b..848defa 100644 > --- a/drivers/net/ethernet/freescale/fec_main.c > +++ b/drivers/net/ethernet/freescale/fec_main.c > @@ -3555,7 +3555,7 @@ fec_probe(struct platform_device *pdev) > if (fep->reg_phy) > regulator_disable(fep->reg_phy); > failed_reset: > - pm_runtime_put(&pdev->dev); > + pm_runtime_put_noidle(&pdev->dev); > pm_runtime_disable(&pdev->dev); > failed_regulator: > clk_disable_unprepare(fep->clk_ahb); -- http://baruch.siach.name/blog/ ~. .~ Tk Open Systems =}ooO--U--Ooo{= - bar...@tkos.co.il - tel: +972.2.679.5364, http://www.tkos.co.il -
[patch net-next v2] devlink: add warning in case driver does not set port type
From: Jiri Pirko Prevent misbehavior of drivers who would not set port type for longer period of time. Drivers should always set port type. Do WARN if that happens. Note that it is perfectly fine to temporarily not have the type set, during initialization and port type change. Signed-off-by: Jiri Pirko --- v1->v2: - Don't warn on DSA and CPU ports. --- include/net/devlink.h | 2 ++ net/core/devlink.c| 38 ++ 2 files changed, 40 insertions(+) diff --git a/include/net/devlink.h b/include/net/devlink.h index 1c4adfb4195a..151eb930d329 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -64,6 +65,7 @@ struct devlink_port { enum devlink_port_type desired_type; void *type_dev; struct devlink_port_attrs attrs; + struct delayed_work type_warn_dw; }; struct devlink_sb_pool_info { diff --git a/net/core/devlink.c b/net/core/devlink.c index d43bc52b8840..9716a7f382cb 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -5390,6 +5391,38 @@ void devlink_free(struct devlink *devlink) } EXPORT_SYMBOL_GPL(devlink_free); +static void devlink_port_type_warn(struct work_struct *work) +{ + WARN(true, "Type was not set for devlink port."); +} + +static bool devlink_port_type_should_warn(struct devlink_port *devlink_port) +{ + /* Ignore CPU and DSA flavours. */ + return devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_CPU && + devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_DSA; +} + +#define DEVLINK_PORT_TYPE_WARN_TIMEOUT (HZ * 30) + +static void devlink_port_type_warn_schedule(struct devlink_port *devlink_port) +{ + if (!devlink_port_type_should_warn(devlink_port)) + return; + /* Schedule a work to WARN in case driver does not set port +* type within timeout. +*/ + schedule_delayed_work(&devlink_port->type_warn_dw, + DEVLINK_PORT_TYPE_WARN_TIMEOUT); +} + +static void devlink_port_type_warn_cancel(struct devlink_port *devlink_port) +{ + if (!devlink_port_type_should_warn(devlink_port)) + return; + cancel_delayed_work_sync(&devlink_port->type_warn_dw); +} + /** * devlink_port_register - Register devlink port * @@ -5419,6 +5452,8 @@ int devlink_port_register(struct devlink *devlink, list_add_tail(&devlink_port->list, &devlink->port_list); INIT_LIST_HEAD(&devlink_port->param_list); mutex_unlock(&devlink->lock); + INIT_DELAYED_WORK(&devlink_port->type_warn_dw, &devlink_port_type_warn); + devlink_port_type_warn_schedule(devlink_port); devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); return 0; } @@ -5433,6 +5468,7 @@ void devlink_port_unregister(struct devlink_port *devlink_port) { struct devlink *devlink = devlink_port->devlink; + devlink_port_type_warn_cancel(devlink_port); devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL); mutex_lock(&devlink->lock); list_del(&devlink_port->list); @@ -5446,6 +5482,7 @@ static void __devlink_port_type_set(struct devlink_port *devlink_port, { if (WARN_ON(!devlink_port->registered)) return; + devlink_port_type_warn_cancel(devlink_port); spin_lock(&devlink_port->type_lock); devlink_port->type = type; devlink_port->type_dev = type_dev; @@ -5519,6 +5556,7 @@ EXPORT_SYMBOL_GPL(devlink_port_type_ib_set); void devlink_port_type_clear(struct devlink_port *devlink_port) { __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_NOTSET, NULL); + devlink_port_type_warn_schedule(devlink_port); } EXPORT_SYMBOL_GPL(devlink_port_type_clear); -- 2.17.2
Re: [PATCH bpf-next 1/2] net: xdp: refactor XDP_QUERY_PROG{,_HW} to netdev
Jakub Kicinski writes: > On Wed, 22 May 2019 22:54:44 +0200, Björn Töpel wrote: >> > > Now, the same commands give: >> > > >> > > # ip link set dev eth0 xdp obj foo.o sec main >> > > # ip link set dev eth0 xdpgeneric off >> > > Error: native and generic XDP can't be active at the same time. >> > >> > I'm not clear why this change is necessary? It is a change in >> > behaviour, and if anything returning ENOENT would seem cleaner >> > in this case. >> >> To me, the existing behavior was non-intuitive. If most people *don't* >> agree, I'll remove this change. So, what do people think about this? >> :-) > > Having things start to fail after they were successful/ignored > is one of those ABI breakage types Linux and netdev usually takes > pretty seriously, unfortunately. Especially when motivation is > "it's more intuitive" :) > > If nobody chimes in please break out this behaviour change into > a commit of its own. Björn and I already had this discussion off-list. I think we ended up concluding that since it's not changing kernel *behaviour*, but just making an existing error explicit, it might be acceptable from an ABI breakage PoV. And I'd generally prefer explicit errors over silent failures. But yeah, can totally see why it could also be considered a breaking change... -Toke
[patch net-next 2/7] mlx5: Move firmware flash implementation to devlink
From: Jiri Pirko Benefit from the devlink flash update implementation and ethtool fallback to it and move firmware flash implementation there. Signed-off-by: Jiri Pirko --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 2 -- .../ethernet/mellanox/mlx5/core/en_ethtool.c | 35 --- .../mellanox/mlx5/core/ipoib/ethtool.c| 9 - .../net/ethernet/mellanox/mlx5/core/main.c| 20 +++ 4 files changed, 20 insertions(+), 46 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 3a183d690e23..4e417dfe4ee5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -1074,8 +1074,6 @@ u32 mlx5e_ethtool_get_rxfh_key_size(struct mlx5e_priv *priv); u32 mlx5e_ethtool_get_rxfh_indir_size(struct mlx5e_priv *priv); int mlx5e_ethtool_get_ts_info(struct mlx5e_priv *priv, struct ethtool_ts_info *info); -int mlx5e_ethtool_flash_device(struct mlx5e_priv *priv, - struct ethtool_flash *flash); void mlx5e_ethtool_get_pauseparam(struct mlx5e_priv *priv, struct ethtool_pauseparam *pauseparam); int mlx5e_ethtool_set_pauseparam(struct mlx5e_priv *priv, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index dd764e0471f2..ea59097dd4f8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -1867,40 +1867,6 @@ static u32 mlx5e_get_priv_flags(struct net_device *netdev) return priv->channels.params.pflags; } -int mlx5e_ethtool_flash_device(struct mlx5e_priv *priv, - struct ethtool_flash *flash) -{ - struct mlx5_core_dev *mdev = priv->mdev; - struct net_device *dev = priv->netdev; - const struct firmware *fw; - int err; - - if (flash->region != ETHTOOL_FLASH_ALL_REGIONS) - return -EOPNOTSUPP; - - err = request_firmware_direct(&fw, flash->data, &dev->dev); - if (err) - return err; - - dev_hold(dev); - rtnl_unlock(); - - err = mlx5_firmware_flash(mdev, fw); - release_firmware(fw); - - rtnl_lock(); - dev_put(dev); - return err; -} - -static int mlx5e_flash_device(struct net_device *dev, - struct ethtool_flash *flash) -{ - struct mlx5e_priv *priv = netdev_priv(dev); - - return mlx5e_ethtool_flash_device(priv, flash); -} - #ifndef CONFIG_MLX5_EN_RXNFC /* When CONFIG_MLX5_EN_RXNFC=n we only support ETHTOOL_GRXRINGS * otherwise this function will be defined from en_fs_ethtool.c @@ -1939,7 +1905,6 @@ const struct ethtool_ops mlx5e_ethtool_ops = { #ifdef CONFIG_MLX5_EN_RXNFC .set_rxnfc = mlx5e_set_rxnfc, #endif - .flash_device = mlx5e_flash_device, .get_tunable = mlx5e_get_tunable, .set_tunable = mlx5e_set_tunable, .get_pauseparam= mlx5e_get_pauseparam, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c index 90cb50fe17fd..ebd81f6b556e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c @@ -122,14 +122,6 @@ static int mlx5i_get_ts_info(struct net_device *netdev, return mlx5e_ethtool_get_ts_info(priv, info); } -static int mlx5i_flash_device(struct net_device *netdev, - struct ethtool_flash *flash) -{ - struct mlx5e_priv *priv = mlx5i_epriv(netdev); - - return mlx5e_ethtool_flash_device(priv, flash); -} - enum mlx5_ptys_width { MLX5_PTYS_WIDTH_1X = 1 << 0, MLX5_PTYS_WIDTH_2X = 1 << 1, @@ -241,7 +233,6 @@ const struct ethtool_ops mlx5i_ethtool_ops = { .get_ethtool_stats = mlx5i_get_ethtool_stats, .get_ringparam = mlx5i_get_ringparam, .set_ringparam = mlx5i_set_ringparam, - .flash_device = mlx5i_flash_device, .get_channels = mlx5i_get_channels, .set_channels = mlx5i_set_channels, .get_coalesce = mlx5i_get_coalesce, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 61fa1d162d28..36acd0267a13 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1210,6 +1210,25 @@ static int mlx5_unload_one(struct mlx5_core_dev *dev, bool cleanup) return err; } +static int mlx5_devlink_flash_update(struct devlink *devlink, +const char *file_name, +const char *component, +struct netlink_ext_ack *extack) +{ + struct mlx5_core_dev *dev = devlink
[patch net-next 0/7] expose flash update status to user
From: Jiri Pirko When user is flashing device using devlink, he currenly does not see any information about what is going on, percentages, etc. Drivers, for example mlxsw and mlx5, have notion about the progress and what is happening. This patchset exposes this progress information to userspace. See this console recording which shows flashing FW on a Mellanox Spectrum device: https://asciinema.org/a/247926 Jiri Pirko (7): mlxsw: Move firmware flash implementation to devlink mlx5: Move firmware flash implementation to devlink mlxfw: Propagate error messages through extack devlink: allow driver to update progress of flash update mlxfw: Introduce status_notify op and call it to notify about the status mlxsw: Implement flash update status notifications netdevsim: implement fake flash updating with notifications drivers/net/ethernet/mellanox/mlx5/core/en.h | 2 - .../ethernet/mellanox/mlx5/core/en_ethtool.c | 35 -- drivers/net/ethernet/mellanox/mlx5/core/fw.c | 6 +- .../mellanox/mlx5/core/ipoib/ethtool.c| 9 -- .../net/ethernet/mellanox/mlx5/core/main.c| 20 .../ethernet/mellanox/mlx5/core/mlx5_core.h | 3 +- drivers/net/ethernet/mellanox/mlxfw/mlxfw.h | 11 +- .../net/ethernet/mellanox/mlxfw/mlxfw_fsm.c | 57 -- drivers/net/ethernet/mellanox/mlxsw/core.c| 15 +++ drivers/net/ethernet/mellanox/mlxsw/core.h| 3 + .../net/ethernet/mellanox/mlxsw/spectrum.c| 75 +++-- drivers/net/netdevsim/dev.c | 35 ++ include/net/devlink.h | 8 ++ include/uapi/linux/devlink.h | 5 + net/core/devlink.c| 102 ++ 15 files changed, 295 insertions(+), 91 deletions(-) -- 2.17.2
[patch net-next 1/7] mlxsw: Move firmware flash implementation to devlink
From: Jiri Pirko Benefit from the devlink flash update implementation and ethtool fallback to it and move firmware flash implementation there. Signed-off-by: Jiri Pirko --- drivers/net/ethernet/mellanox/mlxsw/core.c| 15 ++ drivers/net/ethernet/mellanox/mlxsw/core.h| 3 ++ .../net/ethernet/mellanox/mlxsw/spectrum.c| 49 +-- 3 files changed, 41 insertions(+), 26 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c index 6ee6de7f0160..a992a7c69b45 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core.c +++ b/drivers/net/ethernet/mellanox/mlxsw/core.c @@ -1003,6 +1003,20 @@ static int mlxsw_devlink_core_bus_device_reload(struct devlink *devlink, return err; } +static int mlxsw_devlink_flash_update(struct devlink *devlink, + const char *file_name, + const char *component, + struct netlink_ext_ack *extack) +{ + struct mlxsw_core *mlxsw_core = devlink_priv(devlink); + struct mlxsw_driver *mlxsw_driver = mlxsw_core->driver; + + if (!mlxsw_driver->flash_update) + return -EOPNOTSUPP; + return mlxsw_driver->flash_update(mlxsw_core, file_name, + component, extack); +} + static const struct devlink_ops mlxsw_devlink_ops = { .reload = mlxsw_devlink_core_bus_device_reload, .port_type_set = mlxsw_devlink_port_type_set, @@ -1019,6 +1033,7 @@ static const struct devlink_ops mlxsw_devlink_ops = { .sb_occ_port_pool_get = mlxsw_devlink_sb_occ_port_pool_get, .sb_occ_tc_port_bind_get= mlxsw_devlink_sb_occ_tc_port_bind_get, .info_get = mlxsw_devlink_info_get, + .flash_update = mlxsw_devlink_flash_update, }; static int diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.h b/drivers/net/ethernet/mellanox/mlxsw/core.h index e3832cb5bdda..a44ad0fb9477 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core.h +++ b/drivers/net/ethernet/mellanox/mlxsw/core.h @@ -284,6 +284,9 @@ struct mlxsw_driver { unsigned int sb_index, u16 tc_index, enum devlink_sb_pool_type pool_type, u32 *p_cur, u32 *p_max); + int (*flash_update)(struct mlxsw_core *mlxsw_core, + const char *file_name, const char *component, + struct netlink_ext_ack *extack); void (*txhdr_construct)(struct sk_buff *skb, const struct mlxsw_tx_info *tx_info); int (*resources_register)(struct mlxsw_core *mlxsw_core); diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index dbb425717f5e..861a77538859 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -388,6 +388,27 @@ static int mlxsw_sp_fw_rev_validate(struct mlxsw_sp *mlxsw_sp) return 0; } +static int mlxsw_sp_flash_update(struct mlxsw_core *mlxsw_core, +const char *file_name, const char *component, +struct netlink_ext_ack *extack) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core); + const struct firmware *firmware; + int err; + + if (component) + return -EOPNOTSUPP; + + err = request_firmware_direct(&firmware, file_name, + mlxsw_sp->bus_info->dev); + if (err) + return err; + err = mlxsw_sp_firmware_flash(mlxsw_sp, firmware); + release_firmware(firmware); + + return err; +} + int mlxsw_sp_flow_counter_get(struct mlxsw_sp *mlxsw_sp, unsigned int counter_index, u64 *packets, u64 *bytes) @@ -3155,31 +3176,6 @@ mlxsw_sp_port_set_link_ksettings(struct net_device *dev, return 0; } -static int mlxsw_sp_flash_device(struct net_device *dev, -struct ethtool_flash *flash) -{ - struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev); - struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; - const struct firmware *firmware; - int err; - - if (flash->region != ETHTOOL_FLASH_ALL_REGIONS) - return -EOPNOTSUPP; - - dev_hold(dev); - rtnl_unlock(); - - err = request_firmware_direct(&firmware, flash->data, &dev->dev); - if (err) - goto out; - err = mlxsw_sp_firmware_flash(mlxsw_sp, firmware); - release_firmware(firmware); -out: - rtnl_lock(); - dev_put(dev); - return err; -} - static int mlxsw_sp_get_module_info(struct net_device *netdev,
[patch net-next 3/7] mlxfw: Propagate error messages through extack
From: Jiri Pirko Currently the error messages are printed to dmesg. Propagate them also to directly to user doing the flashing through extack. Signed-off-by: Jiri Pirko --- drivers/net/ethernet/mellanox/mlx5/core/fw.c | 6 ++-- .../net/ethernet/mellanox/mlx5/core/main.c| 2 +- .../ethernet/mellanox/mlx5/core/mlx5_core.h | 3 +- drivers/net/ethernet/mellanox/mlxfw/mlxfw.h | 7 ++-- .../net/ethernet/mellanox/mlxfw/mlxfw_fsm.c | 33 +-- .../net/ethernet/mellanox/mlxsw/spectrum.c| 10 +++--- 6 files changed, 41 insertions(+), 20 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw.c b/drivers/net/ethernet/mellanox/mlx5/core/fw.c index 1ab6f7e3bec6..e8fedb307b2c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fw.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fw.c @@ -552,7 +552,8 @@ static const struct mlxfw_dev_ops mlx5_mlxfw_dev_ops = { }; int mlx5_firmware_flash(struct mlx5_core_dev *dev, - const struct firmware *firmware) + const struct firmware *firmware, + struct netlink_ext_ack *extack) { struct mlx5_mlxfw_dev mlx5_mlxfw_dev = { .mlxfw_dev = { @@ -571,5 +572,6 @@ int mlx5_firmware_flash(struct mlx5_core_dev *dev, return -EOPNOTSUPP; } - return mlxfw_firmware_flash(&mlx5_mlxfw_dev.mlxfw_dev, firmware); + return mlxfw_firmware_flash(&mlx5_mlxfw_dev.mlxfw_dev, + firmware, extack); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 36acd0267a13..52fcfc17fcc6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1226,7 +1226,7 @@ static int mlx5_devlink_flash_update(struct devlink *devlink, if (err) return err; - return mlx5_firmware_flash(dev, fw); + return mlx5_firmware_flash(dev, fw, extack); } static const struct devlink_ops mlx5_devlink_ops = { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h index 22e69d4813e4..d4dd8c1ae55c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h @@ -184,7 +184,8 @@ int mlx5_set_mtppse(struct mlx5_core_dev *mdev, u8 pin, u8 arm, u8 mode); MLX5_CAP_MCAM_FEATURE((mdev), mtpps_fs) && \ MLX5_CAP_MCAM_FEATURE((mdev), mtpps_enh_out_per_adj)) -int mlx5_firmware_flash(struct mlx5_core_dev *dev, const struct firmware *fw); +int mlx5_firmware_flash(struct mlx5_core_dev *dev, const struct firmware *fw, + struct netlink_ext_ack *extack); void mlx5e_init(void); void mlx5e_cleanup(void); diff --git a/drivers/net/ethernet/mellanox/mlxfw/mlxfw.h b/drivers/net/ethernet/mellanox/mlxfw/mlxfw.h index 14c0c62f8e73..83286b90593f 100644 --- a/drivers/net/ethernet/mellanox/mlxfw/mlxfw.h +++ b/drivers/net/ethernet/mellanox/mlxfw/mlxfw.h @@ -5,6 +5,7 @@ #define _MLXFW_H #include +#include enum mlxfw_fsm_state { MLXFW_FSM_STATE_IDLE, @@ -67,11 +68,13 @@ struct mlxfw_dev { #if IS_REACHABLE(CONFIG_MLXFW) int mlxfw_firmware_flash(struct mlxfw_dev *mlxfw_dev, -const struct firmware *firmware); +const struct firmware *firmware, +struct netlink_ext_ack *extack); #else static inline int mlxfw_firmware_flash(struct mlxfw_dev *mlxfw_dev, -const struct firmware *firmware) +const struct firmware *firmware, +struct netlink_ext_ack *extack) { return -EOPNOTSUPP; } diff --git a/drivers/net/ethernet/mellanox/mlxfw/mlxfw_fsm.c b/drivers/net/ethernet/mellanox/mlxfw/mlxfw_fsm.c index 240c027e5f07..588d9a9c08c9 100644 --- a/drivers/net/ethernet/mellanox/mlxfw/mlxfw_fsm.c +++ b/drivers/net/ethernet/mellanox/mlxfw/mlxfw_fsm.c @@ -40,7 +40,8 @@ static const char * const mlxfw_fsm_state_err_str[] = { }; static int mlxfw_fsm_state_wait(struct mlxfw_dev *mlxfw_dev, u32 fwhandle, - enum mlxfw_fsm_state fsm_state) + enum mlxfw_fsm_state fsm_state, + struct netlink_ext_ack *extack) { enum mlxfw_fsm_state_err fsm_state_err; enum mlxfw_fsm_state curr_fsm_state; @@ -57,11 +58,13 @@ static int mlxfw_fsm_state_wait(struct mlxfw_dev *mlxfw_dev, u32 fwhandle, if (fsm_state_err != MLXFW_FSM_STATE_ERR_OK) { pr_err("Firmware flash failed: %s\n", mlxfw_fsm_state_err_str[fsm_state_err]); + NL_SET_ERR_MSG_MOD(extack, "Firmware flash failed"); return -EINVAL; } if (curr_fsm_state != fsm_state) { if (--times == 0) {
[patch net-next 4/7] devlink: allow driver to update progress of flash update
From: Jiri Pirko Introduce a function to be called from drivers during flash. It sends notification to userspace about flash update progress. Signed-off-by: Jiri Pirko --- include/net/devlink.h| 8 +++ include/uapi/linux/devlink.h | 5 ++ net/core/devlink.c | 102 +++ 3 files changed, 115 insertions(+) diff --git a/include/net/devlink.h b/include/net/devlink.h index 151eb930d329..8f65356132be 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -741,6 +741,14 @@ void devlink_health_reporter_state_update(struct devlink_health_reporter *reporter, enum devlink_health_reporter_state state); +void devlink_flash_update_begin_notify(struct devlink *devlink); +void devlink_flash_update_end_notify(struct devlink *devlink); +void devlink_flash_update_status_notify(struct devlink *devlink, + const char *status_msg, + const char *component, + unsigned long done, + unsigned long total); + #if IS_ENABLED(CONFIG_NET_DEVLINK) void devlink_compat_running_version(struct net_device *dev, diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 5bb4ea67d84f..5287b42c181f 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -104,6 +104,8 @@ enum devlink_command { DEVLINK_CMD_HEALTH_REPORTER_DUMP_CLEAR, DEVLINK_CMD_FLASH_UPDATE, + DEVLINK_CMD_FLASH_UPDATE_END, /* notification only */ + DEVLINK_CMD_FLASH_UPDATE_STATUS,/* notification only */ /* add new commands above here */ __DEVLINK_CMD_MAX, @@ -331,6 +333,9 @@ enum devlink_attr { DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME,/* string */ DEVLINK_ATTR_FLASH_UPDATE_COMPONENT,/* string */ + DEVLINK_ATTR_FLASH_UPDATE_STATUS_MSG, /* string */ + DEVLINK_ATTR_FLASH_UPDATE_STATUS_DONE, /* u64 */ + DEVLINK_ATTR_FLASH_UPDATE_STATUS_TOTAL, /* u64 */ /* add new attributes above here, update the policy in devlink.c */ diff --git a/net/core/devlink.c b/net/core/devlink.c index 9716a7f382cb..963178d32dda 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -2673,6 +2673,108 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info) return devlink->ops->reload(devlink, info->extack); } +static int devlink_nl_flash_update_fill(struct sk_buff *msg, + struct devlink *devlink, + enum devlink_command cmd, + const char *status_msg, + const char *component, + unsigned long done, unsigned long total) +{ + void *hdr; + + hdr = genlmsg_put(msg, 0, 0, &devlink_nl_family, 0, cmd); + if (!hdr) + return -EMSGSIZE; + + if (devlink_nl_put_handle(msg, devlink)) + goto nla_put_failure; + + if (cmd != DEVLINK_CMD_FLASH_UPDATE_STATUS) + goto out; + + if (status_msg && + nla_put_string(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_MSG, + status_msg)) + goto nla_put_failure; + if (component && + nla_put_string(msg, DEVLINK_ATTR_FLASH_UPDATE_COMPONENT, + component)) + goto nla_put_failure; + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_DONE, + done, DEVLINK_ATTR_PAD)) + goto nla_put_failure; + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_TOTAL, + total, DEVLINK_ATTR_PAD)) + goto nla_put_failure; + +out: + genlmsg_end(msg, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static void __devlink_flash_update_notify(struct devlink *devlink, + enum devlink_command cmd, + const char *status_msg, + const char *component, + unsigned long done, + unsigned long total) +{ + struct sk_buff *msg; + int err; + + WARN_ON(cmd != DEVLINK_CMD_FLASH_UPDATE && + cmd != DEVLINK_CMD_FLASH_UPDATE_END && + cmd != DEVLINK_CMD_FLASH_UPDATE_STATUS); + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return; + + err = devlink_nl_flash_update_fill(msg, devlink, cmd, status_msg, + component, done, total); + if (err) + goto out_free_msg; + + genlmsg_multicast_netns(&devlink_nl_fam
[patch net-next 6/7] mlxsw: Implement flash update status notifications
From: Jiri Pirko Implement mlxfw status_notify op by passing notification down to devlink. Also notify about flash update begin and end. Signed-off-by: Jiri Pirko --- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 18 +- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 639bb5778ff3..5ac893d9dd12 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -294,6 +294,19 @@ static void mlxsw_sp_fsm_release(struct mlxfw_dev *mlxfw_dev, u32 fwhandle) mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(mcc), mcc_pl); } +static void mlxsw_sp_status_notify(struct mlxfw_dev *mlxfw_dev, + const char *msg, const char *comp_name, + u32 done_bytes, u32 total_bytes) +{ + struct mlxsw_sp_mlxfw_dev *mlxsw_sp_mlxfw_dev = + container_of(mlxfw_dev, struct mlxsw_sp_mlxfw_dev, mlxfw_dev); + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_mlxfw_dev->mlxsw_sp; + + devlink_flash_update_status_notify(priv_to_devlink(mlxsw_sp->core), + msg, comp_name, + done_bytes, total_bytes); +} + static const struct mlxfw_dev_ops mlxsw_sp_mlxfw_dev_ops = { .component_query= mlxsw_sp_component_query, .fsm_lock = mlxsw_sp_fsm_lock, @@ -303,7 +316,8 @@ static const struct mlxfw_dev_ops mlxsw_sp_mlxfw_dev_ops = { .fsm_activate = mlxsw_sp_fsm_activate, .fsm_query_state= mlxsw_sp_fsm_query_state, .fsm_cancel = mlxsw_sp_fsm_cancel, - .fsm_release= mlxsw_sp_fsm_release + .fsm_release= mlxsw_sp_fsm_release, + .status_notify = mlxsw_sp_status_notify, }; static int mlxsw_sp_firmware_flash(struct mlxsw_sp *mlxsw_sp, @@ -321,8 +335,10 @@ static int mlxsw_sp_firmware_flash(struct mlxsw_sp *mlxsw_sp, int err; mlxsw_core_fw_flash_start(mlxsw_sp->core); + devlink_flash_update_begin_notify(priv_to_devlink(mlxsw_sp->core)); err = mlxfw_firmware_flash(&mlxsw_sp_mlxfw_dev.mlxfw_dev, firmware, extack); + devlink_flash_update_end_notify(priv_to_devlink(mlxsw_sp->core)); mlxsw_core_fw_flash_end(mlxsw_sp->core); return err; -- 2.17.2
[patch net-next 5/7] mlxfw: Introduce status_notify op and call it to notify about the status
From: Jiri Pirko Add new op status_notify which is called to update the user about flashing status. Signed-off-by: Jiri Pirko --- drivers/net/ethernet/mellanox/mlxfw/mlxfw.h | 4 .../net/ethernet/mellanox/mlxfw/mlxfw_fsm.c | 24 +++ 2 files changed, 28 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlxfw/mlxfw.h b/drivers/net/ethernet/mellanox/mlxfw/mlxfw.h index 83286b90593f..c50e74ab02c4 100644 --- a/drivers/net/ethernet/mellanox/mlxfw/mlxfw.h +++ b/drivers/net/ethernet/mellanox/mlxfw/mlxfw.h @@ -58,6 +58,10 @@ struct mlxfw_dev_ops { void (*fsm_cancel)(struct mlxfw_dev *mlxfw_dev, u32 fwhandle); void (*fsm_release)(struct mlxfw_dev *mlxfw_dev, u32 fwhandle); + + void (*status_notify)(struct mlxfw_dev *mlxfw_dev, + const char *msg, const char *comp_name, + u32 done_bytes, u32 total_bytes); }; struct mlxfw_dev { diff --git a/drivers/net/ethernet/mellanox/mlxfw/mlxfw_fsm.c b/drivers/net/ethernet/mellanox/mlxfw/mlxfw_fsm.c index 588d9a9c08c9..6a18ec05181a 100644 --- a/drivers/net/ethernet/mellanox/mlxfw/mlxfw_fsm.c +++ b/drivers/net/ethernet/mellanox/mlxfw/mlxfw_fsm.c @@ -39,6 +39,16 @@ static const char * const mlxfw_fsm_state_err_str[] = { "unknown error" }; +static void mlxfw_status_notify(struct mlxfw_dev *mlxfw_dev, + const char *msg, const char *comp_name, + u32 done_bytes, u32 total_bytes) +{ + if (!mlxfw_dev->ops->status_notify) + return; + mlxfw_dev->ops->status_notify(mlxfw_dev, msg, comp_name, + done_bytes, total_bytes); +} + static int mlxfw_fsm_state_wait(struct mlxfw_dev *mlxfw_dev, u32 fwhandle, enum mlxfw_fsm_state fsm_state, struct netlink_ext_ack *extack) @@ -85,11 +95,14 @@ static int mlxfw_flash_component(struct mlxfw_dev *mlxfw_dev, u16 comp_max_write_size; u8 comp_align_bits; u32 comp_max_size; + char comp_name[8]; u16 block_size; u8 *block_ptr; u32 offset; int err; + sprintf(comp_name, "%u", comp->index); + err = mlxfw_dev->ops->component_query(mlxfw_dev, comp->index, &comp_max_size, &comp_align_bits, &comp_max_write_size); @@ -108,6 +121,7 @@ static int mlxfw_flash_component(struct mlxfw_dev *mlxfw_dev, comp_align_bits); pr_debug("Component update\n"); + mlxfw_status_notify(mlxfw_dev, "Updating component", comp_name, 0, 0); err = mlxfw_dev->ops->fsm_component_update(mlxfw_dev, fwhandle, comp->index, comp->data_size); @@ -120,6 +134,8 @@ static int mlxfw_flash_component(struct mlxfw_dev *mlxfw_dev, goto err_out; pr_debug("Component download\n"); + mlxfw_status_notify(mlxfw_dev, "Downloading component", + comp_name, 0, comp->data_size); for (offset = 0; offset < MLXFW_ALIGN_UP(comp->data_size, comp_align_bits); offset += comp_max_write_size) { @@ -131,9 +147,13 @@ static int mlxfw_flash_component(struct mlxfw_dev *mlxfw_dev, offset); if (err) goto err_out; + mlxfw_status_notify(mlxfw_dev, "Downloading component", + comp_name, offset + block_size, + comp->data_size); } pr_debug("Component verify\n"); + mlxfw_status_notify(mlxfw_dev, "Verifying component", comp_name, 0, 0); err = mlxfw_dev->ops->fsm_component_verify(mlxfw_dev, fwhandle, comp->index); if (err) @@ -203,6 +223,8 @@ int mlxfw_firmware_flash(struct mlxfw_dev *mlxfw_dev, return PTR_ERR(mfa2_file); pr_info("Initialize firmware flash process\n"); + mlxfw_status_notify(mlxfw_dev, "Initializing firmware flash process", + NULL, 0, 0); err = mlxfw_dev->ops->fsm_lock(mlxfw_dev, &fwhandle); if (err) { pr_err("Could not lock the firmware FSM\n"); @@ -220,6 +242,7 @@ int mlxfw_firmware_flash(struct mlxfw_dev *mlxfw_dev, goto err_flash_components; pr_debug("Activate image\n"); + mlxfw_status_notify(mlxfw_dev, "Activating image", NULL, 0, 0); err = mlxfw_dev->ops->fsm_activate(mlxfw_dev, fwhandle); if (err) { pr_err("Could not activate the downloaded image\n"); @@ -236,6 +259,7 @@ int mlxfw_firmware_flash(struct mlxfw_dev *mlxfw_dev, mlxfw_dev->ops->fsm_rele
[patch net-next 7/7] netdevsim: implement fake flash updating with notifications
From: Jiri Pirko Signed-off-by: Jiri Pirko --- drivers/net/netdevsim/dev.c | 35 +++ 1 file changed, 35 insertions(+) diff --git a/drivers/net/netdevsim/dev.c b/drivers/net/netdevsim/dev.c index b509b941d5ca..c15b86f9cd2b 100644 --- a/drivers/net/netdevsim/dev.c +++ b/drivers/net/netdevsim/dev.c @@ -220,8 +220,43 @@ static int nsim_dev_reload(struct devlink *devlink, return 0; } +#define NSIM_DEV_FLASH_SIZE 50 +#define NSIM_DEV_FLASH_CHUNK_SIZE 1000 +#define NSIM_DEV_FLASH_CHUNK_TIME_MS 10 + +static int nsim_dev_flash_update(struct devlink *devlink, const char *file_name, +const char *component, +struct netlink_ext_ack *extack) +{ + int i; + + devlink_flash_update_begin_notify(devlink); + + devlink_flash_update_status_notify(devlink, "Preparing to flash", + component, 0, 0); + for (i = 0; i < NSIM_DEV_FLASH_SIZE / NSIM_DEV_FLASH_CHUNK_SIZE; i++) { + devlink_flash_update_status_notify(devlink, "Flashing", + component, + i * NSIM_DEV_FLASH_CHUNK_SIZE, + NSIM_DEV_FLASH_SIZE); + msleep(NSIM_DEV_FLASH_CHUNK_TIME_MS); + } + devlink_flash_update_status_notify(devlink, "Flashing", + component, + NSIM_DEV_FLASH_SIZE, + NSIM_DEV_FLASH_SIZE); + + devlink_flash_update_status_notify(devlink, "Flashing done", + component, 0, 0); + + devlink_flash_update_end_notify(devlink); + + return 0; +} + static const struct devlink_ops nsim_dev_devlink_ops = { .reload = nsim_dev_reload, + .flash_update = nsim_dev_flash_update, }; static struct nsim_dev * -- 2.17.2
[patch iproute2 2/3] devlink: implement flash update status monitoring
From: Jiri Pirko Kernel sends notifications about flash update status, so implement these messages for monitoring. Signed-off-by: Jiri Pirko --- devlink/devlink.c | 43 +++ 1 file changed, 43 insertions(+) diff --git a/devlink/devlink.c b/devlink/devlink.c index 436935f88bda..55cbc01189db 100644 --- a/devlink/devlink.c +++ b/devlink/devlink.c @@ -414,6 +414,10 @@ static const enum mnl_attr_data_type devlink_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_HEALTH_REPORTER_RECOVER_COUNT] = MNL_TYPE_U64, [DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS] = MNL_TYPE_U64, [DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] = MNL_TYPE_U64, + [DEVLINK_ATTR_FLASH_UPDATE_COMPONENT] = MNL_TYPE_STRING, + [DEVLINK_ATTR_FLASH_UPDATE_STATUS_MSG] = MNL_TYPE_STRING, + [DEVLINK_ATTR_FLASH_UPDATE_STATUS_DONE] = MNL_TYPE_U64, + [DEVLINK_ATTR_FLASH_UPDATE_STATUS_TOTAL] = MNL_TYPE_U64, }; static int attr_cb(const struct nlattr *attr, void *data) @@ -3712,6 +3716,9 @@ static const char *cmd_name(uint8_t cmd) case DEVLINK_CMD_REGION_SET: return "set"; case DEVLINK_CMD_REGION_NEW: return "new"; case DEVLINK_CMD_REGION_DEL: return "del"; + case DEVLINK_CMD_FLASH_UPDATE: return "begin"; + case DEVLINK_CMD_FLASH_UPDATE_END: return "end"; + case DEVLINK_CMD_FLASH_UPDATE_STATUS: return "status"; default: return ""; } } @@ -3740,6 +3747,10 @@ static const char *cmd_obj(uint8_t cmd) case DEVLINK_CMD_REGION_NEW: case DEVLINK_CMD_REGION_DEL: return "region"; + case DEVLINK_CMD_FLASH_UPDATE: + case DEVLINK_CMD_FLASH_UPDATE_END: + case DEVLINK_CMD_FLASH_UPDATE_STATUS: + return "flash"; default: return ""; } } @@ -3764,6 +3775,29 @@ static bool cmd_filter_check(struct dl *dl, uint8_t cmd) return false; } +static void pr_out_flash_update(struct dl *dl, struct nlattr **tb) +{ + __pr_out_handle_start(dl, tb, true, false); + + if (tb[DEVLINK_ATTR_FLASH_UPDATE_STATUS_MSG]) + pr_out_str(dl, "msg", + mnl_attr_get_str(tb[DEVLINK_ATTR_FLASH_UPDATE_STATUS_MSG])); + + if (tb[DEVLINK_ATTR_FLASH_UPDATE_COMPONENT]) + pr_out_str(dl, "component", + mnl_attr_get_str(tb[DEVLINK_ATTR_FLASH_UPDATE_COMPONENT])); + + if (tb[DEVLINK_ATTR_FLASH_UPDATE_STATUS_DONE]) + pr_out_u64(dl, "done", + mnl_attr_get_u64(tb[DEVLINK_ATTR_FLASH_UPDATE_STATUS_DONE])); + + if (tb[DEVLINK_ATTR_FLASH_UPDATE_STATUS_TOTAL]) + pr_out_u64(dl, "total", + mnl_attr_get_u64(tb[DEVLINK_ATTR_FLASH_UPDATE_STATUS_TOTAL])); + + pr_out_handle_end(dl); +} + static void pr_out_region(struct dl *dl, struct nlattr **tb); static int cmd_mon_show_cb(const struct nlmsghdr *nlh, void *data) @@ -3820,6 +3854,15 @@ static int cmd_mon_show_cb(const struct nlmsghdr *nlh, void *data) pr_out_mon_header(genl->cmd); pr_out_region(dl, tb); break; + case DEVLINK_CMD_FLASH_UPDATE: /* fall through */ + case DEVLINK_CMD_FLASH_UPDATE_END: /* fall through */ + case DEVLINK_CMD_FLASH_UPDATE_STATUS: + mnl_attr_parse(nlh, sizeof(*genl), attr_cb, tb); + if (!tb[DEVLINK_ATTR_BUS_NAME] || !tb[DEVLINK_ATTR_DEV_NAME]) + return MNL_CB_ERROR; + pr_out_mon_header(genl->cmd); + pr_out_flash_update(dl, tb); + break; } return MNL_CB_OK; } -- 2.17.2
[patch iproute2 3/3] devlink: implement flash status monitoring
From: Jiri Pirko Listen to status notifications coming from kernel during flashing and put them on stdout to inform user about the status. Signed-off-by: Jiri Pirko --- devlink/devlink.c | 209 +- devlink/mnlg.c| 5 ++ devlink/mnlg.h| 1 + 3 files changed, 211 insertions(+), 4 deletions(-) diff --git a/devlink/devlink.c b/devlink/devlink.c index 55cbc01189db..8078e8801e98 100644 --- a/devlink/devlink.c +++ b/devlink/devlink.c @@ -24,6 +24,7 @@ #include #include #include +#include #include "SNAPSHOT.h" #include "list.h" @@ -68,6 +69,12 @@ static int g_new_line_count; g_new_line_count = 0; \ } while (0) +#define pr_out_tty(args...)\ + do {\ + if (isatty(STDOUT_FILENO)) \ + fprintf(stdout, ##args);\ + } while (0) + static int g_indent_level; static bool g_indent_newline; #define INDENT_STR_STEP 2 @@ -113,9 +120,8 @@ static int _mnlg_socket_recv_run(struct mnlg_socket *nlg, return 0; } -static int _mnlg_socket_sndrcv(struct mnlg_socket *nlg, - const struct nlmsghdr *nlh, - mnl_cb_t data_cb, void *data) +static int _mnlg_socket_send(struct mnlg_socket *nlg, +const struct nlmsghdr *nlh) { int err; @@ -124,6 +130,18 @@ static int _mnlg_socket_sndrcv(struct mnlg_socket *nlg, pr_err("Failed to call mnlg_socket_send\n"); return -errno; } + return 0; +} + +static int _mnlg_socket_sndrcv(struct mnlg_socket *nlg, + const struct nlmsghdr *nlh, + mnl_cb_t data_cb, void *data) +{ + int err; + + err = _mnlg_socket_send(nlg, nlh); + if (err) + return err; return _mnlg_socket_recv_run(nlg, data_cb, data); } @@ -2697,9 +2715,151 @@ static void cmd_dev_flash_help(void) pr_err("Usage: devlink dev flash DEV file PATH [ component NAME ]\n"); } + +struct cmd_dev_flash_status_ctx { + struct dl *dl; + char *last_msg; + char *last_component; + uint8_t not_first:1, + last_pc:1, + received_end:1, + flash_done:1; +}; + +static int nullstrcmp(const char *str1, const char *str2) +{ + if (str1 && str2) + return strcmp(str1, str2); + if (!str1 && !str2) + return 0; + return str1 ? 1 : -1; +} + +static int cmd_dev_flash_status_cb(const struct nlmsghdr *nlh, void *data) +{ + struct cmd_dev_flash_status_ctx *ctx = data; + struct dl_opts *opts = &ctx->dl->opts; + struct genlmsghdr *genl = mnl_nlmsg_get_payload(nlh); + struct nlattr *tb[DEVLINK_ATTR_MAX + 1] = {}; + const char *component = NULL; + uint64_t done = 0, total = 0; + const char *msg = NULL; + const char *bus_name; + const char *dev_name; + + if (genl->cmd != DEVLINK_CMD_FLASH_UPDATE_STATUS && + genl->cmd != DEVLINK_CMD_FLASH_UPDATE_END) + return MNL_CB_STOP; + + mnl_attr_parse(nlh, sizeof(*genl), attr_cb, tb); + if (!tb[DEVLINK_ATTR_BUS_NAME] || !tb[DEVLINK_ATTR_DEV_NAME]) + return MNL_CB_ERROR; + bus_name = mnl_attr_get_str(tb[DEVLINK_ATTR_BUS_NAME]); + dev_name = mnl_attr_get_str(tb[DEVLINK_ATTR_DEV_NAME]); + if (strcmp(bus_name, opts->bus_name) || + strcmp(dev_name, opts->dev_name)) + return MNL_CB_ERROR; + + if (genl->cmd == DEVLINK_CMD_FLASH_UPDATE_END && ctx->not_first) { + pr_out("\n"); + free(ctx->last_msg); + free(ctx->last_component); + ctx->received_end = 1; + return MNL_CB_STOP; + } + + if (tb[DEVLINK_ATTR_FLASH_UPDATE_STATUS_MSG]) + msg = mnl_attr_get_str(tb[DEVLINK_ATTR_FLASH_UPDATE_STATUS_MSG]); + if (tb[DEVLINK_ATTR_FLASH_UPDATE_COMPONENT]) + component = mnl_attr_get_str(tb[DEVLINK_ATTR_FLASH_UPDATE_COMPONENT]); + if (tb[DEVLINK_ATTR_FLASH_UPDATE_STATUS_DONE]) + done = mnl_attr_get_u64(tb[DEVLINK_ATTR_FLASH_UPDATE_STATUS_DONE]); + if (tb[DEVLINK_ATTR_FLASH_UPDATE_STATUS_TOTAL]) + total = mnl_attr_get_u64(tb[DEVLINK_ATTR_FLASH_UPDATE_STATUS_TOTAL]); + + if (!nullstrcmp(msg, ctx->last_msg) && + !nullstrcmp(component, ctx->last_component) && + ctx->last_pc && ctx->not_first) { + pr_out_tty("\b\b\b\b\b"); /* clean percentage */ + } else { + if (ctx->not_first) + pr_out("\n"); + if (component) { + pr_out("[%s] ", component); + free(ctx->last_component); +
[patch iproute2 1/3] header update
From: Jiri Pirko Signed-off-by: Jiri Pirko --- include/uapi/linux/devlink.h | 5 + 1 file changed, 5 insertions(+) diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 3b6a9e6be3ac..6544824a0b97 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -104,6 +104,8 @@ enum devlink_command { DEVLINK_CMD_HEALTH_REPORTER_DUMP_CLEAR, DEVLINK_CMD_FLASH_UPDATE, + DEVLINK_CMD_FLASH_UPDATE_END, /* notification only */ + DEVLINK_CMD_FLASH_UPDATE_STATUS,/* notification only */ /* add new commands above here */ __DEVLINK_CMD_MAX, @@ -331,6 +333,9 @@ enum devlink_attr { DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME,/* string */ DEVLINK_ATTR_FLASH_UPDATE_COMPONENT,/* string */ + DEVLINK_ATTR_FLASH_UPDATE_STATUS_MSG, /* string */ + DEVLINK_ATTR_FLASH_UPDATE_STATUS_DONE, /* u64 */ + DEVLINK_ATTR_FLASH_UPDATE_STATUS_TOTAL, /* u64 */ /* add new attributes above here, update the policy in devlink.c */ -- 2.17.2
[PATCH 2/8] dt-bindings: net: Add a YAML schemas for the generic PHY options
The networking PHYs have a number of available device tree properties that can be used in their device tree node. Add a YAML schemas for those. Signed-off-by: Maxime Ripard --- Documentation/devicetree/bindings/net/ethernet-phy.yaml | 148 +- Documentation/devicetree/bindings/net/phy.txt | 80 +- 2 files changed, 149 insertions(+), 79 deletions(-) create mode 100644 Documentation/devicetree/bindings/net/ethernet-phy.yaml diff --git a/Documentation/devicetree/bindings/net/ethernet-phy.yaml b/Documentation/devicetree/bindings/net/ethernet-phy.yaml new file mode 100644 index ..eb79ee6db977 --- /dev/null +++ b/Documentation/devicetree/bindings/net/ethernet-phy.yaml @@ -0,0 +1,148 @@ +# SPDX-License-Identifier: GPL-2.0 +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/net/ethernet-phy.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Ethernet PHY Generic Binding + +maintainers: + - David S. Miller + +properties: + $nodename: +pattern: "^ethernet-phy(@[a-f0-9])?$" + + compatible: +oneOf: + - const: ethernet-phy-ieee802.3-c22 +description: PHYs that implement IEEE802.3 clause 22 + - const: ethernet-phy-ieee802.3-c45 +description: PHYs that implement IEEE802.3 clause 45 + - pattern: "^ethernet-phy-id[a-f0-9]{4}\\.[a-f0-9]{4}$" +description: + The first group of digits is the 16 bit Phy Identifier 1 + register, this is the chip vendor OUI bits 3:18. The + second group of digits is the Phy Identifier 2 register, + this is the chip vendor OUI bits 19:24, followed by 10 + bits of a vendor specific ID. + + reg: +maxItems: 1 +minimum: 0 +maximum: 31 +description: + The ID number for the PHY. + + interrupts: +maxItems: 1 + + max-speed: +enum: + - 10 + - 100 + - 1000 +description: + Maximum PHY supported speed in Mbits / seconds. + + broken-turn-around: +$ref: /schemas/types.yaml#definitions/flag +description: + If set, indicates the PHY device does not correctly release + the turn around line low at the end of a MDIO transaction. + + enet-phy-lane-swap: +$ref: /schemas/types.yaml#definitions/flag +description: + If set, indicates the PHY will swap the TX/RX lanes to + compensate for the board being designed with the lanes + swapped. + + eee-broken-100tx: +$ref: /schemas/types.yaml#definitions/flag +description: + Mark the corresponding energy efficient ethernet mode as + broken and request the ethernet to stop advertising it. + + eee-broken-1000t: +$ref: /schemas/types.yaml#definitions/flag +description: + Mark the corresponding energy efficient ethernet mode as + broken and request the ethernet to stop advertising it. + + eee-broken-10gt: +$ref: /schemas/types.yaml#definitions/flag +description: + Mark the corresponding energy efficient ethernet mode as + broken and request the ethernet to stop advertising it. + + eee-broken-1000kx: +$ref: /schemas/types.yaml#definitions/flag +description: + Mark the corresponding energy efficient ethernet mode as + broken and request the ethernet to stop advertising it. + + eee-broken-10gkx4: +$ref: /schemas/types.yaml#definitions/flag +description: + Mark the corresponding energy efficient ethernet mode as + broken and request the ethernet to stop advertising it. + + eee-broken-10gkr: +$ref: /schemas/types.yaml#definitions/flag +description: + Mark the corresponding energy efficient ethernet mode as + broken and request the ethernet to stop advertising it. + + phy-is-integrated: +$ref: /schemas/types.yaml#definitions/flag +description: + If set, indicates that the PHY is integrated into the same + physical package as the Ethernet MAC. If needed, muxers + should be configured to ensure the integrated PHY is + used. The absence of this property indicates the muxers + should be configured so that the external PHY is used. + + resets: +maxItems: 1 + + reset-names: +const: phy + + reset-gpios: +description: + The GPIO phandle and specifier for the PHY reset signal. + + reset-assert-us: +description: + Delay after the reset was asserted in microseconds. If this + property is missing the delay will be skipped. + + reset-deassert-us: +description: + Delay after the reset was deasserted in microseconds. If + this property is missing the delay will be skipped. + +required: + - reg + - interrupts + +examples: + - | +ethernet { +#address-cells = <1>; +#size-cells = <0>; + +ethernet-phy@0 { +compatible = "ethernet-phy-id0141.0e90", "ethernet-phy-ieee802.3-c22"; +interrupt-parent = <&PIC>; +interrupts = <35 1>; +reg = <0>; + +resets = <&rst 8>; +r
[PATCH 3/8] dt-bindings: net: phy: The interrupt property is not mandatory
Unlike what was initially claimed in the PHY binding, the interrupt property of a PHY can be omitted, and the OS will turn to polling instead. Document that. Signed-off-by: Maxime Ripard --- Documentation/devicetree/bindings/net/ethernet-phy.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/Documentation/devicetree/bindings/net/ethernet-phy.yaml b/Documentation/devicetree/bindings/net/ethernet-phy.yaml index eb79ee6db977..d2cc4b46f6dc 100644 --- a/Documentation/devicetree/bindings/net/ethernet-phy.yaml +++ b/Documentation/devicetree/bindings/net/ethernet-phy.yaml @@ -125,7 +125,6 @@ properties: required: - reg - - interrupts examples: - | -- git-series 0.9.1
[PATCH 1/8] dt-bindings: net: Add YAML schemas for the generic Ethernet options
The Ethernet controllers have a good number of generic options that can be needed in a device tree. Add a YAML schemas for those. Signed-off-by: Maxime Ripard --- Documentation/devicetree/bindings/net/ethernet-controller.yaml | 197 +++- Documentation/devicetree/bindings/net/ethernet.txt | 68 +-- Documentation/devicetree/bindings/net/fixed-link.txt | 55 +-- 3 files changed, 199 insertions(+), 121 deletions(-) create mode 100644 Documentation/devicetree/bindings/net/ethernet-controller.yaml diff --git a/Documentation/devicetree/bindings/net/ethernet-controller.yaml b/Documentation/devicetree/bindings/net/ethernet-controller.yaml new file mode 100644 index ..1c6e9e755481 --- /dev/null +++ b/Documentation/devicetree/bindings/net/ethernet-controller.yaml @@ -0,0 +1,197 @@ +# SPDX-License-Identifier: GPL-2.0 +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/net/ethernet-controller.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Ethernet Controller Generic Binding + +maintainers: + - David S. Miller + +properties: + $nodename: +pattern: "^ethernet(@.*)?$" + + local-mac-address: +allOf: + - $ref: /schemas/types.yaml#definitions/uint8-array + - minItems: 6 +maxItems: 6 +description: + Specifies the MAC address that was assigned to the network device. + + mac-address: +allOf: + - $ref: /schemas/types.yaml#definitions/uint8-array + - minItems: 6 +maxItems: 6 +description: + Specifies the MAC address that was last used by the boot + program; should be used in cases where the MAC address assigned + to the device by the boot program is different from the + local-mac-address property. + + max-frame-size: +$ref: /schemas/types.yaml#definitions/uint32 +description: + Maximum transfer unit (IEEE defined MTU), rather than the + maximum frame size (there\'s contradiction in the Devicetree + Specification). + + max-speed: +$ref: /schemas/types.yaml#definitions/uint32 +description: + Specifies maximum speed in Mbit/s supported by the device. + + nvmem-cells: +maxItems: 1 +description: + Reference to an nvmem node for the MAC address + + nvmem-cells-names: +const: mac-address + + phy-connection-type: +description: + Operation mode of the PHY interface +oneOf: + - const: internal +description: + there is not a standard bus between the MAC and the PHY, + something proprietary is being used to embed the PHY in the + MAC. + - const: mii + - const: gmii + - const: sgmii + - const: qsgmii + - const: tbi + - const: rev-mii + - const: rmii + - const: rgmii +description: + RX and TX delays are added by the MAC when required + - const: rgmii-id +description: + RGMII with internal RX and TX delays provided by the PHY, + the MAC should not add the RX or TX delays in this case + - const: rgmii-rxid +description: + RGMII with internal RX delay provided by the PHY, the MAC + should not add an RX delay in this case + - const: rgmii-txid +description: + RGMII with internal TX delay provided by the PHY, the MAC + should not add an TX delay in this case + - const: rtbi + - const: smii + - const: xgmii + - const: trgmii + - const: 1000base-x + - const: 2500base-x + - const: rxaui + - const: xaui + - const: 10gbase-kr +description: + 10GBASE-KR, XFI, SFI + + phy-mode: +$ref: "#/properties/phy-connection-type" +description: + Deprecated in favor of phy-connection-type + + phy-handle: +$ref: /schemas/types.yaml#definitions/phandle +description: + Specifies a reference to a node representing a PHY device. + + phy: +$ref: "#/properties/phy-handle" +description: + Deprecated in favor of phy-handle + + phy-device: +$ref: "#/properties/phy-handle" +description: + Deprecated in favor of phy-handle + + rx-fifo-depth: +$ref: /schemas/types.yaml#definitions/uint32 +description: + The size of the controller\'s receive fifo in bytes. This is used + for components that can have configurable receive fifo sizes, + and is useful for determining certain configuration settings + such as flow control thresholds. + + tx-fifo-depth: +$ref: /schemas/types.yaml#definitions/uint32 +description: + The size of the controller\'s transmit fifo in bytes. This + is used for components that can have configurable fifo sizes. + + managed: +allOf: + - $ref: /schemas/types.yaml#definitions/string + - default: auto +enum: + - auto + - in-band-status +description: + Specifies the PHY management type. If auto is set and fixed-link + is not specified, it us
[PATCH 4/8] dt-bindings: net: sun4i-emac: Convert the binding to a schemas
Switch our Allwinner A10 EMAC controller binding to a YAML schema to enable the DT validation. Signed-off-by: Maxime Ripard --- Documentation/devicetree/bindings/net/allwinner,sun4i-a10-emac.yaml | 55 +++ Documentation/devicetree/bindings/net/allwinner,sun4i-emac.txt | 19 --- 2 files changed, 55 insertions(+), 19 deletions(-) create mode 100644 Documentation/devicetree/bindings/net/allwinner,sun4i-a10-emac.yaml delete mode 100644 Documentation/devicetree/bindings/net/allwinner,sun4i-emac.txt diff --git a/Documentation/devicetree/bindings/net/allwinner,sun4i-a10-emac.yaml b/Documentation/devicetree/bindings/net/allwinner,sun4i-a10-emac.yaml new file mode 100644 index ..b5d82d0a59d8 --- /dev/null +++ b/Documentation/devicetree/bindings/net/allwinner,sun4i-a10-emac.yaml @@ -0,0 +1,55 @@ +# SPDX-License-Identifier: GPL-2.0 +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/net/allwinner,sun4i-a10-emac.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Allwinner A10 EMAC Ethernet Controller Device Tree Bindings + +allOf: + - $ref: "ethernet-controller.yaml#" + +maintainers: + - Chen-Yu Tsai + - Maxime Ripard + +properties: + compatible: +const: allwinner,sun4i-a10-emac + + reg: +maxItems: 1 + + interrupts: +maxItems: 1 + + clocks: +maxItems: 1 + + allwinner,sram: +description: Phandle to the device SRAM +$ref: /schemas/types.yaml#/definitions/phandle-array + +required: + - compatible + - reg + - interrupts + - clocks + - phy + - allwinner,sram + +examples: + - | +emac: ethernet@1c0b000 { +compatible = "allwinner,sun4i-a10-emac"; +reg = <0x01c0b000 0x1000>; +interrupts = <55>; +clocks = <&ahb_gates 17>; +phy = <&phy0>; +}; + +# FIXME: We should set it, but it would report all the generic +# properties as additional properties. +# additionalProperties: false + +... diff --git a/Documentation/devicetree/bindings/net/allwinner,sun4i-emac.txt b/Documentation/devicetree/bindings/net/allwinner,sun4i-emac.txt deleted file mode 100644 index e98118aef5f6.. --- a/Documentation/devicetree/bindings/net/allwinner,sun4i-emac.txt +++ /dev/null @@ -1,19 +0,0 @@ -* Allwinner EMAC ethernet controller - -Required properties: -- compatible: should be "allwinner,sun4i-a10-emac" (Deprecated: - "allwinner,sun4i-emac") -- reg: address and length of the register set for the device. -- interrupts: interrupt for the device -- phy: see ethernet.txt file in the same directory. -- clocks: A phandle to the reference clock for this device - -Example: - -emac: ethernet@1c0b000 { - compatible = "allwinner,sun4i-a10-emac"; - reg = <0x01c0b000 0x1000>; - interrupts = <55>; - clocks = <&ahb_gates 17>; - phy = <&phy0>; -}; -- git-series 0.9.1
[PATCH 5/8] dt-bindings: net: sun4i-mdio: Convert the binding to a schemas
Switch our Allwinner A10 MDIO controller binding to a YAML schema to enable the DT validation. Signed-off-by: Maxime Ripard --- Documentation/devicetree/bindings/net/allwinner,sun4i-a10-mdio.yaml | 55 +++ Documentation/devicetree/bindings/net/allwinner,sun4i-mdio.txt | 27 --- 2 files changed, 55 insertions(+), 27 deletions(-) create mode 100644 Documentation/devicetree/bindings/net/allwinner,sun4i-a10-mdio.yaml delete mode 100644 Documentation/devicetree/bindings/net/allwinner,sun4i-mdio.txt diff --git a/Documentation/devicetree/bindings/net/allwinner,sun4i-a10-mdio.yaml b/Documentation/devicetree/bindings/net/allwinner,sun4i-a10-mdio.yaml new file mode 100644 index ..32c0fdc57d35 --- /dev/null +++ b/Documentation/devicetree/bindings/net/allwinner,sun4i-a10-mdio.yaml @@ -0,0 +1,55 @@ +# SPDX-License-Identifier: GPL-2.0 +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/net/allwinner,sun4i-a10-mdio.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Allwinner A10 MDIO Controller Device Tree Bindings + +maintainers: + - Chen-Yu Tsai + - Maxime Ripard + +properties: + "#address-cells": +const: 1 + + "#size-cells": +const: 0 + + compatible: +enum: + - allwinner,sun4i-a10-mdio + + # Deprecated + - allwinner,sun4i-mdio + + reg: +maxItems: 1 + + phy-supply: +description: PHY regulator + +required: + - compatible + - reg + +examples: + - | +mdio@1c0b080 { +compatible = "allwinner,sun4i-a10-mdio"; +reg = <0x01c0b080 0x14>; +#address-cells = <1>; +#size-cells = <0>; +phy-supply = <®_emac_3v3>; + +phy0: ethernet-phy@0 { +reg = <0>; +}; +}; + +# FIXME: We should set it, but it would report all the generic +# properties as additional properties. +# additionalProperties: false + +... diff --git a/Documentation/devicetree/bindings/net/allwinner,sun4i-mdio.txt b/Documentation/devicetree/bindings/net/allwinner,sun4i-mdio.txt deleted file mode 100644 index ab5b8613b0ef.. --- a/Documentation/devicetree/bindings/net/allwinner,sun4i-mdio.txt +++ /dev/null @@ -1,27 +0,0 @@ -* Allwinner A10 MDIO Ethernet Controller interface - -Required properties: -- compatible: should be "allwinner,sun4i-a10-mdio" - (Deprecated: "allwinner,sun4i-mdio"). -- reg: address and length of the register set for the device. - -Optional properties: -- phy-supply: phandle to a regulator if the PHY needs one - -Example at the SoC level: -mdio@1c0b080 { - compatible = "allwinner,sun4i-a10-mdio"; - reg = <0x01c0b080 0x14>; - #address-cells = <1>; - #size-cells = <0>; -}; - -And at the board level: - -mdio@1c0b080 { - phy-supply = <®_emac_3v3>; - - phy0: ethernet-phy@0 { - reg = <0>; - }; -}; -- git-series 0.9.1
[PATCH 6/8] dt-bindings: net: stmmac: Convert the binding to a schemas
Switch the STMMAC / Synopsys DesignWare MAC controller binding to a YAML schema to enable the DT validation. Signed-off-by: Maxime Ripard --- Documentation/devicetree/bindings/net/snps,dwmac.yaml | 344 +++- Documentation/devicetree/bindings/net/stmmac.txt | 179 +-- 2 files changed, 345 insertions(+), 178 deletions(-) create mode 100644 Documentation/devicetree/bindings/net/snps,dwmac.yaml diff --git a/Documentation/devicetree/bindings/net/snps,dwmac.yaml b/Documentation/devicetree/bindings/net/snps,dwmac.yaml new file mode 100644 index ..be3ada5121e1 --- /dev/null +++ b/Documentation/devicetree/bindings/net/snps,dwmac.yaml @@ -0,0 +1,344 @@ +# SPDX-License-Identifier: GPL-2.0 +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/net/snps,dwmac.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Synopsys DesignWare MAC Device Tree Bindings + +maintainers: + - Alexandre Torgue + - Giuseppe Cavallaro + - Jose Abreu + +properties: + compatible: +oneOf: + - const: snps,dwmac + - const: snps,dwmac-3.50a + - const: snps,dwmac-3.610 + - const: snps,dwmac-3.70a + - const: snps,dwmac-3.710 + - const: snps,dwmac-4.00 + - const: snps,dwmac-4.10a + - const: snps,dwxgmac + - const: snps,dwxgmac-2.10 + - const: st,spear600-gmac +description: Deprecated + + reg: +maxItems: 1 + + interrupts: +minItems: 1 +maxItems: 3 +items: + - description: Combined signal for various interrupt events + - description: The interrupt to manage the remote wake-up packet detection + - description: The interrupt that occurs when Rx exits the LPI state + + interrupt-names: +minItems: 1 +maxItems: 3 +items: + - const: macirq + - const: eth_wake_irq + - const: eth_lpi + + clocks: +minItems: 1 +maxItems: 3 +items: + - description: GMAC main clock + - description: Peripheral registers interface clock + - description: + PTP reference clock. This clock is used for programming the + Timestamp Addend Register. If not passed then the system + clock will be used and this is fine on some platforms. + + clock-names: +additionalItems: true +contains: + enum: +- stmmaceth +- pclk +- ptp_ref + + resets: +maxItems: 1 +description: + MAC Reset signal. + + reset-names: +const: stmmaceth + + snps,axi-config: +$ref: /schemas/types.yaml#definitions/phandle +description: + AXI BUS Mode parameters. Phandle to a node that can contain the + following properties +* snps,lpi_en, enable Low Power Interface +* snps,xit_frm, unlock on WoL +* snps,wr_osr_lmt, max write outstanding req. limit +* snps,rd_osr_lmt, max read outstanding req. limit +* snps,kbbe, do not cross 1KiB boundary. +* snps,blen, this is a vector of supported burst length. +* snps,fb, fixed-burst +* snps,mb, mixed-burst +* snps,rb, rebuild INCRx Burst + + snps,mtl-rx-config: +$ref: /schemas/types.yaml#definitions/phandle +description: + Multiple RX Queues parameters. Phandle to a node that can + contain the following properties +* snps,rx-queues-to-use, number of RX queues to be used in the + driver +* Choose one of these RX scheduling algorithms + * snps,rx-sched-sp, Strict priority + * snps,rx-sched-wsp, Weighted Strict priority +* For each RX queue + * Choose one of these modes +* snps,dcb-algorithm, Queue to be enabled as DCB +* snps,avb-algorithm, Queue to be enabled as AVB + * snps,map-to-dma-channel, Channel to map + * Specifiy specific packet routing +* snps,route-avcp, AV Untagged Control packets +* snps,route-ptp, PTP Packets +* snps,route-dcbcp, DCB Control Packets +* snps,route-up, Untagged Packets +* snps,route-multi-broad, Multicast & Broadcast Packets + * snps,priority, RX queue priority (Range 0x0 to 0xF) + + snps,mtl-tx-config: +$ref: /schemas/types.yaml#definitions/phandle +description: + Multiple TX Queues parameters. Phandle to a node that can + contain the following properties +* snps,tx-queues-to-use, number of TX queues to be used in the + driver +* Choose one of these TX scheduling algorithms + * snps,tx-sched-wrr, Weighted Round Robin + * snps,tx-sched-wfq, Weighted Fair Queuing + * snps,tx-sched-dwrr, Deficit Weighted Round Robin + * snps,tx-sched-sp, Strict priority +* For each TX queue + * snps,weight, TX queue weight (if using a DCB weight +algorithm) + * Choose one of these modes +* snps,dcb-algorithm, TX queue will be working in DCB +* snps,avb-algorithm, TX queue will
[PATCH 8/8] dt-bindings: net: sun8i-emac: Convert the binding to a schemas
Switch our Allwinner H3 EMAC controller binding to a YAML schema to enable the DT validation. Since that controller is based on a Synopsys IP, let's add the validation to that schemas with a bunch of conditionals. Signed-off-by: Maxime Ripard --- Documentation/devicetree/bindings/net/dwmac-sun8i.txt | 201 +-- Documentation/devicetree/bindings/net/snps,dwmac.yaml | 342 +++- 2 files changed, 342 insertions(+), 201 deletions(-) delete mode 100644 Documentation/devicetree/bindings/net/dwmac-sun8i.txt diff --git a/Documentation/devicetree/bindings/net/dwmac-sun8i.txt b/Documentation/devicetree/bindings/net/dwmac-sun8i.txt deleted file mode 100644 index 54c66d0611cb.. --- a/Documentation/devicetree/bindings/net/dwmac-sun8i.txt +++ /dev/null @@ -1,201 +0,0 @@ -* Allwinner sun8i GMAC ethernet controller - -This device is a platform glue layer for stmmac. -Please see stmmac.txt for the other unchanged properties. - -Required properties: -- compatible: must be one of the following string: - "allwinner,sun8i-a83t-emac" - "allwinner,sun8i-h3-emac" - "allwinner,sun8i-r40-gmac" - "allwinner,sun8i-v3s-emac" - "allwinner,sun50i-a64-emac" - "allwinner,sun50i-h6-emac", "allwinner-sun50i-a64-emac" -- reg: address and length of the register for the device. -- interrupts: interrupt for the device -- interrupt-names: must be "macirq" -- clocks: A phandle to the reference clock for this device -- clock-names: must be "stmmaceth" -- resets: A phandle to the reset control for this device -- reset-names: must be "stmmaceth" -- phy-mode: See ethernet.txt -- phy-handle: See ethernet.txt -- syscon: A phandle to the device containing the EMAC or GMAC clock register - -Optional properties: -- allwinner,tx-delay-ps: TX clock delay chain value in ps. -Range is 0-700. Default is 0. -Unavailable for allwinner,sun8i-r40-gmac -- allwinner,rx-delay-ps: RX clock delay chain value in ps. -Range is 0-3100. Default is 0. -Range is 0-700 for allwinner,sun8i-r40-gmac -Both delay properties need to be a multiple of 100. They control the -clock delay for external RGMII PHY. They do not apply to the internal -PHY or external non-RGMII PHYs. - -Optional properties for the following compatibles: - - "allwinner,sun8i-h3-emac", - - "allwinner,sun8i-v3s-emac": -- allwinner,leds-active-low: EPHY LEDs are active low - -Required child node of emac: -- mdio bus node: should be named mdio with compatible "snps,dwmac-mdio" - -Required properties of the mdio node: -- #address-cells: shall be 1 -- #size-cells: shall be 0 - -The device node referenced by "phy" or "phy-handle" must be a child node -of the mdio node. See phy.txt for the generic PHY bindings. - -The following compatibles require that the emac node have a mdio-mux child -node called "mdio-mux": - - "allwinner,sun8i-h3-emac" - - "allwinner,sun8i-v3s-emac": -Required properties for the mdio-mux node: - - compatible = "allwinner,sun8i-h3-mdio-mux" - - mdio-parent-bus: a phandle to EMAC mdio - - one child mdio for the integrated mdio with the compatible -"allwinner,sun8i-h3-mdio-internal" - - one child mdio for the external mdio if present (V3s have none) -Required properties for the mdio-mux children node: - - reg: 1 for internal MDIO bus, 2 for external MDIO bus - -The following compatibles require a PHY node representing the integrated -PHY, under the integrated MDIO bus node if an mdio-mux node is used: - - "allwinner,sun8i-h3-emac", - - "allwinner,sun8i-v3s-emac": - -Additional information regarding generic multiplexer properties can be found -at Documentation/devicetree/bindings/net/mdio-mux.txt - -Required properties of the integrated phy node: -- clocks: a phandle to the reference clock for the EPHY -- resets: a phandle to the reset control for the EPHY -- Must be a child of the integrated mdio - -Example with integrated PHY: -emac: ethernet@1c0b000 { - compatible = "allwinner,sun8i-h3-emac"; - syscon = <&syscon>; - reg = <0x01c0b000 0x104>; - interrupts = ; - interrupt-names = "macirq"; - resets = <&ccu RST_BUS_EMAC>; - reset-names = "stmmaceth"; - clocks = <&ccu CLK_BUS_EMAC>; - clock-names = "stmmaceth"; - - phy-handle = <&int_mii_phy>; - phy-mode = "mii"; - allwinner,leds-active-low; - - mdio: mdio { - #address-cells = <1>; - #size-cells = <0>; - compatible = "snps,dwmac-mdio"; - }; - - mdio-mux { - compatible = "mdio-mux", "allwinner,sun8i-h3-mdio-mux"; - #address-cells = <1>; - #size-cells = <0>; - - mdio-parent-bus = <&mdio>; - - int_mdio: mdio@1 { - compatible = "allwinner,sun8i-h3-mdio-internal"; - reg = <1>; -
[PATCH 7/8] dt-bindings: net: sun7i-gmac: Convert the binding to a schemas
Switch our Allwinner A20 GMAC controller binding to a YAML schema to enable the DT validation. Since that controller is based on a Synopsys IP, let's add the validation to that schemas with a bunch of conditionals. Signed-off-by: Maxime Ripard --- Documentation/devicetree/bindings/net/allwinner,sun7i-a20-gmac.txt | 27 --- Documentation/devicetree/bindings/net/snps,dwmac.yaml | 45 + 2 files changed, 45 insertions(+), 27 deletions(-) delete mode 100644 Documentation/devicetree/bindings/net/allwinner,sun7i-a20-gmac.txt diff --git a/Documentation/devicetree/bindings/net/allwinner,sun7i-a20-gmac.txt b/Documentation/devicetree/bindings/net/allwinner,sun7i-a20-gmac.txt deleted file mode 100644 index 8b3f953656e3.. --- a/Documentation/devicetree/bindings/net/allwinner,sun7i-a20-gmac.txt +++ /dev/null @@ -1,27 +0,0 @@ -* Allwinner GMAC ethernet controller - -This device is a platform glue layer for stmmac. -Please see stmmac.txt for the other unchanged properties. - -Required properties: - - compatible: Should be "allwinner,sun7i-a20-gmac" - - clocks: Should contain the GMAC main clock, and tx clock - The tx clock type should be "allwinner,sun7i-a20-gmac-clk" - - clock-names: Should contain the clock names "stmmaceth", - and "allwinner_gmac_tx" - -Optional properties: -- phy-supply: phandle to a regulator if the PHY needs one - -Examples: - - gmac: ethernet@1c5 { - compatible = "allwinner,sun7i-a20-gmac"; - reg = <0x01c5 0x1>, - <0x01c20164 0x4>; - interrupts = <0 85 1>; - interrupt-names = "macirq"; - clocks = <&ahb_gates 49>, <&gmac_tx>; - clock-names = "stmmaceth", "allwinner_gmac_tx"; - phy-mode = "mii"; - }; diff --git a/Documentation/devicetree/bindings/net/snps,dwmac.yaml b/Documentation/devicetree/bindings/net/snps,dwmac.yaml index be3ada5121e1..d213c32ef153 100644 --- a/Documentation/devicetree/bindings/net/snps,dwmac.yaml +++ b/Documentation/devicetree/bindings/net/snps,dwmac.yaml @@ -14,6 +14,7 @@ maintainers: properties: compatible: oneOf: + - const: allwinner,sun7i-a20-gmac - const: snps,dwmac - const: snps,dwmac-3.50a - const: snps,dwmac-3.610 @@ -232,6 +233,7 @@ allOf: properties: compatible: enum: +- allwinner,sun7i-a20-gmac - snps,dwxgmac - snps,dwxgmac-2.10 - st,spear600-gmac @@ -273,6 +275,37 @@ allOf: Enables the TSO feature otherwise it will be managed by MAC HW capability register. Only for GMAC4 and newer. + - if: + properties: +compatible: + const: allwinner,sun7i-a20-gmac + +then: + properties: +interrupts: + maxItems: 1 + +interrupt-names: + const: macirq + +clocks: + items: +- description: GMAC main clock +- description: TX clock + +clock-names: + items: +- const: stmmaceth +- const: allwinner_gmac_tx + +phy-supply: + description: +PHY regulator + + required: +- clocks +- clock-names + examples: - | stmmac_axi_setup: stmmac-axi-config { @@ -337,6 +370,18 @@ examples: }; }; + - | +gmac: ethernet@1c5 { +compatible = "allwinner,sun7i-a20-gmac"; +reg = <0x01c5 0x1>, + <0x01c20164 0x4>; +interrupts = <0 85 1>; +interrupt-names = "macirq"; +clocks = <&ahb_gates 49>, <&gmac_tx>; +clock-names = "stmmaceth", "allwinner_gmac_tx"; +phy-mode = "mii"; +}; + # FIXME: We should set it, but it would report all the generic # properties as additional properties. # additionalProperties: false -- git-series 0.9.1
RE: [PATCH 6/8] dt-bindings: net: stmmac: Convert the binding to a schemas
From: Maxime Ripard Date: Thu, May 23, 2019 at 10:56:49 > Switch the STMMAC / Synopsys DesignWare MAC controller binding to a YAML > schema to enable the DT validation. > > Signed-off-by: Maxime Ripard How exactly can I see the final results of this ? Do you have any link ? (I'm no expert in YAML at all). Thanks, Jose Miguel Abreu
[PATCH bpf-next v2 0/3] tools: bpftool: add an option for debug output from libbpf and verifier
Hi, This series adds an option to bpftool to make it print additional information via libbpf and the kernel verifier when attempting to load programs. A new API function is added to libbpf in order to pass the log_level from bpftool with the bpf_object__* part of the API. Options for a finer control over the log levels to use for libbpf and the verifier could be added in the future, if desired. v2: - Do not add distinct options for libbpf and verifier logs, just keep the one that sets all log levels to their maximum. Rename the option. - Do not offer a way to pick desired log levels. The choice is "use the option to print all logs" or "stick with the defaults". - Do not export BPF_LOG_* flags to user header. - Update all man pages (most bpftool operations use libbpf and may print libbpf logs). Verifier logs are only used when attempting to load programs for now, so bpftool-prog.rst and bpftool.rst remain the only pages updated in that regard. Previous discussion available at: https://lore.kernel.org/bpf/20190429095227.9745-1-quentin.mon...@netronome.com/ Quentin Monnet (3): tools: bpftool: add -d option to get debug output from libbpf libbpf: add bpf_object__load_xattr() API function to pass log_level tools: bpftool: make -d option print debug output from verifier .../bpf/bpftool/Documentation/bpftool-btf.rst | 4 +++ .../bpftool/Documentation/bpftool-cgroup.rst | 4 +++ .../bpftool/Documentation/bpftool-feature.rst | 4 +++ .../bpf/bpftool/Documentation/bpftool-map.rst | 4 +++ .../bpf/bpftool/Documentation/bpftool-net.rst | 4 +++ .../bpftool/Documentation/bpftool-perf.rst| 4 +++ .../bpftool/Documentation/bpftool-prog.rst| 5 tools/bpf/bpftool/Documentation/bpftool.rst | 4 +++ tools/bpf/bpftool/bash-completion/bpftool | 2 +- tools/bpf/bpftool/main.c | 16 ++- tools/bpf/bpftool/main.h | 1 + tools/bpf/bpftool/prog.c | 27 --- tools/lib/bpf/Makefile| 2 +- tools/lib/bpf/libbpf.c| 20 +++--- tools/lib/bpf/libbpf.h| 6 + tools/lib/bpf/libbpf.map | 5 16 files changed, 96 insertions(+), 16 deletions(-) -- 2.17.1
[PATCH bpf-next v2 1/3] tools: bpftool: add -d option to get debug output from libbpf
libbpf has three levels of priority for output messages: warn, info, debug. By default, debug output is not printed to the console. Add a new "--debug" (short name: "-d") option to bpftool to print libbpf logs for all three levels. Internally, we simply use the function provided by libbpf to replace the default printing function by one that prints logs regardless of their level. v2: - Remove the possibility to select the log-levels to use (v1 offered a combination of "warn", "info" and "debug"). - Rename option and offer a short name: -d|--debug. - Add option description to all bpftool manual pages (instead of bpftool-prog.rst only), as all commands use libbpf. Signed-off-by: Quentin Monnet Reviewed-by: Jakub Kicinski --- tools/bpf/bpftool/Documentation/bpftool-btf.rst| 4 tools/bpf/bpftool/Documentation/bpftool-cgroup.rst | 4 .../bpf/bpftool/Documentation/bpftool-feature.rst | 4 tools/bpf/bpftool/Documentation/bpftool-map.rst| 4 tools/bpf/bpftool/Documentation/bpftool-net.rst| 4 tools/bpf/bpftool/Documentation/bpftool-perf.rst | 4 tools/bpf/bpftool/Documentation/bpftool-prog.rst | 4 tools/bpf/bpftool/Documentation/bpftool.rst| 3 +++ tools/bpf/bpftool/bash-completion/bpftool | 2 +- tools/bpf/bpftool/main.c | 14 +- 10 files changed, 45 insertions(+), 2 deletions(-) diff --git a/tools/bpf/bpftool/Documentation/bpftool-btf.rst b/tools/bpf/bpftool/Documentation/bpftool-btf.rst index 2dbc1413fabd..00668df1bf7a 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-btf.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-btf.rst @@ -67,6 +67,10 @@ OPTIONS -p, --pretty Generate human-readable JSON output. Implies **-j**. + -d, --debug + Print all logs available from libbpf, including debug-level + information. + EXAMPLES **# bpftool btf dump id 1226** diff --git a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst index ac26876389c2..36807735e2a5 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst @@ -113,6 +113,10 @@ OPTIONS -f, --bpffs Show file names of pinned programs. + -d, --debug + Print all logs available from libbpf, including debug-level + information. + EXAMPLES | diff --git a/tools/bpf/bpftool/Documentation/bpftool-feature.rst b/tools/bpf/bpftool/Documentation/bpftool-feature.rst index 14180e887082..4d08f35034a2 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-feature.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-feature.rst @@ -73,6 +73,10 @@ OPTIONS -p, --pretty Generate human-readable JSON output. Implies **-j**. + -d, --debug + Print all logs available from libbpf, including debug-level + information. + SEE ALSO **bpf**\ (2), diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst index 13ef27b39f20..490b4501cb6e 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-map.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst @@ -152,6 +152,10 @@ OPTIONS Do not automatically attempt to mount any virtual file system (such as tracefs or BPF virtual file system) when necessary. + -d, --debug + Print all logs available from libbpf, including debug-level + information. + EXAMPLES **# bpftool map show** diff --git a/tools/bpf/bpftool/Documentation/bpftool-net.rst b/tools/bpf/bpftool/Documentation/bpftool-net.rst index 934580850f42..d8e5237a2085 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-net.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-net.rst @@ -65,6 +65,10 @@ OPTIONS -p, --pretty Generate human-readable JSON output. Implies **-j**. + -d, --debug + Print all logs available from libbpf, including debug-level + information. + EXAMPLES diff --git a/tools/bpf/bpftool/Documentation/bpftool-perf.rst b/tools/bpf/bpftool/Documentation/bpftool-perf.rst index 0c7576523a21..e252bd0bc434 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-perf.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-perf.rst @@ -53,6 +53,10 @@ OPTIONS -p, --pretty Generate human-readable JSON output. Implies **-j**. + -d, --debug + Print all logs available from libbpf, including debug-level + information. + EXAMPLES diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst index e8118544d118..9a92614569e6 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst +++ b/
[PATCH bpf-next v2 3/3] tools: bpftool: make -d option print debug output from verifier
The "-d" option is used to require all logs available for bpftool. So far it meant telling libbpf to print even debug-level information. But there is another source of info that can be made more verbose: when we attemt to load programs with bpftool, we can pass a log_level parameter to the verifier in order to control the amount of information that is printed to the console. Reuse the "-d" option to print all information the verifier can tell. At this time, this means logs related to BPF_LOG_LEVEL1, BPF_LOG_LEVEL2 and BPF_LOG_STATS. As mentioned in the discussion on the first version of this set, these macros are internal to the kernel (include/linux/bpf_verifier.h) and are not meant to be part of the stable user API, therefore we simply use the related constants to print whatever we can at this time, without trying to tell users what is log_level1 or what is statistics. Verifier logs are only used when loading programs for now (in the future: for loading BTF objects with bpftool?), so bpftool.rst and bpftool-prog.rst are the only man pages to get the update. v2: - Remove the possibility to select the log levels to use (v1 offered a combination of "log_level1", "log_level2" and "stats"). - The macros from kernel header bpf_verifier.h are not used (and therefore not moved to UAPI header). - In v1 this was a distinct option, but is now merged in the only "-d" switch to activate libbpf and verifier debug-level logs all at the same time. Signed-off-by: Quentin Monnet Reviewed-by: Jakub Kicinski --- .../bpftool/Documentation/bpftool-prog.rst| 5 ++-- tools/bpf/bpftool/Documentation/bpftool.rst | 5 ++-- tools/bpf/bpftool/main.c | 2 ++ tools/bpf/bpftool/main.h | 1 + tools/bpf/bpftool/prog.c | 27 --- 5 files changed, 26 insertions(+), 14 deletions(-) diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst index 9a92614569e6..228a5c863cc7 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst @@ -175,8 +175,9 @@ OPTIONS (such as tracefs or BPF virtual file system) when necessary. -d, --debug - Print all logs available from libbpf, including debug-level - information. + Print all logs available, even debug-level information. This + includes logs from libbpf as well as from the verifier, when + attempting to load programs. EXAMPLES diff --git a/tools/bpf/bpftool/Documentation/bpftool.rst b/tools/bpf/bpftool/Documentation/bpftool.rst index 43dba0717953..6a9c52ef84a9 100644 --- a/tools/bpf/bpftool/Documentation/bpftool.rst +++ b/tools/bpf/bpftool/Documentation/bpftool.rst @@ -67,8 +67,9 @@ OPTIONS (such as tracefs or BPF virtual file system) when necessary. -d, --debug - Print all logs available from libbpf, including debug-level - information. + Print all logs available, even debug-level information. This + includes logs from libbpf as well as from the verifier, when + attempting to load programs. SEE ALSO diff --git a/tools/bpf/bpftool/main.c b/tools/bpf/bpftool/main.c index d74293938a05..4879f6395c7e 100644 --- a/tools/bpf/bpftool/main.c +++ b/tools/bpf/bpftool/main.c @@ -26,6 +26,7 @@ bool pretty_output; bool json_output; bool show_pinned; bool block_mount; +bool verifier_logs; int bpf_flags; struct pinned_obj_table prog_table; struct pinned_obj_table map_table; @@ -373,6 +374,7 @@ int main(int argc, char **argv) break; case 'd': libbpf_set_print(print_all_levels); + verifier_logs = true; break; default: p_err("unrecognized option '%s'", argv[optind - 1]); diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h index 3d63feb7f852..28a2a5857e14 100644 --- a/tools/bpf/bpftool/main.h +++ b/tools/bpf/bpftool/main.h @@ -91,6 +91,7 @@ extern json_writer_t *json_wtr; extern bool json_output; extern bool show_pinned; extern bool block_mount; +extern bool verifier_logs; extern int bpf_flags; extern struct pinned_obj_table prog_table; extern struct pinned_obj_table map_table; diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index 26336bad0442..1f209c80d906 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -750,10 +750,11 @@ static int do_detach(int argc, char **argv) static int load_with_options(int argc, char **argv, bool first_prog_only) { - enum bpf_attach_type expected_attach_type; - struct bpf_object_open_attr attr = { - .prog_type = BPF_PROG_TYPE_UNSPEC, + struct bpf_object_load_attr load_attr = { 0 }; +
[PATCH bpf-next v2 2/3] libbpf: add bpf_object__load_xattr() API function to pass log_level
libbpf was recently made aware of the log_level attribute for programs, used to specify the level of information expected to be dumped by the verifier. Create an API function to pass additional attributes when loading a bpf_object, so we can set this log_level value in programs when loading them, and so that so that applications relying on libbpf but not calling bpf_prog_load_xattr() can also use that feature. v2: - We are in a new cycle, bump libbpf extraversion number. Signed-off-by: Quentin Monnet Reviewed-by: Jakub Kicinski --- tools/lib/bpf/Makefile | 2 +- tools/lib/bpf/libbpf.c | 20 +--- tools/lib/bpf/libbpf.h | 6 ++ tools/lib/bpf/libbpf.map | 5 + 4 files changed, 29 insertions(+), 4 deletions(-) diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile index a2aceadf68db..9312066a1ae3 100644 --- a/tools/lib/bpf/Makefile +++ b/tools/lib/bpf/Makefile @@ -3,7 +3,7 @@ BPF_VERSION = 0 BPF_PATCHLEVEL = 0 -BPF_EXTRAVERSION = 3 +BPF_EXTRAVERSION = 4 MAKEFLAGS += --no-print-directory diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 197b574406b3..1c6fb7a3201e 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -,7 +,7 @@ static bool bpf_program__is_function_storage(struct bpf_program *prog, } static int -bpf_object__load_progs(struct bpf_object *obj) +bpf_object__load_progs(struct bpf_object *obj, int log_level) { size_t i; int err; @@ -2230,6 +2230,7 @@ bpf_object__load_progs(struct bpf_object *obj) for (i = 0; i < obj->nr_programs; i++) { if (bpf_program__is_function_storage(&obj->programs[i], obj)) continue; + obj->programs[i].log_level = log_level; err = bpf_program__load(&obj->programs[i], obj->license, obj->kern_version); @@ -2381,10 +2382,14 @@ int bpf_object__unload(struct bpf_object *obj) return 0; } -int bpf_object__load(struct bpf_object *obj) +int bpf_object__load_xattr(struct bpf_object_load_attr *attr) { + struct bpf_object *obj; int err; + if (!attr) + return -EINVAL; + obj = attr->obj; if (!obj) return -EINVAL; @@ -2397,7 +2402,7 @@ int bpf_object__load(struct bpf_object *obj) CHECK_ERR(bpf_object__create_maps(obj), err, out); CHECK_ERR(bpf_object__relocate(obj), err, out); - CHECK_ERR(bpf_object__load_progs(obj), err, out); + CHECK_ERR(bpf_object__load_progs(obj, attr->log_level), err, out); return 0; out: @@ -2406,6 +2411,15 @@ int bpf_object__load(struct bpf_object *obj) return err; } +int bpf_object__load(struct bpf_object *obj) +{ + struct bpf_object_load_attr attr = { + .obj = obj, + }; + + return bpf_object__load_xattr(&attr); +} + static int check_path(const char *path) { char *cp, errmsg[STRERR_BUFSIZE]; diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index c5ff00515ce7..e1c748db44f6 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -89,8 +89,14 @@ LIBBPF_API int bpf_object__unpin_programs(struct bpf_object *obj, LIBBPF_API int bpf_object__pin(struct bpf_object *object, const char *path); LIBBPF_API void bpf_object__close(struct bpf_object *object); +struct bpf_object_load_attr { + struct bpf_object *obj; + int log_level; +}; + /* Load/unload object into/from kernel */ LIBBPF_API int bpf_object__load(struct bpf_object *obj); +LIBBPF_API int bpf_object__load_xattr(struct bpf_object_load_attr *attr); LIBBPF_API int bpf_object__unload(struct bpf_object *obj); LIBBPF_API const char *bpf_object__name(struct bpf_object *obj); LIBBPF_API unsigned int bpf_object__kversion(struct bpf_object *obj); diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 673001787cba..6ce61fa0baf3 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -164,3 +164,8 @@ LIBBPF_0.0.3 { bpf_map_freeze; btf__finalize_data; } LIBBPF_0.0.2; + +LIBBPF_0.0.4 { + global: + bpf_object__load_xattr; +} LIBBPF_0.0.3; -- 2.17.1
[PATCH bpf-next 1/3] xdp: Add bulk XDP_TX queue
XDP_TX is similar to XDP_REDIRECT as it essentially redirects packets to the device itself. XDP_REDIRECT has bulk transmit mechanism to avoid the heavy cost of indirect call but it also reduces lock acquisition on the destination device that needs locks like veth and tun. XDP_TX does not use indirect calls but drivers which require locks can benefit from the bulk transmit for XDP_TX as well. This patch adds per-cpu queues which can be used for bulk transmit on XDP_TX. I did not add functions like enqueue/flush but exposed the queue directly because we should avoid indirect calls on XDP_TX. Note that the queue must be flushed, i.e. "count" member needs to be set to 0, when a NAPI handler which used this queue exits. Otherwise packets left in the queue will be transmitted from totally unintentional devices. Signed-off-by: Toshiaki Makita --- include/net/xdp.h | 7 +++ net/core/xdp.c| 3 +++ 2 files changed, 10 insertions(+) diff --git a/include/net/xdp.h b/include/net/xdp.h index 0f25b36..30b36c8 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -84,6 +84,13 @@ struct xdp_frame { struct net_device *dev_rx; /* used by cpumap */ }; +#define XDP_TX_BULK_SIZE 16 +struct xdp_tx_bulk_queue { + struct xdp_frame *q[XDP_TX_BULK_SIZE]; + unsigned int count; +}; +DECLARE_PER_CPU(struct xdp_tx_bulk_queue, xdp_tx_bq); + /* Clear kernel pointers in xdp_frame */ static inline void xdp_scrub_frame(struct xdp_frame *frame) { diff --git a/net/core/xdp.c b/net/core/xdp.c index 4b2b194..0622f2d 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -40,6 +40,9 @@ struct xdp_mem_allocator { struct rcu_head rcu; }; +DEFINE_PER_CPU(struct xdp_tx_bulk_queue, xdp_tx_bq); +EXPORT_PER_CPU_SYMBOL_GPL(xdp_tx_bq); + static u32 xdp_mem_id_hashfn(const void *data, u32 len, u32 seed) { const u32 *k = data; -- 1.8.3.1
[PATCH bpf-next 0/3] veth: Bulk XDP_TX
This adds an infrastructure for bulk XDP_TX and makes veth use it. Improves XDP_TX performance by approximately 8%. The detailed performance numbers are shown in patch 3. Signed-off-by: Toshiaki Makita Toshiaki Makita (3): xdp: Add bulk XDP_TX queue xdp: Add tracepoint for bulk XDP_TX veth: Support bulk XDP_TX drivers/net/veth.c | 26 +- include/net/xdp.h | 7 +++ include/trace/events/xdp.h | 25 + kernel/bpf/core.c | 1 + net/core/xdp.c | 3 +++ 5 files changed, 61 insertions(+), 1 deletion(-) -- 1.8.3.1
[PATCH bpf-next 2/3] xdp: Add tracepoint for bulk XDP_TX
This is introduced for admins to check what is happening on XDP_TX when bulk XDP_TX is in use. Signed-off-by: Toshiaki Makita --- include/trace/events/xdp.h | 25 + kernel/bpf/core.c | 1 + 2 files changed, 26 insertions(+) diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index e95cb86..e06ea65 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -50,6 +50,31 @@ __entry->ifindex) ); +TRACE_EVENT(xdp_bulk_tx, + + TP_PROTO(const struct net_device *dev, +int sent, int drops, int err), + + TP_ARGS(dev, sent, drops, err), + + TP_STRUCT__entry( + __field(int, ifindex) + __field(int, drops) + __field(int, sent) + __field(int, err) + ), + + TP_fast_assign( + __entry->ifindex= dev->ifindex; + __entry->drops = drops; + __entry->sent = sent; + __entry->err= err; + ), + + TP_printk("ifindex=%d sent=%d drops=%d err=%d", + __entry->ifindex, __entry->sent, __entry->drops, __entry->err) +); + DECLARE_EVENT_CLASS(xdp_redirect_template, TP_PROTO(const struct net_device *dev, diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 242a643..7687488 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2108,3 +2108,4 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to, #include EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception); +EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_bulk_tx); -- 1.8.3.1
[PATCH bpf-next 3/3] veth: Support bulk XDP_TX
This improves XDP_TX performance by about 8%. Here are single core XDP_TX test results. CPU consumptions are taken from "perf report --no-child". - Before: 7.26 Mpps _raw_spin_lock 7.83% veth_xdp_xmit 12.23% - After: 7.84 Mpps _raw_spin_lock 1.17% veth_xdp_xmit 6.45% Signed-off-by: Toshiaki Makita --- drivers/net/veth.c | 26 +- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 52110e5..4edc75f 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -442,6 +442,23 @@ static int veth_xdp_xmit(struct net_device *dev, int n, return ret; } +static void veth_xdp_flush_bq(struct net_device *dev) +{ + struct xdp_tx_bulk_queue *bq = this_cpu_ptr(&xdp_tx_bq); + int sent, i, err = 0; + + sent = veth_xdp_xmit(dev, bq->count, bq->q, 0); + if (sent < 0) { + err = sent; + sent = 0; + for (i = 0; i < bq->count; i++) + xdp_return_frame(bq->q[i]); + } + trace_xdp_bulk_tx(dev, sent, bq->count - sent, err); + + bq->count = 0; +} + static void veth_xdp_flush(struct net_device *dev) { struct veth_priv *rcv_priv, *priv = netdev_priv(dev); @@ -449,6 +466,7 @@ static void veth_xdp_flush(struct net_device *dev) struct veth_rq *rq; rcu_read_lock(); + veth_xdp_flush_bq(dev); rcv = rcu_dereference(priv->peer); if (unlikely(!rcv)) goto out; @@ -466,12 +484,18 @@ static void veth_xdp_flush(struct net_device *dev) static int veth_xdp_tx(struct net_device *dev, struct xdp_buff *xdp) { + struct xdp_tx_bulk_queue *bq = this_cpu_ptr(&xdp_tx_bq); struct xdp_frame *frame = convert_to_xdp_frame(xdp); if (unlikely(!frame)) return -EOVERFLOW; - return veth_xdp_xmit(dev, 1, &frame, 0); + if (unlikely(bq->count == XDP_TX_BULK_SIZE)) + veth_xdp_flush_bq(dev); + + bq->q[bq->count++] = frame; + + return 0; } static struct sk_buff *veth_xdp_rcv_one(struct veth_rq *rq, -- 1.8.3.1
Re: [PATCH 6/8] dt-bindings: net: stmmac: Convert the binding to a schemas
Hi! On Thu, May 23, 2019 at 10:11:39AM +, Jose Abreu wrote: > From: Maxime Ripard > Date: Thu, May 23, 2019 at 10:56:49 > > > Switch the STMMAC / Synopsys DesignWare MAC controller binding to a YAML > > schema to enable the DT validation. > > > > Signed-off-by: Maxime Ripard > > How exactly can I see the final results of this ? Do you have any link ? > (I'm no expert in YAML at all). You need some extra tooling, that you can find here: https://github.com/devicetree-org/dt-schema You can then run make dtbs_check, and those YAML files will be used to validate that any devicetree using those properties are doing it properly. That implies having the right node names, properties, types, ranges of values when relevant, and so on. Maxime -- Maxime Ripard, Bootlin Embedded Linux and Kernel engineering https://bootlin.com
Re: [PATCH bpf-next 1/3] xdp: Add bulk XDP_TX queue
Toshiaki Makita writes: > XDP_TX is similar to XDP_REDIRECT as it essentially redirects packets to > the device itself. XDP_REDIRECT has bulk transmit mechanism to avoid the > heavy cost of indirect call but it also reduces lock acquisition on the > destination device that needs locks like veth and tun. > > XDP_TX does not use indirect calls but drivers which require locks can > benefit from the bulk transmit for XDP_TX as well. XDP_TX happens on the same device, so there's an implicit bulking happening because of the NAPI cycle. So why is an additional mechanism needed (in the general case)? -Toke
RE: [PATCH 6/8] dt-bindings: net: stmmac: Convert the binding to a schemas
From: Maxime Ripard Date: Thu, May 23, 2019 at 12:07:15 > You can then run make dtbs_check, and those YAML files will be used to > validate that any devicetree using those properties are doing it > properly. That implies having the right node names, properties, types, > ranges of values when relevant, and so on. Thanks but how can one that's developing know which bindings it shall use ? Is this not parsed/prettified and displayed in some kind of webpage ? Just that now that the TXT is gone its kind of "strange" to look at YAML instead of plain text and develop/use the bindings. Thanks, Jose Miguel Abreu
Re: [PATCH bpf-next 1/3] xdp: Add bulk XDP_TX queue
On 2019/05/23 20:11, Toke Høiland-Jørgensen wrote: > Toshiaki Makita writes: > >> XDP_TX is similar to XDP_REDIRECT as it essentially redirects packets to >> the device itself. XDP_REDIRECT has bulk transmit mechanism to avoid the >> heavy cost of indirect call but it also reduces lock acquisition on the >> destination device that needs locks like veth and tun. >> >> XDP_TX does not use indirect calls but drivers which require locks can >> benefit from the bulk transmit for XDP_TX as well. > > XDP_TX happens on the same device, so there's an implicit bulking > happening because of the NAPI cycle. So why is an additional mechanism > needed (in the general case)? Not sure what the implicit bulking you mention is. XDP_TX calls .ndo_xdp_xmit() for each packet, and it acquires a lock in veth and tun. To avoid this, we need additional storage for bulking like devmap for XDP_REDIRECT. -- Toshiaki Makita
Re: [PATCH bpf-next 3/3] veth: Support bulk XDP_TX
Toshiaki Makita writes: > This improves XDP_TX performance by about 8%. > > Here are single core XDP_TX test results. CPU consumptions are taken > from "perf report --no-child". > > - Before: > > 7.26 Mpps > > _raw_spin_lock 7.83% > veth_xdp_xmit 12.23% > > - After: > > 7.84 Mpps > > _raw_spin_lock 1.17% > veth_xdp_xmit 6.45% > > Signed-off-by: Toshiaki Makita > --- > drivers/net/veth.c | 26 +- > 1 file changed, 25 insertions(+), 1 deletion(-) > > diff --git a/drivers/net/veth.c b/drivers/net/veth.c > index 52110e5..4edc75f 100644 > --- a/drivers/net/veth.c > +++ b/drivers/net/veth.c > @@ -442,6 +442,23 @@ static int veth_xdp_xmit(struct net_device *dev, int n, > return ret; > } > > +static void veth_xdp_flush_bq(struct net_device *dev) > +{ > + struct xdp_tx_bulk_queue *bq = this_cpu_ptr(&xdp_tx_bq); > + int sent, i, err = 0; > + > + sent = veth_xdp_xmit(dev, bq->count, bq->q, 0); Wait, veth_xdp_xmit() is just putting frames on a pointer ring. So you're introducing an additional per-cpu bulk queue, only to avoid lock contention around the existing pointer ring. But the pointer ring is per-rq, so if you have lock contention, this means you must have multiple CPUs servicing the same rq, no? So why not just fix that instead? -Toke
Re: [PATCH bpf-next 1/3] xdp: Add bulk XDP_TX queue
Toshiaki Makita writes: > On 2019/05/23 20:11, Toke Høiland-Jørgensen wrote: >> Toshiaki Makita writes: >> >>> XDP_TX is similar to XDP_REDIRECT as it essentially redirects packets to >>> the device itself. XDP_REDIRECT has bulk transmit mechanism to avoid the >>> heavy cost of indirect call but it also reduces lock acquisition on the >>> destination device that needs locks like veth and tun. >>> >>> XDP_TX does not use indirect calls but drivers which require locks can >>> benefit from the bulk transmit for XDP_TX as well. >> >> XDP_TX happens on the same device, so there's an implicit bulking >> happening because of the NAPI cycle. So why is an additional mechanism >> needed (in the general case)? > > Not sure what the implicit bulking you mention is. XDP_TX calls > .ndo_xdp_xmit() for each packet, and it acquires a lock in veth and > tun. To avoid this, we need additional storage for bulking like devmap > for XDP_REDIRECT. The bulking is in veth_poll(), where veth_xdp_flush() is only called at the end. But see my other reply to the veth.c patch for the lock contention issue... -Toke
Re: [PATCH bpf-next 3/3] veth: Support bulk XDP_TX
On 2019/05/23 20:25, Toke Høiland-Jørgensen wrote: > Toshiaki Makita writes: > >> This improves XDP_TX performance by about 8%. >> >> Here are single core XDP_TX test results. CPU consumptions are taken >> from "perf report --no-child". >> >> - Before: >> >> 7.26 Mpps >> >> _raw_spin_lock 7.83% >> veth_xdp_xmit 12.23% >> >> - After: >> >> 7.84 Mpps >> >> _raw_spin_lock 1.17% >> veth_xdp_xmit 6.45% >> >> Signed-off-by: Toshiaki Makita >> --- >> drivers/net/veth.c | 26 +- >> 1 file changed, 25 insertions(+), 1 deletion(-) >> >> diff --git a/drivers/net/veth.c b/drivers/net/veth.c >> index 52110e5..4edc75f 100644 >> --- a/drivers/net/veth.c >> +++ b/drivers/net/veth.c >> @@ -442,6 +442,23 @@ static int veth_xdp_xmit(struct net_device *dev, int n, >> return ret; >> } >> >> +static void veth_xdp_flush_bq(struct net_device *dev) >> +{ >> +struct xdp_tx_bulk_queue *bq = this_cpu_ptr(&xdp_tx_bq); >> +int sent, i, err = 0; >> + >> +sent = veth_xdp_xmit(dev, bq->count, bq->q, 0); > > Wait, veth_xdp_xmit() is just putting frames on a pointer ring. So > you're introducing an additional per-cpu bulk queue, only to avoid lock > contention around the existing pointer ring. But the pointer ring is > per-rq, so if you have lock contention, this means you must have > multiple CPUs servicing the same rq, no? Yes, it's possible. Not recommended though. > So why not just fix that > instead? The queues are shared with packets from stack sent from peer. That's because I needed the lock. I have tried to separate the queues, one for redirect and one for stack, but receiver side got too complicated and it ended up with worse performance. -- Toshiaki Makita
RE: [RFC PATCH net-next 5/9] net: phylink: Add phylink_create_raw
> Subject: Re: [RFC PATCH net-next 5/9] net: phylink: Add phylink_create_raw > > > > On 5/22/2019 7:25 PM, Florian Fainelli wrote: > > > > > > On 5/22/2019 6:20 PM, Ioana Ciornei wrote: > >> This adds a new entry point to PHYLINK that does not require a > >> net_device structure. > >> > >> The main intended use are DSA ports that do not have net devices > >> registered for them (mainly because doing so would be redundant - see > >> Documentation/networking/dsa/dsa.rst for details). So far DSA has > >> been using PHYLIB fixed PHYs for these ports, driven manually with > >> genphy instead of starting a full PHY state machine, but this does > >> not scale well when there are actual PHYs that need a driver on those > >> ports, or when a fixed-link is requested in DT that has a speed > >> unsupported by the fixed PHY C22 emulation (such as SGMII-2500). > >> > >> The proposed solution comes in the form of a notifier chain owned by > >> the PHYLINK instance, and the passing of phylink_notifier_info > >> structures back to the driver through a blocking notifier call. > >> > >> The event API exposed by the new notifier mechanism is a 1:1 mapping > >> to the existing PHYLINK mac_ops, plus the PHYLINK fixed-link callback. > >> > >> Both the standard phylink_create() function, as well as its raw > >> variant, call the same underlying function which initializes either > >> the netdev field or the notifier block of the PHYLINK instance. > >> > >> All PHYLINK driver callbacks have been extended to call the notifier > >> chain in case the instance is a raw one. > >> > >> Signed-off-by: Ioana Ciornei > >> Signed-off-by: Vladimir Oltean > >> --- > > > > [snip] > > > >> + struct phylink_notifier_info info = { > >> + .link_an_mode = pl->link_an_mode, > >> + /* Discard const pointer */ > >> + .state = (struct phylink_link_state *)state, > >> + }; > >> + > >>netdev_dbg(pl->netdev, > >> "%s: mode=%s/%s/%s/%s adv=%*pb pause=%02x link=%u > an=%u\n", > >> __func__, phylink_an_mode_str(pl->link_an_mode), > >> @@ -299,7 +317,12 @@ static void phylink_mac_config(struct phylink *pl, > >> __ETHTOOL_LINK_MODE_MASK_NBITS, state->advertising, > >> state->pause, state->link, state->an_enabled); > > > > Don't you need to guard that netdev_dbg() with an if (pl->ops) to > > avoid de-referencing a NULL net_device? > > The netdev_* print will not dereference a NULL net_device since it has explicit checks agains this. Instead it will just print (net/core/dev.c, __netdev_printk): printk("%s(NULL net_device): %pV", level, vaf); > > Another possibility could be to change the signature of the > > phylink_mac_ops to take an opaque pointer and in the case where we > > called phylink_create() and passed down a net_device pointer, we > > somehow remember that for doing any operation that requires a > > net_device (printing, setting carrier). We lose strict typing in doing > > that, but we'd have fewer places to patch for a blocking notifier call. > > > > Or even make those functions part of phylink_mac_ops such that the caller > could pass an .carrier_ok callback which is netif_carrier_ok() for a > net_device, > else it's NULL, same with printing functions if desired... > -- > Florian Let me see if I understood this correctly. I presume that any API that we add should not break any current PHYLINK users. You suggest to change the prototype of the phylink_mac_ops from void (*validate)(struct net_device *ndev, unsigned long *supported, struct phylink_link_state *state); to something that takes a void pointer: void (*validate)(void *dev, unsigned long *supported, struct phylink_link_state *state); This would imply that the any function in PHYLINK would have to somehow differentiate if the dev provided is indeed a net_device or another structure in order to make the decision if netif_carrier_off should be called or not (this is so we do not break any drivers using PHYLINK). I cannot see how this judgement can be made. -- Ioana
Re: [PATCH bpf-next 3/3] veth: Support bulk XDP_TX
Toshiaki Makita writes: > On 2019/05/23 20:25, Toke Høiland-Jørgensen wrote: >> Toshiaki Makita writes: >> >>> This improves XDP_TX performance by about 8%. >>> >>> Here are single core XDP_TX test results. CPU consumptions are taken >>> from "perf report --no-child". >>> >>> - Before: >>> >>> 7.26 Mpps >>> >>> _raw_spin_lock 7.83% >>> veth_xdp_xmit 12.23% >>> >>> - After: >>> >>> 7.84 Mpps >>> >>> _raw_spin_lock 1.17% >>> veth_xdp_xmit 6.45% >>> >>> Signed-off-by: Toshiaki Makita >>> --- >>> drivers/net/veth.c | 26 +- >>> 1 file changed, 25 insertions(+), 1 deletion(-) >>> >>> diff --git a/drivers/net/veth.c b/drivers/net/veth.c >>> index 52110e5..4edc75f 100644 >>> --- a/drivers/net/veth.c >>> +++ b/drivers/net/veth.c >>> @@ -442,6 +442,23 @@ static int veth_xdp_xmit(struct net_device *dev, int n, >>> return ret; >>> } >>> >>> +static void veth_xdp_flush_bq(struct net_device *dev) >>> +{ >>> + struct xdp_tx_bulk_queue *bq = this_cpu_ptr(&xdp_tx_bq); >>> + int sent, i, err = 0; >>> + >>> + sent = veth_xdp_xmit(dev, bq->count, bq->q, 0); >> >> Wait, veth_xdp_xmit() is just putting frames on a pointer ring. So >> you're introducing an additional per-cpu bulk queue, only to avoid lock >> contention around the existing pointer ring. But the pointer ring is >> per-rq, so if you have lock contention, this means you must have >> multiple CPUs servicing the same rq, no? > > Yes, it's possible. Not recommended though. > >> So why not just fix that instead? > > The queues are shared with packets from stack sent from peer. That's > because I needed the lock. I have tried to separate the queues, one for > redirect and one for stack, but receiver side got too complicated and it > ended up with worse performance. I meant fix it with configuration. Now many receive queues are you running on the veth device in your benchmarks, and how have you configured the RPS? -Toke
Re: DSA setup IMX6ULL and Marvell 88E6390 with 2 Ethernet Phys - CPU Port is not working
> thanks for your reply. You're right, both PHYs are 10/100. > > I already added a fixed-link like this: > > port@0 { > reg = <0>; > label = "cpu"; > ethernet = <&fec1>; > phy-mode = "rmii"; > phy-handle = <&switch0phy0>; > fixed-link { > speed = <100>; > full-duplex; > }; > }; > > I hope you mean that with fixed-phy? But this doesn't changed anything. You probably have multiple issues, and it is not going to work until you have them all solved. You can get access to the registers etc, using patches from: https://github.com/vivien/linux.git dsa/debugfs I've only seen the external MDIO bus on the 6390 used for C45 PHYs. So there is a chance the driver code for C22 is broken? Andrew
Re: [PATCH 1/8] dt-bindings: net: Add YAML schemas for the generic Ethernet options
On Thu, May 23, 2019 at 4:57 AM Maxime Ripard wrote: > > The Ethernet controllers have a good number of generic options that can be > needed in a device tree. Add a YAML schemas for those. > > Signed-off-by: Maxime Ripard > --- > Documentation/devicetree/bindings/net/ethernet-controller.yaml | 197 +++- > Documentation/devicetree/bindings/net/ethernet.txt | 68 +-- > Documentation/devicetree/bindings/net/fixed-link.txt | 55 +-- > 3 files changed, 199 insertions(+), 121 deletions(-) > create mode 100644 > Documentation/devicetree/bindings/net/ethernet-controller.yaml > > diff --git a/Documentation/devicetree/bindings/net/ethernet-controller.yaml > b/Documentation/devicetree/bindings/net/ethernet-controller.yaml > new file mode 100644 > index ..1c6e9e755481 > --- /dev/null > +++ b/Documentation/devicetree/bindings/net/ethernet-controller.yaml > @@ -0,0 +1,197 @@ > +# SPDX-License-Identifier: GPL-2.0 > +%YAML 1.2 > +--- > +$id: http://devicetree.org/schemas/net/ethernet-controller.yaml# > +$schema: http://devicetree.org/meta-schemas/core.yaml# > + > +title: Ethernet Controller Generic Binding > + > +maintainers: > + - David S. Miller > + > +properties: > + $nodename: > +pattern: "^ethernet(@.*)?$" > + > + local-mac-address: > +allOf: > + - $ref: /schemas/types.yaml#definitions/uint8-array > + - minItems: 6 > +maxItems: 6 > +description: > + Specifies the MAC address that was assigned to the network device. > + > + mac-address: > +allOf: > + - $ref: /schemas/types.yaml#definitions/uint8-array > + - minItems: 6 > +maxItems: 6 > +description: > + Specifies the MAC address that was last used by the boot > + program; should be used in cases where the MAC address assigned > + to the device by the boot program is different from the > + local-mac-address property. > + > + max-frame-size: > +$ref: /schemas/types.yaml#definitions/uint32 > +description: > + Maximum transfer unit (IEEE defined MTU), rather than the > + maximum frame size (there\'s contradiction in the Devicetree > + Specification). > + > + max-speed: > +$ref: /schemas/types.yaml#definitions/uint32 > +description: > + Specifies maximum speed in Mbit/s supported by the device. > + > + nvmem-cells: > +maxItems: 1 > +description: > + Reference to an nvmem node for the MAC address > + > + nvmem-cells-names: > +const: mac-address > + > + phy-connection-type: > +description: > + Operation mode of the PHY interface > +oneOf: While yes, this lets us have descriptions, oneOf makes for poor error messages. I'd just make the descriptions comments instead. > + - const: internal > +description: > + there is not a standard bus between the MAC and the PHY, > + something proprietary is being used to embed the PHY in the > + MAC. > + - const: mii > + - const: gmii > + - const: sgmii > + - const: qsgmii > + - const: tbi > + - const: rev-mii > + - const: rmii > + - const: rgmii > +description: > + RX and TX delays are added by the MAC when required > + - const: rgmii-id > +description: > + RGMII with internal RX and TX delays provided by the PHY, > + the MAC should not add the RX or TX delays in this case > + - const: rgmii-rxid > +description: > + RGMII with internal RX delay provided by the PHY, the MAC > + should not add an RX delay in this case > + - const: rgmii-txid > +description: > + RGMII with internal TX delay provided by the PHY, the MAC > + should not add an TX delay in this case > + - const: rtbi > + - const: smii > + - const: xgmii > + - const: trgmii > + - const: 1000base-x > + - const: 2500base-x > + - const: rxaui > + - const: xaui > + - const: 10gbase-kr > +description: > + 10GBASE-KR, XFI, SFI > + > + phy-mode: > +$ref: "#/properties/phy-connection-type" > +description: > + Deprecated in favor of phy-connection-type > + > + phy-handle: > +$ref: /schemas/types.yaml#definitions/phandle > +description: > + Specifies a reference to a node representing a PHY device. > + > + phy: > +$ref: "#/properties/phy-handle" > +description: > + Deprecated in favor of phy-handle > + > + phy-device: > +$ref: "#/properties/phy-handle" > +description: > + Deprecated in favor of phy-handle > + > + rx-fifo-depth: > +$ref: /schemas/types.yaml#definitions/uint32 > +description: > + The size of the controller\'s receive fifo in bytes. This is used > + for components that can have configurable receive fifo sizes, > + and is useful for determining certain configuration settings > + such as flow control thresholds. > + > + tx-fifo-depth: > +$ref:
Re: [PATCH bpf-next 2/3] xdp: Add tracepoint for bulk XDP_TX
On Thu, 23 May 2019 19:56:47 +0900 Toshiaki Makita wrote: > This is introduced for admins to check what is happening on XDP_TX when > bulk XDP_TX is in use. > > Signed-off-by: Toshiaki Makita > --- > include/trace/events/xdp.h | 25 + > kernel/bpf/core.c | 1 + > 2 files changed, 26 insertions(+) > > diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h > index e95cb86..e06ea65 100644 > --- a/include/trace/events/xdp.h > +++ b/include/trace/events/xdp.h > @@ -50,6 +50,31 @@ > __entry->ifindex) > ); > > +TRACE_EVENT(xdp_bulk_tx, > + You are using this tracepoint like/instead of trace_xdp_devmap_xmit if I understand correctly? Or maybe the trace_xdp_redirect tracepoint. The point is that is will be good if the tracepoints can share the TP_STRUCT layout beginning, as it allows for attaching and reusing eBPF code that is only interested in the top part of the struct. I would also want to see some identifier, that trace programs can use to group and corrolate these events, you do have ifindex, but most other XDP tracepoints also have "prog_id". > + TP_PROTO(const struct net_device *dev, > + int sent, int drops, int err), > + > + TP_ARGS(dev, sent, drops, err), > + > + TP_STRUCT__entry( > + __field(int, ifindex) > + __field(int, drops) > + __field(int, sent) > + __field(int, err) > + ), The xdp_redirect_template have: TP_STRUCT__entry( __field(int, prog_id) __field(u32, act) __field(int, ifindex) __field(int, err) __field(int, to_ifindex) __field(u32, map_id) __field(int, map_index) ), And e.g. TRACE_EVENT xdp_exception have: TP_STRUCT__entry( __field(int, prog_id) __field(u32, act) __field(int, ifindex) ), > + > + TP_fast_assign( > + __entry->ifindex= dev->ifindex; > + __entry->drops = drops; > + __entry->sent = sent; > + __entry->err= err; > + ), > + > + TP_printk("ifindex=%d sent=%d drops=%d err=%d", > + __entry->ifindex, __entry->sent, __entry->drops, __entry->err) > +); > + > DECLARE_EVENT_CLASS(xdp_redirect_template, > > TP_PROTO(const struct net_device *dev, > diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c > index 242a643..7687488 100644 > --- a/kernel/bpf/core.c > +++ b/kernel/bpf/core.c > @@ -2108,3 +2108,4 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int > offset, void *to, > #include > > EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception); > +EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_bulk_tx); -- Best regards, Jesper Dangaard Brouer MSc.CS, Principal Kernel Engineer at Red Hat LinkedIn: http://www.linkedin.com/in/brouer
Re: [PATCH v3 net-next 0/3] flow_offload: Re-add per-action statistics
On 2019-05-22 6:20 p.m., Jakub Kicinski wrote: On Wed, 22 May 2019 22:37:16 +0100, Edward Cree wrote: * removed RFC tags Why? There is still no upstream user for this (my previous objections of this being only partially correct aside). IIRC your point was to get the dumping to work with RTM_GETACTION. That would still work here, no? There will be some latency based on the frequency of hardware->kernel stats updates. cheers, jamal
[PATCH nf-next] netfilter: add support for matching IPv4 options
This is the kernel change for the overall changes with this description: Add capability to have rules matching IPv4 options. This is developed mainly to support dropping of IP packets with loose and/or strict source route route options. Nevertheless, the implementation include others and ability to get specific fields in the option. Signed-off-by: Stephen Suryaputra --- include/net/inet_sock.h | 2 +- include/uapi/linux/netfilter/nf_tables.h | 2 + net/ipv4/ip_options.c| 2 + net/netfilter/nft_exthdr.c | 136 +++ 4 files changed, 141 insertions(+), 1 deletion(-) diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index e8eef85006aa..8db4f8639a33 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -55,7 +55,7 @@ struct ip_options { ts_needaddr:1; unsigned char router_alert; unsigned char cipso; - unsigned char __pad2; + unsigned char end; unsigned char __data[0]; }; diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 061bb3eb20c3..81d31f4e4aa3 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -730,10 +730,12 @@ enum nft_exthdr_flags { * * @NFT_EXTHDR_OP_IPV6: match against ipv6 extension headers * @NFT_EXTHDR_OP_TCP: match against tcp options + * @NFT_EXTHDR_OP_IPV4: match against ipv4 options */ enum nft_exthdr_op { NFT_EXTHDR_OP_IPV6, NFT_EXTHDR_OP_TCPOPT, + NFT_EXTHDR_OP_IPV4, __NFT_EXTHDR_OP_MAX }; #define NFT_EXTHDR_OP_MAX (__NFT_EXTHDR_OP_MAX - 1) diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 3db31bb9df50..fc0e694aa97c 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -272,6 +272,7 @@ int __ip_options_compile(struct net *net, for (l = opt->optlen; l > 0; ) { switch (*optptr) { case IPOPT_END: + opt->end = optptr - iph; for (optptr++, l--; l > 0; optptr++, l--) { if (*optptr != IPOPT_END) { *optptr = IPOPT_END; @@ -473,6 +474,7 @@ int __ip_options_compile(struct net *net, *info = htonl((pp_ptr-iph)<<24); return -EINVAL; } +EXPORT_SYMBOL(__ip_options_compile); int ip_options_compile(struct net *net, struct ip_options *opt, struct sk_buff *skb) diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index a940c9fd9045..c4d47d274bbe 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -62,6 +62,128 @@ static void nft_exthdr_ipv6_eval(const struct nft_expr *expr, regs->verdict.code = NFT_BREAK; } +/* find the offset to specified option or the header beyond the options + * if target < 0. + * + * Note that *offset is used as input/output parameter, and if it is not zero, + * then it must be a valid offset to an inner IPv4 header. This can be used + * to explore inner IPv4 header, eg. ICMP error messages. + * + * If target header is found, its offset is set in *offset and return option + * number. Otherwise, return negative error. + * + * If the first fragment doesn't contain the End of Options it is considered + * invalid. + */ +static int ipv4_find_option(struct net *net, struct sk_buff *skb, unsigned int *offset, +int target, unsigned short *fragoff, int *flags) +{ + unsigned char optbuf[sizeof(struct ip_options) + 41]; + struct ip_options *opt = (struct ip_options *)optbuf; + struct iphdr *iph, _iph; + unsigned int start; + __be32 info; + int optlen; + bool found = false; + + if (fragoff) + *fragoff = 0; + + if (!offset) + return -EINVAL; + if (!*offset) + *offset = skb_network_offset(skb); + + iph = skb_header_pointer(skb, *offset, sizeof(_iph), &_iph); + if (!iph || iph->version != 4) + return -EBADMSG; + start = *offset + sizeof(struct iphdr); + + optlen = iph->ihl * 4 - (int)sizeof(struct iphdr); + if (optlen <= 0) + return -ENOENT; + + memset(opt, 0, sizeof(struct ip_options)); + /* Copy the options since __ip_options_compile() modifies +* the options. Get one byte beyond the option for target < 0 +*/ + if (skb_copy_bits(skb, start, opt->__data, optlen + 1)) + return -EBADMSG; + opt->optlen = optlen; + + if (__ip_options_compile(net, opt, NULL, &info)) + return -EBADMSG; + + switch (target) { + case IPOPT_SSRR: + case IPOPT_LSRR: + if (opt->srr) { + found = target == IPOPT_SSRR ? opt->is_strictroute : + !opt->is_strictroute; +
Re: [PATCH bpf-next 3/3] veth: Support bulk XDP_TX
On Thu, 23 May 2019 20:35:50 +0900 Toshiaki Makita wrote: > On 2019/05/23 20:25, Toke Høiland-Jørgensen wrote: > > Toshiaki Makita writes: > > > >> This improves XDP_TX performance by about 8%. > >> > >> Here are single core XDP_TX test results. CPU consumptions are taken > >> from "perf report --no-child". > >> > >> - Before: > >> > >> 7.26 Mpps > >> > >> _raw_spin_lock 7.83% > >> veth_xdp_xmit 12.23% > >> > >> - After: > >> > >> 7.84 Mpps > >> > >> _raw_spin_lock 1.17% > >> veth_xdp_xmit 6.45% > >> > >> Signed-off-by: Toshiaki Makita > >> --- > >> drivers/net/veth.c | 26 +- > >> 1 file changed, 25 insertions(+), 1 deletion(-) > >> > >> diff --git a/drivers/net/veth.c b/drivers/net/veth.c > >> index 52110e5..4edc75f 100644 > >> --- a/drivers/net/veth.c > >> +++ b/drivers/net/veth.c > >> @@ -442,6 +442,23 @@ static int veth_xdp_xmit(struct net_device *dev, int > >> n, > >>return ret; > >> } > >> > >> +static void veth_xdp_flush_bq(struct net_device *dev) > >> +{ > >> + struct xdp_tx_bulk_queue *bq = this_cpu_ptr(&xdp_tx_bq); > >> + int sent, i, err = 0; > >> + > >> + sent = veth_xdp_xmit(dev, bq->count, bq->q, 0); > > > > Wait, veth_xdp_xmit() is just putting frames on a pointer ring. So > > you're introducing an additional per-cpu bulk queue, only to avoid lock > > contention around the existing pointer ring. But the pointer ring is > > per-rq, so if you have lock contention, this means you must have > > multiple CPUs servicing the same rq, no? > > Yes, it's possible. Not recommended though. > I think the general per-cpu TX bulk queue is overkill. There is a loop over packets in veth_xdp_rcv(struct veth_rq *rq, budget, *status), and the caller veth_poll() will call veth_xdp_flush(rq->dev). Why can't you store this "temp" bulk array in struct veth_rq ? You could even alloc/create it on the stack of veth_poll() and send it along via a pointer to veth_xdp_rcv). -- Best regards, Jesper Dangaard Brouer MSc.CS, Principal Kernel Engineer at Red Hat LinkedIn: http://www.linkedin.com/in/brouer
Re: [PATCH bpf-next 3/3] veth: Support bulk XDP_TX
On 19/05/23 (木) 21:18:25, Toke Høiland-Jørgensen wrote: Toshiaki Makita writes: On 2019/05/23 20:25, Toke Høiland-Jørgensen wrote: Toshiaki Makita writes: This improves XDP_TX performance by about 8%. Here are single core XDP_TX test results. CPU consumptions are taken from "perf report --no-child". - Before: 7.26 Mpps _raw_spin_lock 7.83% veth_xdp_xmit 12.23% - After: 7.84 Mpps _raw_spin_lock 1.17% veth_xdp_xmit 6.45% Signed-off-by: Toshiaki Makita --- drivers/net/veth.c | 26 +- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 52110e5..4edc75f 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -442,6 +442,23 @@ static int veth_xdp_xmit(struct net_device *dev, int n, return ret; } +static void veth_xdp_flush_bq(struct net_device *dev) +{ + struct xdp_tx_bulk_queue *bq = this_cpu_ptr(&xdp_tx_bq); + int sent, i, err = 0; + + sent = veth_xdp_xmit(dev, bq->count, bq->q, 0); Wait, veth_xdp_xmit() is just putting frames on a pointer ring. So you're introducing an additional per-cpu bulk queue, only to avoid lock contention around the existing pointer ring. But the pointer ring is per-rq, so if you have lock contention, this means you must have multiple CPUs servicing the same rq, no? Yes, it's possible. Not recommended though. So why not just fix that instead? The queues are shared with packets from stack sent from peer. That's because I needed the lock. I have tried to separate the queues, one for redirect and one for stack, but receiver side got too complicated and it ended up with worse performance. I meant fix it with configuration. Now many receive queues are you running on the veth device in your benchmarks, and how have you configured the RPS? As I wrote this test is a single queue test and does not have any contention. Per packet lock has some overhead even in that configuration. Toshiaki Makita
Re: [PATCH bpf-next 3/3] veth: Support bulk XDP_TX
On 19/05/23 (木) 22:29:27, Jesper Dangaard Brouer wrote: On Thu, 23 May 2019 20:35:50 +0900 Toshiaki Makita wrote: On 2019/05/23 20:25, Toke Høiland-Jørgensen wrote: Toshiaki Makita writes: This improves XDP_TX performance by about 8%. Here are single core XDP_TX test results. CPU consumptions are taken from "perf report --no-child". - Before: 7.26 Mpps _raw_spin_lock 7.83% veth_xdp_xmit 12.23% - After: 7.84 Mpps _raw_spin_lock 1.17% veth_xdp_xmit 6.45% Signed-off-by: Toshiaki Makita --- drivers/net/veth.c | 26 +- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 52110e5..4edc75f 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -442,6 +442,23 @@ static int veth_xdp_xmit(struct net_device *dev, int n, return ret; } +static void veth_xdp_flush_bq(struct net_device *dev) +{ + struct xdp_tx_bulk_queue *bq = this_cpu_ptr(&xdp_tx_bq); + int sent, i, err = 0; + + sent = veth_xdp_xmit(dev, bq->count, bq->q, 0); Wait, veth_xdp_xmit() is just putting frames on a pointer ring. So you're introducing an additional per-cpu bulk queue, only to avoid lock contention around the existing pointer ring. But the pointer ring is per-rq, so if you have lock contention, this means you must have multiple CPUs servicing the same rq, no? Yes, it's possible. Not recommended though. I think the general per-cpu TX bulk queue is overkill. There is a loop over packets in veth_xdp_rcv(struct veth_rq *rq, budget, *status), and the caller veth_poll() will call veth_xdp_flush(rq->dev). Why can't you store this "temp" bulk array in struct veth_rq ? Of course I can. But I thought tun has the same problem and we can decrease memory footprint by sharing the same storage between devices. Or if other devices want to reduce queues so that we can use XDP on many-cpu servers and introduce locks, we can use this storage for that case as well. Still do you prefer veth-specific solution? You could even alloc/create it on the stack of veth_poll() and send it along via a pointer to veth_xdp_rcv). Toshiaki Makita
[PATCH net-next] cxgb4: use firmware API for validating filter spec
Adds support for validating hardware filter spec configured in firmware before offloading exact match flows. Use the new fw api FW_PARAM_DEV_FILTER_MODE_MASK to read the filter mode and mask from firmware. If the api isn't supported, then fall-back to older way of reading just the mode from indirect register. Signed-off-by: Raju Rangoju --- drivers/net/ethernet/chelsio/cxgb4/cxgb4.h| 1 + drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c | 3 +- drivers/net/ethernet/chelsio/cxgb4/t4_hw.c| 47 --- drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h | 23 +++ 4 files changed, 68 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h index a707a65f491b..7c06e2aebc9e 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h @@ -280,6 +280,7 @@ struct tp_params { unsigned short tx_modq[NCHAN]; /* channel to modulation queue map */ u32 vlan_pri_map; /* cached TP_VLAN_PRI_MAP */ + u32 filter_mask; u32 ingress_config; /* cached TP_INGRESS_CONFIG */ /* cached TP_OUT_CONFIG compressed error vector diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c index 0f0b3f4a039d..6232236d7abc 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c @@ -248,8 +248,9 @@ static int validate_filter(struct net_device *dev, u32 fconf, iconf; /* Check for unconfigured fields being used. */ - fconf = adapter->params.tp.vlan_pri_map; iconf = adapter->params.tp.ingress_config; + fconf = fs->hash ? adapter->params.tp.filter_mask : + adapter->params.tp.vlan_pri_map; if (unsupported(fconf, FCOE_F, fs->val.fcoe, fs->mask.fcoe) || unsupported(fconf, PORT_F, fs->val.iport, fs->mask.iport) || diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c index 866ee31a1da5..8c17d1c75e84 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c +++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c @@ -9388,8 +9388,9 @@ int t4_init_sge_params(struct adapter *adapter) */ int t4_init_tp_params(struct adapter *adap, bool sleep_ok) { - int chan; - u32 v; + u32 param, val, v; + int chan, ret; + v = t4_read_reg(adap, TP_TIMER_RESOLUTION_A); adap->params.tp.tre = TIMERRESOLUTION_G(v); @@ -9399,11 +9400,47 @@ int t4_init_tp_params(struct adapter *adap, bool sleep_ok) for (chan = 0; chan < NCHAN; chan++) adap->params.tp.tx_modq[chan] = chan; - /* Cache the adapter's Compressed Filter Mode and global Incress + /* Cache the adapter's Compressed Filter Mode/Mask and global Ingress * Configuration. */ - t4_tp_pio_read(adap, &adap->params.tp.vlan_pri_map, 1, - TP_VLAN_PRI_MAP_A, sleep_ok); + param = (FW_PARAMS_MNEM_V(FW_PARAMS_MNEM_DEV) | +FW_PARAMS_PARAM_X_V(FW_PARAMS_PARAM_DEV_FILTER) | +FW_PARAMS_PARAM_Y_V(FW_PARAM_DEV_FILTER_MODE_MASK)); + + /* Read current value */ + ret = t4_query_params(adap, adap->mbox, adap->pf, 0, 1, + ¶m, &val); + if (ret == 0) { + dev_info(adap->pdev_dev, +"Current filter mode/mask 0x%x:0x%x\n", +FW_PARAMS_PARAM_FILTER_MODE_G(val), +FW_PARAMS_PARAM_FILTER_MASK_G(val)); + adap->params.tp.vlan_pri_map = + FW_PARAMS_PARAM_FILTER_MODE_G(val); + adap->params.tp.filter_mask = + FW_PARAMS_PARAM_FILTER_MASK_G(val); + } else { + dev_info(adap->pdev_dev, +"Failed to read filter mode/mask via fw api, using indirect-reg-read\n"); + + /* Incase of older-fw (which doesn't expose the api +* FW_PARAM_DEV_FILTER_MODE_MASK) and newer-driver (which uses +* the fw api) combination, fall-back to older method of reading +* the filter mode from indirect-register +*/ + t4_tp_pio_read(adap, &adap->params.tp.vlan_pri_map, 1, + TP_VLAN_PRI_MAP_A, sleep_ok); + + /* With the older-fw and newer-driver combination we might run +* into an issue when user wants to use hash filter region but +* the filter_mask is zero, in this case filter_mask validation +* is tough. To avoid that we set the filter_mask same as filter +* mode, which will behave exactly as the older way of ignoring +* the filter mask validation. +*/ + adap->params.tp.filter_
Re: Question about IRQs during the .remove() of virtio-vsock driver
On Wed, May 22, 2019 at 11:44:26AM +0800, Jason Wang wrote: > > On 2019/5/21 下午9:56, Michael S. Tsirkin wrote: > > On Tue, May 21, 2019 at 03:49:20PM +0200, Stefano Garzarella wrote: > > > On Tue, May 21, 2019 at 06:05:31AM -0400, Michael S. Tsirkin wrote: > > > > On Tue, May 21, 2019 at 11:44:07AM +0200, Stefano Garzarella wrote: > > > > > Hi Micheal, Jason, > > > > > as suggested by Stefan, I'm checking if we have some races in the > > > > > virtio-vsock driver. We found some races in the .probe() and .remove() > > > > > with the upper layer (socket) and I'll fix it. > > > > > > > > > > Now my attention is on the bottom layer (virtio device) and my > > > > > question is: > > > > > during the .remove() of virtio-vsock driver (virtio_vsock_remove), > > > > > could happen > > > > > that an IRQ comes and one of our callback (e.g. > > > > > virtio_vsock_rx_done()) is > > > > > executed, queueing new works? > > > > > > > > > > I tried to follow the code in both cases (device unplugged or module > > > > > removed) > > > > > and maybe it couldn't happen because we remove it from bus's > > > > > knowledge, > > > > > but I'm not sure and your advice would be very helpful. > > > > > > > > > > Thanks in advance, > > > > > Stefano > > > > > > > > Great question! This should be better documented: patches welcome! > > > When I'm clear, I'll be happy to document this. > > > > > > > Here's my understanding: > > > > > > > > > > > > A typical removal flow works like this: > > > > > > > > - prevent linux from sending new kick requests to device > > > >and flush such outstanding requests if any > > > >(device can still send notifications to linux) > > > > > > > > - call > > > >vi->vdev->config->reset(vi->vdev); > > > >this will flush all device writes and interrupts. > > > >device will not use any more buffers. > > > >previously outstanding callbacks might still be active. > > > > > > > > - Then call > > > >vdev->config->del_vqs(vdev); > > > >to flush outstanding callbacks if any. > > > Thanks for sharing these useful information. > > > > > > So, IIUC between step 1 (e.g. in virtio-vsock we flush all work-queues) > > > and > > > step 2, new IRQs could happen, and in the virtio-vsock driver new work > > > will be queued. > > > > > > In order to handle this case, I'm thinking to add a new variable > > > 'work_enabled' in the struct virtio_vsock, put it to false at the start > > > of the .remove(), then call synchronize_rcu() before to flush all work > > > queues > > > and use an helper function virtio_transport_queue_work() to queue > > > a new work, where the check of work_enabled and the queue_work are in the > > > RCU read critical section. > > > > > > Here a pseudo code to explain better the idea: > > > > > > virtio_vsock_remove() { > > > vsock->work_enabled = false; > > > > > > /* Wait for other CPUs to finish to queue works */ > > > synchronize_rcu(); > > > > > > flush_works(); > > > > > > vdev->config->reset(vdev); > > > > > > ... > > > > > > vdev->config->del_vqs(vdev); > > > } > > > > > > virtio_vsock_queue_work(vsock, work) { > > > rcu_read_lock(); > > > > > > if (!vsock->work_enabled) { > > > goto out; > > > } > > > > > > queue_work(virtio_vsock_workqueue, work); > > > > > > out: > > > rcu_read_unlock(); > > > } > > > > > > > > > Do you think can work? > > > Please tell me if there is a better way to handle this case. > > > > > > Thanks, > > > Stefano > > > > instead of rcu tricks I would just have rx_run and tx_run and check it > > within the queued work - presumably under tx or rx lock. > > > > then queueing an extra work becomes harmless, > > and you flush it after del vqs which flushes everything for you. > > > > > > It looks to me that we need guarantee no work queued or scheduled before > del_vqs. Otherwise it may lead use after free? (E.g net disable NAPI before > del_vqs). > I'm disabling works before del_vqs (setting tx_run or rx_run to false under the tx or rx lock), this means that work can still be queued, but the worker function will exit just after tx or rx lock is acquired, where I'm checking the tx_run or rx_run, so no workers should use the virtqueues. Then I'll call del_vqs and I'll flush all works before to free the 'vsock' object (struct virtio_vsock) to be sure that no workers will run. I hope this will match your advice. Thanks, Stefano
Re: [PATCH bpf] bpf, riscv: clear target register high 32-bits for and/or/xor on ALU32
On 05/21/2019 04:12 PM, Björn Töpel wrote: > On Tue, 21 May 2019 at 16:02, Daniel Borkmann wrote: >> On 05/21/2019 03:46 PM, Björn Töpel wrote: >>> When using 32-bit subregisters (ALU32), the RISC-V JIT would not clear >>> the high 32-bits of the target register and therefore generate >>> incorrect code. >>> >>> E.g., in the following code: >>> >>> $ cat test.c >>> unsigned int f(unsigned long long a, >>> unsigned int b) >>> { >>> return (unsigned int)a & b; >>> } >>> >>> $ clang-9 -target bpf -O2 -emit-llvm -S test.c -o - | \ >>> llc-9 -mattr=+alu32 -mcpu=v3 >>> .text >>> .file "test.c" >>> .globl f >>> .p2align3 >>> .type f,@function >>> f: >>> r0 = r1 >>> w0 &= w2 >>> exit >>> .Lfunc_end0: >>> .size f, .Lfunc_end0-f >>> >>> The JIT would not clear the high 32-bits of r0 after the >>> and-operation, which in this case might give an incorrect return >>> value. >>> >>> After this patch, that is not the case, and the upper 32-bits are >>> cleared. >>> >>> Reported-by: Jiong Wang >>> Fixes: 2353ecc6f91f ("bpf, riscv: add BPF JIT for RV64G") >>> Signed-off-by: Björn Töpel >> >> Was this missed because test_verifier did not have test coverage? > > Yup, and Jiong noted it. Applied, thanks!
Re: [PATCH bpf] selftests: bpf: add zero extend checks for ALU32 and/or/xor
On 05/23/2019 08:38 AM, Y Song wrote: > On Wed, May 22, 2019 at 1:46 PM Björn Töpel wrote: >> On Wed, 22 May 2019 at 20:13, Y Song wrote: >>> On Wed, May 22, 2019 at 2:25 AM Björn Töpel wrote: Add three tests to test_verifier/basic_instr that make sure that the high 32-bits of the destination register is cleared after an ALU32 and/or/xor. Signed-off-by: Björn Töpel >>> >>> I think the patch intends for bpf-next, right? The patch itself looks >>> good to me. >>> Acked-by: Yonghong Song >> >> Thank you. Actually, it was intended for the bpf tree, as a test >> follow up for this [1] fix. > Then maybe you want to add a Fixes tag and resubmit? Why would the test case need a fixes tag? It's common practice that we have BPF fixes that we queue to bpf tree along with kselftest test cases related to them. Therefore, applied as well, thanks for following up! Björn, in my email from the fix, I mentioned we should have test snippets ideally for all of the alu32 insns to not miss something falling through the cracks when JITs get added or changed. If you have some cycles to add the remaining missing ones, that would be much appreciated. Thanks, Daniel
Re: [bpf PATCH] bpf: sockmap, restore sk_write_space when psock gets dropped
On 05/22/2019 12:01 PM, Jakub Sitnicki wrote: > Once psock gets unlinked from its sock (sk_psock_drop), user-space can > still trigger a call to sk->sk_write_space by setting TCP_NOTSENT_LOWAT > socket option. This causes a null-ptr-deref because we try to read > psock->saved_write_space from sk_psock_write_space: > > == > BUG: KASAN: null-ptr-deref in sk_psock_write_space+0x69/0x80 > Read of size 8 at addr 01a0 by task sockmap-echo/131 > > CPU: 0 PID: 131 Comm: sockmap-echo Not tainted 5.2.0-rc1-00094-gf49aa1de9836 > #81 > Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS > ?-20180724_192412-buildhw-07.phx2.fedoraproject.org-1.fc29 04/01/2014 > Call Trace: > ? sk_psock_write_space+0x69/0x80 > __kasan_report.cold.2+0x5/0x3f > ? sk_psock_write_space+0x69/0x80 > kasan_report+0xe/0x20 > sk_psock_write_space+0x69/0x80 > tcp_setsockopt+0x69a/0xfc0 > ? tcp_shutdown+0x70/0x70 > ? fsnotify+0x5b0/0x5f0 > ? remove_wait_queue+0x90/0x90 > ? __fget_light+0xa5/0xf0 > __sys_setsockopt+0xe6/0x180 > ? sockfd_lookup_light+0xb0/0xb0 > ? vfs_write+0x195/0x210 > ? ksys_write+0xc9/0x150 > ? __x64_sys_read+0x50/0x50 > ? __bpf_trace_x86_fpu+0x10/0x10 > __x64_sys_setsockopt+0x61/0x70 > do_syscall_64+0xc5/0x520 > ? vmacache_find+0xc0/0x110 > ? syscall_return_slowpath+0x110/0x110 > ? handle_mm_fault+0xb4/0x110 > ? entry_SYSCALL_64_after_hwframe+0x3e/0xbe > ? trace_hardirqs_off_caller+0x4b/0x120 > ? trace_hardirqs_off_thunk+0x1a/0x3a > entry_SYSCALL_64_after_hwframe+0x49/0xbe > RIP: 0033:0x7f2e5e7cdcce > Code: d8 64 89 02 48 c7 c0 ff ff ff ff eb b1 66 2e 0f 1f 84 00 00 00 00 00 > 0f 1f 44 00 00 f3 0f 1e fa 49 89 ca b8 36 00 00 00 0f 05 <48> 3d 01 f0 ff > ff 73 01 c3 48 8b 0d 8a 11 0c 00 f7 d8 64 89 01 48 > RSP: 002b:7ffed011b778 EFLAGS: 0206 ORIG_RAX: 0036 > RAX: ffda RBX: 0003 RCX: 7f2e5e7cdcce > RDX: 0019 RSI: 0006 RDI: 0007 > RBP: 7ffed011b790 R08: 0004 R09: 7f2e5e84ee80 > R10: 7ffed011b788 R11: 0206 R12: 7ffed011b78c > R13: 7ffed011b788 R14: 0007 R15: 0068 > == > > Restore the saved sk_write_space callback when psock is being dropped to > fix the crash. > > Signed-off-by: Jakub Sitnicki LGTM, applied thanks!
Re: [PATCH v7 bpf-next 01/16] bpf: verifier: mark verified-insn with sub-register zext flag
> On 23 May 2019, at 03:07, Alexei Starovoitov > wrote: > > On Wed, May 22, 2019 at 07:54:57PM +0100, Jiong Wang wrote: >> eBPF ISA specification requires high 32-bit cleared when low 32-bit >> sub-register is written. This applies to destination register of ALU32 etc. >> JIT back-ends must guarantee this semantic when doing code-gen. x86_64 and >> AArch64 ISA has the same semantics, so the corresponding JIT back-end >> doesn't need to do extra work. >> >> However, 32-bit arches (arm, x86, nfp etc.) and some other 64-bit arches >> (PowerPC, SPARC etc) need to do explicit zero extension to meet this >> requirement, otherwise code like the following will fail. >> >> u64_value = (u64) u32_value >> ... other uses of u64_value >> >> This is because compiler could exploit the semantic described above and >> save those zero extensions for extending u32_value to u64_value, these JIT >> back-ends are expected to guarantee this through inserting extra zero >> extensions which however could be a significant increase on the code size. >> Some benchmarks show there could be ~40% sub-register writes out of total >> insns, meaning at least ~40% extra code-gen. >> >> One observation is these extra zero extensions are not always necessary. >> Take above code snippet for example, it is possible u32_value will never be >> casted into a u64, the value of high 32-bit of u32_value then could be >> ignored and extra zero extension could be eliminated. >> >> This patch implements this idea, insns defining sub-registers will be >> marked when the high 32-bit of the defined sub-register matters. For >> those unmarked insns, it is safe to eliminate high 32-bit clearnace for >> them. >> >> Algo: >> - Split read flags into READ32 and READ64. >> >> - Record index of insn that does sub-register write. Keep the index inside >> reg state and update it during verifier insn walking. >> >> - A full register read on a sub-register marks its definition insn as >> needing zero extension on dst register. >> >> A new sub-register write overrides the old one. >> >> - When propagating read64 during path pruning, also mark any insn defining >> a sub-register that is read in the pruned path as full-register. >> >> Reviewed-by: Jakub Kicinski >> Signed-off-by: Jiong Wang >> --- >> include/linux/bpf_verifier.h | 14 +++- >> kernel/bpf/verifier.c| 175 >> +++ >> 2 files changed, 173 insertions(+), 16 deletions(-) >> >> diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h >> index 1305ccb..60fb54e 100644 >> --- a/include/linux/bpf_verifier.h >> +++ b/include/linux/bpf_verifier.h >> @@ -36,9 +36,11 @@ >> */ >> enum bpf_reg_liveness { >> REG_LIVE_NONE = 0, /* reg hasn't been read or written this branch */ >> -REG_LIVE_READ, /* reg was read, so we're sensitive to initial value */ >> -REG_LIVE_WRITTEN, /* reg was written first, screening off later reads */ >> -REG_LIVE_DONE = 4, /* liveness won't be updating this register anymore >> */ >> +REG_LIVE_READ32 = 0x1, /* reg was read, so we're sensitive to initial >> value */ >> +REG_LIVE_READ64 = 0x2, /* likewise, but full 64-bit content matters */ >> +REG_LIVE_READ = REG_LIVE_READ32 | REG_LIVE_READ64, >> +REG_LIVE_WRITTEN = 0x4, /* reg was written first, screening off later >> reads */ >> +REG_LIVE_DONE = 0x8, /* liveness won't be updating this register >> anymore */ >> }; >> >> struct bpf_reg_state { >> @@ -131,6 +133,11 @@ struct bpf_reg_state { >> * pointing to bpf_func_state. >> */ >> u32 frameno; >> +/* Tracks subreg definition. The stored value is the insn_idx of the >> + * writing insn. This is safe because subreg_def is used before any insn >> + * patching which only happens after main verification finished. >> + */ >> +s32 subreg_def; >> enum bpf_reg_liveness live; >> }; >> >> @@ -232,6 +239,7 @@ struct bpf_insn_aux_data { >> int ctx_field_size; /* the ctx field size for load insn, maybe 0 */ >> int sanitize_stack_off; /* stack slot to be cleared */ >> bool seen; /* this insn was processed by the verifier */ >> +bool zext_dst; /* this insn zero extends dst reg */ >> u8 alu_state; /* used in combination with alu_limit */ >> unsigned int orig_idx; /* original instruction index */ >> }; >> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c >> index 95f93544..0efccf8 100644 >> --- a/kernel/bpf/verifier.c >> +++ b/kernel/bpf/verifier.c >> @@ -981,6 +981,7 @@ static void mark_reg_not_init(struct bpf_verifier_env >> *env, >> __mark_reg_not_init(regs + regno); >> } >> >> +#define DEF_NOT_SUBREG (-1) >> static void init_reg_state(struct bpf_verifier_env *env, >> struct bpf_func_state *state) >> { >> @@ -991,6 +992,7 @@ static void init_reg_state(struct bpf_verifier_env *env, >> mark_reg_not_init(env, regs, i); >> regs[i].live = REG_LIV
Re: [PATCH bpf] selftests: bpf: add zero extend checks for ALU32 and/or/xor
> On 23 May 2019, at 15:02, Daniel Borkmann wrote: > > On 05/23/2019 08:38 AM, Y Song wrote: >> On Wed, May 22, 2019 at 1:46 PM Björn Töpel wrote: >>> On Wed, 22 May 2019 at 20:13, Y Song wrote: On Wed, May 22, 2019 at 2:25 AM Björn Töpel wrote: > > Add three tests to test_verifier/basic_instr that make sure that the > high 32-bits of the destination register is cleared after an ALU32 > and/or/xor. > > Signed-off-by: Björn Töpel I think the patch intends for bpf-next, right? The patch itself looks good to me. Acked-by: Yonghong Song >>> >>> Thank you. Actually, it was intended for the bpf tree, as a test >>> follow up for this [1] fix. >> Then maybe you want to add a Fixes tag and resubmit? > > Why would the test case need a fixes tag? It's common practice that we have > BPF fixes that we queue to bpf tree along with kselftest test cases related > to them. Therefore, applied as well, thanks for following up! > > Björn, in my email from the fix, I mentioned we should have test snippets > ideally for all of the alu32 insns to not miss something falling through the > cracks when JITs get added or changed. If you have some cycles to add the > remaining missing ones, that would be much appreciated. Björn, If you don’t have time, I can take this alu32 test case follow up as well. Regards, Jiong > > Thanks, > Daniel
Re: [RFC PATCH net-next 5/9] net: phylink: Add phylink_create_raw
On 5/23/2019 5:10 AM, Ioana Ciornei wrote: > >> Subject: Re: [RFC PATCH net-next 5/9] net: phylink: Add phylink_create_raw >> >> >> >> On 5/22/2019 7:25 PM, Florian Fainelli wrote: >>> >>> >>> On 5/22/2019 6:20 PM, Ioana Ciornei wrote: This adds a new entry point to PHYLINK that does not require a net_device structure. The main intended use are DSA ports that do not have net devices registered for them (mainly because doing so would be redundant - see Documentation/networking/dsa/dsa.rst for details). So far DSA has been using PHYLIB fixed PHYs for these ports, driven manually with genphy instead of starting a full PHY state machine, but this does not scale well when there are actual PHYs that need a driver on those ports, or when a fixed-link is requested in DT that has a speed unsupported by the fixed PHY C22 emulation (such as SGMII-2500). The proposed solution comes in the form of a notifier chain owned by the PHYLINK instance, and the passing of phylink_notifier_info structures back to the driver through a blocking notifier call. The event API exposed by the new notifier mechanism is a 1:1 mapping to the existing PHYLINK mac_ops, plus the PHYLINK fixed-link callback. Both the standard phylink_create() function, as well as its raw variant, call the same underlying function which initializes either the netdev field or the notifier block of the PHYLINK instance. All PHYLINK driver callbacks have been extended to call the notifier chain in case the instance is a raw one. Signed-off-by: Ioana Ciornei Signed-off-by: Vladimir Oltean --- >>> >>> [snip] >>> + struct phylink_notifier_info info = { + .link_an_mode = pl->link_an_mode, + /* Discard const pointer */ + .state = (struct phylink_link_state *)state, + }; + netdev_dbg(pl->netdev, "%s: mode=%s/%s/%s/%s adv=%*pb pause=%02x link=%u >> an=%u\n", __func__, phylink_an_mode_str(pl->link_an_mode), @@ -299,7 +317,12 @@ static void phylink_mac_config(struct phylink *pl, __ETHTOOL_LINK_MODE_MASK_NBITS, state->advertising, state->pause, state->link, state->an_enabled); >>> >>> Don't you need to guard that netdev_dbg() with an if (pl->ops) to >>> avoid de-referencing a NULL net_device? >>> > > > The netdev_* print will not dereference a NULL net_device since it has > explicit checks agains this. > Instead it will just print (net/core/dev.c, __netdev_printk): > > printk("%s(NULL net_device): %pV", level, vaf); > > >>> Another possibility could be to change the signature of the >>> phylink_mac_ops to take an opaque pointer and in the case where we >>> called phylink_create() and passed down a net_device pointer, we >>> somehow remember that for doing any operation that requires a >>> net_device (printing, setting carrier). We lose strict typing in doing >>> that, but we'd have fewer places to patch for a blocking notifier call. >>> >> >> Or even make those functions part of phylink_mac_ops such that the caller >> could pass an .carrier_ok callback which is netif_carrier_ok() for a >> net_device, >> else it's NULL, same with printing functions if desired... >> -- >> Florian > > > Let me see if I understood this correctly. I presume that any API that we add > should not break any current PHYLINK users. > > You suggest to change the prototype of the phylink_mac_ops from > > void (*validate)(struct net_device *ndev, unsigned long *supported, >struct phylink_link_state *state); > > to something that takes a void pointer: > > void (*validate)(void *dev, unsigned long *supported, >struct phylink_link_state *state); That is what I am suggesting, but I am also suggesting passing all netdev specific calls that must be made as callbacks as well, so something like: bool (*carrier_ok)(const void *dev) void (*carrier_set)(const void *dev, bool on) void (*print)(const void *dev, const char *fmt) as new members of phylink_mac_ops. > > This would imply that the any function in PHYLINK would have to somehow > differentiate if the dev provided is indeed a net_device or another structure > in order to make the decision if netif_carrier_off should be called or not > (this is so we do not break any drivers using PHYLINK). I cannot see how this > judgement can be made. You don't have to make the judgement you can just do: if (pl->ops->carrier_set) pl->ops->carrier_set(dev, where dev was this opaque pointer passed to phylink_create() the first time it was created. Like I wrote, we lose strong typing doing that, but we don't have to update all code paths for if (pl->ops) else notifier. -- Florian
Re: [PATCH 2/8] dt-bindings: net: Add a YAML schemas for the generic PHY options
On Thu, May 23, 2019 at 11:56:45AM +0200, Maxime Ripard wrote: > The networking PHYs have a number of available device tree properties that > can be used in their device tree node. Add a YAML schemas for those. > > Signed-off-by: Maxime Ripard > --- > Documentation/devicetree/bindings/net/ethernet-phy.yaml | 148 +- > Documentation/devicetree/bindings/net/phy.txt | 80 +- > 2 files changed, 149 insertions(+), 79 deletions(-) > create mode 100644 Documentation/devicetree/bindings/net/ethernet-phy.yaml > > diff --git a/Documentation/devicetree/bindings/net/ethernet-phy.yaml > b/Documentation/devicetree/bindings/net/ethernet-phy.yaml > new file mode 100644 > index ..eb79ee6db977 > --- /dev/null > +++ b/Documentation/devicetree/bindings/net/ethernet-phy.yaml > @@ -0,0 +1,148 @@ > +# SPDX-License-Identifier: GPL-2.0 > +%YAML 1.2 > +--- > +$id: http://devicetree.org/schemas/net/ethernet-phy.yaml# > +$schema: http://devicetree.org/meta-schemas/core.yaml# > + > +title: Ethernet PHY Generic Binding > + > +maintainers: > + - David S. Miller > + > +properties: > + $nodename: > +pattern: "^ethernet-phy(@[a-f0-9])?$" > + > + compatible: > +oneOf: I don't know the language. It is valid to have both ethernet-phy-ieee802.3-c45 and ethernet-phy-id[a-f0-9]{4}\\.[a-f0-9]{4}$". Does this oneOf prevent multiple compatible strings? Also, the general case is no compatible at all. > + - const: ethernet-phy-ieee802.3-c22 > +description: PHYs that implement IEEE802.3 clause 22 > + - const: ethernet-phy-ieee802.3-c45 > +description: PHYs that implement IEEE802.3 clause 45 > + - pattern: "^ethernet-phy-id[a-f0-9]{4}\\.[a-f0-9]{4}$" > +description: > + The first group of digits is the 16 bit Phy Identifier 1 > + register, this is the chip vendor OUI bits 3:18. The > + second group of digits is the Phy Identifier 2 register, > + this is the chip vendor OUI bits 19:24, followed by 10 > + bits of a vendor specific ID. Could we try to retain: > - If the PHY reports an incorrect ID (or none at all) then the > - "compatible" list may contain an entry with the correct PHY ID in the ... Using it is generally wrong, and that is not clear in the new text. > + > + reg: > +maxItems: 1 > +minimum: 0 > +maximum: 31 > +description: > + The ID number for the PHY. > + > + interrupts: > +maxItems: 1 > + > + max-speed: > +enum: > + - 10 > + - 100 > + - 1000 This is outdated in the text description. Any valid speed is supported, currently 10 - 20, as listed in phy_setting settings(). Andrew
[PATCH net-next] Revert "dpaa2-eth: configure the cache stashing amount on a queue"
This reverts commit f8b995853444aba9c16c1ccdccdd397527fde96d. The reverted change instructed the QMan hardware block to fetch RX frame annotation and beginning of frame data to cache before the core would read them. It turns out that in rare cases, it's possible that a QMan stashing transaction is delayed long enough such that, by the time it gets executed, the frame in question had already been dequeued by the core and software processing began on it. If the core manages to unmap the frame buffer _before_ the stashing transaction is executed, an SMMU exception will be raised. Unfortunately there is no easy way to work around this while keeping the performance advantages brought by QMan stashing, so disable it altogether. Signed-off-by: Ioana Radulescu --- drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c | 7 +-- 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c index 63b1ecc1..28a6faa 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c @@ -2479,14 +2479,9 @@ static int setup_rx_flow(struct dpaa2_eth_priv *priv, queue.destination.type = DPNI_DEST_DPCON; queue.destination.priority = 1; queue.user_context = (u64)(uintptr_t)fq; - queue.flc.stash_control = 1; - queue.flc.value &= 0xFFC0; - /* 01 01 00 - data, annotation, flow context */ - queue.flc.value |= 0x14; err = dpni_set_queue(priv->mc_io, 0, priv->mc_token, DPNI_QUEUE_RX, 0, fq->flowid, -DPNI_QUEUE_OPT_USER_CTX | DPNI_QUEUE_OPT_DEST | -DPNI_QUEUE_OPT_FLC, +DPNI_QUEUE_OPT_USER_CTX | DPNI_QUEUE_OPT_DEST, &queue); if (err) { dev_err(dev, "dpni_set_queue(RX) failed\n"); -- 2.7.4
Re: [PATCH 1/8] dt-bindings: net: Add YAML schemas for the generic Ethernet options
> > +link-gpios: > > + description: > > +GPIO to determine if the link is up > > Only 1? Hi Rob Yes, only one. Andrew
Re: [PATCH v2 bpf-next 0/3] bpf: increase jmp sequence limit
On 05/22/2019 05:14 AM, Alexei Starovoitov wrote: > Patch 1 - jmp sequence limit > Patch 2 - improve existing tests > Patch 3 - add pyperf-based realistic bpf program that takes advantage > of higher limit and use it as a stress test > > v1->v2: fixed nit in patch 3. added Andrii's acks > > Alexei Starovoitov (3): > bpf: bump jmp sequence limit > selftests/bpf: adjust verifier scale test > selftests/bpf: add pyperf scale test > > kernel/bpf/verifier.c | 7 +- > .../bpf/prog_tests/bpf_verif_scale.c | 31 +- > tools/testing/selftests/bpf/progs/pyperf.h| 268 ++ > tools/testing/selftests/bpf/progs/pyperf100.c | 4 + > tools/testing/selftests/bpf/progs/pyperf180.c | 4 + > tools/testing/selftests/bpf/progs/pyperf50.c | 4 + > tools/testing/selftests/bpf/test_verifier.c | 31 +- > 7 files changed, 319 insertions(+), 30 deletions(-) > create mode 100644 tools/testing/selftests/bpf/progs/pyperf.h > create mode 100644 tools/testing/selftests/bpf/progs/pyperf100.c > create mode 100644 tools/testing/selftests/bpf/progs/pyperf180.c > create mode 100644 tools/testing/selftests/bpf/progs/pyperf50.c > Looks good, applied, thanks!
Re: [PATCH 2/8] dt-bindings: net: Add a YAML schemas for the generic PHY options
On Thu, May 23, 2019 at 4:57 AM Maxime Ripard wrote: > > The networking PHYs have a number of available device tree properties that > can be used in their device tree node. Add a YAML schemas for those. > > Signed-off-by: Maxime Ripard > --- > Documentation/devicetree/bindings/net/ethernet-phy.yaml | 148 +- > Documentation/devicetree/bindings/net/phy.txt | 80 +- > 2 files changed, 149 insertions(+), 79 deletions(-) > create mode 100644 Documentation/devicetree/bindings/net/ethernet-phy.yaml > > diff --git a/Documentation/devicetree/bindings/net/ethernet-phy.yaml > b/Documentation/devicetree/bindings/net/ethernet-phy.yaml > new file mode 100644 > index ..eb79ee6db977 > --- /dev/null > +++ b/Documentation/devicetree/bindings/net/ethernet-phy.yaml > @@ -0,0 +1,148 @@ > +# SPDX-License-Identifier: GPL-2.0 > +%YAML 1.2 > +--- > +$id: http://devicetree.org/schemas/net/ethernet-phy.yaml# > +$schema: http://devicetree.org/meta-schemas/core.yaml# > + > +title: Ethernet PHY Generic Binding > + > +maintainers: > + - David S. Miller > + > +properties: > + $nodename: > +pattern: "^ethernet-phy(@[a-f0-9])?$" > + > + compatible: > +oneOf: > + - const: ethernet-phy-ieee802.3-c22 > +description: PHYs that implement IEEE802.3 clause 22 > + - const: ethernet-phy-ieee802.3-c45 > +description: PHYs that implement IEEE802.3 clause 45 > + - pattern: "^ethernet-phy-id[a-f0-9]{4}\\.[a-f0-9]{4}$" > +description: > + The first group of digits is the 16 bit Phy Identifier 1 > + register, this is the chip vendor OUI bits 3:18. The > + second group of digits is the Phy Identifier 2 register, > + this is the chip vendor OUI bits 19:24, followed by 10 > + bits of a vendor specific ID. > + > + reg: > +maxItems: 1 > +minimum: 0 > +maximum: 31 min/max need to be under 'items'. I don't think these would ever be valid if the type is an array. I've modified the meta-schema to catch this. > +description: > + The ID number for the PHY. > + > + interrupts: > +maxItems: 1 > + > + max-speed: > +enum: > + - 10 > + - 100 > + - 1000 > +description: > + Maximum PHY supported speed in Mbits / seconds. > + > + broken-turn-around: > +$ref: /schemas/types.yaml#definitions/flag > +description: > + If set, indicates the PHY device does not correctly release > + the turn around line low at the end of a MDIO transaction. > + > + enet-phy-lane-swap: > +$ref: /schemas/types.yaml#definitions/flag > +description: > + If set, indicates the PHY will swap the TX/RX lanes to > + compensate for the board being designed with the lanes > + swapped. > + > + eee-broken-100tx: > +$ref: /schemas/types.yaml#definitions/flag > +description: > + Mark the corresponding energy efficient ethernet mode as > + broken and request the ethernet to stop advertising it. > + > + eee-broken-1000t: > +$ref: /schemas/types.yaml#definitions/flag > +description: > + Mark the corresponding energy efficient ethernet mode as > + broken and request the ethernet to stop advertising it. > + > + eee-broken-10gt: > +$ref: /schemas/types.yaml#definitions/flag > +description: > + Mark the corresponding energy efficient ethernet mode as > + broken and request the ethernet to stop advertising it. > + > + eee-broken-1000kx: > +$ref: /schemas/types.yaml#definitions/flag > +description: > + Mark the corresponding energy efficient ethernet mode as > + broken and request the ethernet to stop advertising it. > + > + eee-broken-10gkx4: > +$ref: /schemas/types.yaml#definitions/flag > +description: > + Mark the corresponding energy efficient ethernet mode as > + broken and request the ethernet to stop advertising it. > + > + eee-broken-10gkr: > +$ref: /schemas/types.yaml#definitions/flag > +description: > + Mark the corresponding energy efficient ethernet mode as > + broken and request the ethernet to stop advertising it. > + > + phy-is-integrated: > +$ref: /schemas/types.yaml#definitions/flag > +description: > + If set, indicates that the PHY is integrated into the same > + physical package as the Ethernet MAC. If needed, muxers > + should be configured to ensure the integrated PHY is > + used. The absence of this property indicates the muxers > + should be configured so that the external PHY is used. > + > + resets: > +maxItems: 1 > + > + reset-names: > +const: phy > + > + reset-gpios: > +description: > + The GPIO phandle and specifier for the PHY reset signal. maxItems: 1 I have a meta-schema change to catch this, but It will require updates to some existing cases. > + > + reset-assert-us: > +description: > + Delay after the reset was asserted in microseconds. If this > + property is missing
Re: [PATCH 6/8] dt-bindings: net: stmmac: Convert the binding to a schemas
On Thu, May 23, 2019 at 11:25:09AM +, Jose Abreu wrote: > From: Maxime Ripard > Date: Thu, May 23, 2019 at 12:07:15 > > > You can then run make dtbs_check, and those YAML files will be used to > > validate that any devicetree using those properties are doing it > > properly. That implies having the right node names, properties, types, > > ranges of values when relevant, and so on. > > Thanks but how can one that's developing know which bindings it shall use? I'm not quite sure what you mean here. Are you talking about which file to use, or which property are required, or something else? > Is this not parsed/prettified and displayed in some kind of webpage ? Not at the moment, but it's one of the things that are made much easier by using a formal data format. > Just that now that the TXT is gone its kind of "strange" to look at YAML > instead of plain text and develop/use the bindings. Well, it's kind of the point though. Free-form text was impossible to parse in a generic way, and you couldn't build any generic tools upon it. YAML provides that. Maxime -- Maxime Ripard, Bootlin Embedded Linux and Kernel engineering https://bootlin.com signature.asc Description: PGP signature
Re: [PATCH 6/8] dt-bindings: net: stmmac: Convert the binding to a schemas
Hi Maxime On 5/23/19 11:56 AM, Maxime Ripard wrote: Switch the STMMAC / Synopsys DesignWare MAC controller binding to a YAML schema to enable the DT validation. Signed-off-by: Maxime Ripard --- First, thanks a lot for this patch. Just one question: We could add ranges for some properties in order to avoid "bad value" for a property. If I understand correctly you do it only for snps,dwxgmac, snps,dwxgmac-2.10 and st,spear600-gmac. Why not do it for all supported IPs ? (Maybe it is something that we could add later) Documentation/devicetree/bindings/net/snps,dwmac.yaml | 344 +++- Documentation/devicetree/bindings/net/stmmac.txt | 179 +-- 2 files changed, 345 insertions(+), 178 deletions(-) create mode 100644 Documentation/devicetree/bindings/net/snps,dwmac.yaml diff --git a/Documentation/devicetree/bindings/net/snps,dwmac.yaml b/Documentation/devicetree/bindings/net/snps,dwmac.yaml new file mode 100644 index ..be3ada5121e1 --- /dev/null +++ b/Documentation/devicetree/bindings/net/snps,dwmac.yaml @@ -0,0 +1,344 @@ +# SPDX-License-Identifier: GPL-2.0 +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/net/snps,dwmac.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Synopsys DesignWare MAC Device Tree Bindings + +maintainers: + - Alexandre Torgue + - Giuseppe Cavallaro + - Jose Abreu + +properties: + compatible: +oneOf: + - const: snps,dwmac + - const: snps,dwmac-3.50a + - const: snps,dwmac-3.610 + - const: snps,dwmac-3.70a + - const: snps,dwmac-3.710 + - const: snps,dwmac-4.00 + - const: snps,dwmac-4.10a + - const: snps,dwxgmac + - const: snps,dwxgmac-2.10 + - const: st,spear600-gmac +description: Deprecated + + reg: +maxItems: 1 + + interrupts: +minItems: 1 +maxItems: 3 +items: + - description: Combined signal for various interrupt events + - description: The interrupt to manage the remote wake-up packet detection + - description: The interrupt that occurs when Rx exits the LPI state + + interrupt-names: +minItems: 1 +maxItems: 3 +items: + - const: macirq + - const: eth_wake_irq + - const: eth_lpi + + clocks: +minItems: 1 +maxItems: 3 +items: + - description: GMAC main clock + - description: Peripheral registers interface clock + - description: + PTP reference clock. This clock is used for programming the + Timestamp Addend Register. If not passed then the system + clock will be used and this is fine on some platforms. + + clock-names: +additionalItems: true +contains: + enum: +- stmmaceth +- pclk +- ptp_ref + + resets: +maxItems: 1 +description: + MAC Reset signal. + + reset-names: +const: stmmaceth + + snps,axi-config: +$ref: /schemas/types.yaml#definitions/phandle +description: + AXI BUS Mode parameters. Phandle to a node that can contain the + following properties +* snps,lpi_en, enable Low Power Interface +* snps,xit_frm, unlock on WoL +* snps,wr_osr_lmt, max write outstanding req. limit +* snps,rd_osr_lmt, max read outstanding req. limit +* snps,kbbe, do not cross 1KiB boundary. +* snps,blen, this is a vector of supported burst length. +* snps,fb, fixed-burst +* snps,mb, mixed-burst +* snps,rb, rebuild INCRx Burst + + snps,mtl-rx-config: +$ref: /schemas/types.yaml#definitions/phandle +description: + Multiple RX Queues parameters. Phandle to a node that can + contain the following properties +* snps,rx-queues-to-use, number of RX queues to be used in the + driver +* Choose one of these RX scheduling algorithms + * snps,rx-sched-sp, Strict priority + * snps,rx-sched-wsp, Weighted Strict priority +* For each RX queue + * Choose one of these modes +* snps,dcb-algorithm, Queue to be enabled as DCB +* snps,avb-algorithm, Queue to be enabled as AVB + * snps,map-to-dma-channel, Channel to map + * Specifiy specific packet routing +* snps,route-avcp, AV Untagged Control packets +* snps,route-ptp, PTP Packets +* snps,route-dcbcp, DCB Control Packets +* snps,route-up, Untagged Packets +* snps,route-multi-broad, Multicast & Broadcast Packets + * snps,priority, RX queue priority (Range 0x0 to 0xF) + + snps,mtl-tx-config: +$ref: /schemas/types.yaml#definitions/phandle +description: + Multiple TX Queues parameters. Phandle to a node that can + contain the following properties +* snps,tx-queues-to-use, number of TX queues to be used in the + driver +* Choose one of these TX scheduling algorithms + * snps,tx-sched-wrr, Weighted Round Robin + * snps,tx-sched-wfq, Weighted
[PATCH net] cxgb4: offload VLAN flows regardless of VLAN ethtype
VLAN flows never get offloaded unless ivlan_vld is set in filter spec. It's not compulsory for vlan_ethtype to be set. So, always enable ivlan_vld bit for offloading VLAN flows regardless of vlan_ethtype is set or not. Fixes: ad9af3e09c (cxgb4: add tc flower match support for vlan) Signed-off-by: Raju Rangoju --- drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c index 6e2d80008a79..cfaf8f618d1f 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c @@ -197,6 +197,9 @@ static void cxgb4_process_flow_match(struct net_device *dev, fs->val.ivlan = vlan_tci; fs->mask.ivlan = vlan_tci_mask; + fs->val.ivlan_vld = 1; + fs->mask.ivlan_vld = 1; + /* Chelsio adapters use ivlan_vld bit to match vlan packets * as 802.1Q. Also, when vlan tag is present in packets, * ethtype match is used then to match on ethtype of inner @@ -207,8 +210,6 @@ static void cxgb4_process_flow_match(struct net_device *dev, * ethtype value with ethtype of inner header. */ if (fs->val.ethtype == ETH_P_8021Q) { - fs->val.ivlan_vld = 1; - fs->mask.ivlan_vld = 1; fs->val.ethtype = 0; fs->mask.ethtype = 0; } -- 2.9.5
Re: [RFC PATCH net-next 0/9] Decoupling PHYLINK from struct net_device
Hello Ioana, On Thu, 23 May 2019 01:20:36 + Ioana Ciornei wrote: >Following two separate discussion threads in: > https://www.spinics.net/lists/netdev/msg569087.html >and: > https://www.spinics.net/lists/netdev/msg570450.html > >PHYLINK was reworked in order to add a new "raw" interface, alongside >the one based on struct net_device. The raw interface works by passing >structures to the owner of the phylink instance through a blocking >notifier call. > >The event API exposed by the new notifier mechanism is a 1:1 mapping to >the existing PHYLINK mac_ops, plus the PHYLINK fixed-link callback. > >PHYLIB (which PHYLINK uses) was reworked to the extent that it does not >crash when connecting to a PHY and the net_device pointer is NULL. > >Lastly, DSA has been reworked in its way that it handles PHYs for ports >that lack a net_device (CPU and DSA ports). For these, it was >previously using PHYLIB and is now using the PHYLINK raw API. Thanks for this series, I've tested it and it does address the particular issue I having, with the DSA fixed-link CPU port. I've been able to use 2500 and 1 Mbps links with your series, on a board using a 88e6390X (so, the mv88e6xxx driver). I've also tested it on boards using PPv2, which uses phylink, and didn't spot any issue, so this looks fine. I'll be happy to test future versions, Thanks, Maxime
Re: Bug or mis configuration for mlx5e lag and multipath
On 20/05/2019 04:53, wenxu wrote: > Hi Roi & Saeed, > > I just test the mlx5e lag and mutipath feature. There are some suituation the > outgoing can't be offloaded. > > ovs configureation as following. > > # ovs-vsctl show > dfd71dfb-6e22-423e-b088-d2022103af6b > Bridge "br0" > Port "mlx_pf0vf0" > Interface "mlx_pf0vf0" > Port gre > Interface gre > type: gre > options: {key="1000", local_ip="172.168.152.75", > remote_ip="172.168.152.241"} > Port "br0" > Interface "br0" > type: internal > > set the mlx5e driver: > > > modprobe mlx5_core > echo 0 > /sys/class/net/eth2/device/sriov_numvfs > echo 0 > /sys/class/net/eth3/device/sriov_numvfs > echo 2 > /sys/class/net/eth2/device/sriov_numvfs > echo 2 > /sys/class/net/eth3/device/sriov_numvfs > lspci -nn | grep Mellanox > echo :81:00.2 > /sys/bus/pci/drivers/mlx5_core/unbind > echo :81:00.3 > /sys/bus/pci/drivers/mlx5_core/unbind > echo :81:03.6 > /sys/bus/pci/drivers/mlx5_core/unbind > echo :81:03.7 > /sys/bus/pci/drivers/mlx5_core/unbind > > devlink dev eswitch set pci/:81:00.0 mode switchdev encap enable > devlink dev eswitch set pci/:81:00.1 mode switchdev encap enable > > modprobe bonding mode=802.3ad miimon=100 lacp_rate=1 > ip l del dev bond0 > ifconfig mlx_p0 down > ifconfig mlx_p1 down > ip l add dev bond0 type bond mode 802.3ad > ifconfig bond0 172.168.152.75/24 up > echo 1 > /sys/class/net/bond0/bonding/xmit_hash_policy > ip l set dev mlx_p0 master bond0 > ip l set dev mlx_p1 master bond0 > ifconfig mlx_p0 up > ifconfig mlx_p1 up > > systemctl start openvswitch > ovs-vsctl set Open_vSwitch . other_config:hw-offload=true > systemctl restart openvswitch > > > mlx_pf0vf0 is assigned to vm. The tc rule show in_hw > > # tc filter ls dev mlx_pf0vf0 ingress > filter protocol ip pref 2 flower > filter protocol ip pref 2 flower handle 0x1 > dst_mac 8e:c0:bd:bf:72:c3 > src_mac 52:54:00:00:12:75 > eth_type ipv4 > ip_tos 0/3 > ip_flags nofrag > in_hw > action order 1: tunnel_key set > src_ip 172.168.152.75 > dst_ip 172.168.152.241 > key_id 1000 pipe > index 2 ref 1 bind 1 > > action order 2: mirred (Egress Redirect to device gre_sys) stolen > index 2 ref 1 bind 1 > > In the vm: the mlx5e driver enable xps default (by the way I think it is > better not enable xps in default kernel can select queue by each flow), in > the lag mode different vf queue associate with hw PF. > > with command taskset -c 2 ping 10.0.0.241 > > the packet can be offloaded , the outgoing pf is mlx_p0 > > but with command taskset -c 1 ping 10.0.0.241 > > the packet can't be offloaded, I can capture the packet on the mlx_pf0vf0, > the outgoing pf is mlx_p1. Althrough the tc flower rule show in_hw > > > I check with the driver both mlx_pf0vf0 and peer(mlx_p1) install the tc rule > success > > I think it's a problem of lag mode. Or I miss some configureation? > > > BR > > wenxu > > > > > Hi, we need to verify the driver detected to be in lag mode and duplicated the offload rule to both eswitches. do you see lag map messages in dmesg? something like "lag map port 1:1 port 2:2" this is to make sure the driver actually in lag mode. in this mode a rule added to mlx_pf0vf0 will be added to esw of pf0 and esw of pf1. then when u send a packet it could be handled in esw0 or esw1 if the rule is not in esw1 then it wont be offloaded when using pf1. thanks, Roi
Re: [patch net-next 3/7] mlxfw: Propagate error messages through extack
On 5/23/19 3:45 AM, Jiri Pirko wrote: > @@ -57,11 +58,13 @@ static int mlxfw_fsm_state_wait(struct mlxfw_dev > *mlxfw_dev, u32 fwhandle, > if (fsm_state_err != MLXFW_FSM_STATE_ERR_OK) { > pr_err("Firmware flash failed: %s\n", > mlxfw_fsm_state_err_str[fsm_state_err]); > + NL_SET_ERR_MSG_MOD(extack, "Firmware flash failed"); > return -EINVAL; > } > if (curr_fsm_state != fsm_state) { > if (--times == 0) { > pr_err("Timeout reached on FSM state change"); > + NL_SET_ERR_MSG_MOD(extack, "Timeout reached on FSM > state change"); FSM? Is the meaning obvious to users? > return -ETIMEDOUT; > } > msleep(MLXFW_FSM_STATE_WAIT_CYCLE_MS); > @@ -76,7 +79,8 @@ static int mlxfw_fsm_state_wait(struct mlxfw_dev > *mlxfw_dev, u32 fwhandle, > > static int mlxfw_flash_component(struct mlxfw_dev *mlxfw_dev, >u32 fwhandle, > - struct mlxfw_mfa2_component *comp) > + struct mlxfw_mfa2_component *comp, > + struct netlink_ext_ack *extack) > { > u16 comp_max_write_size; > u8 comp_align_bits; > @@ -96,6 +100,7 @@ static int mlxfw_flash_component(struct mlxfw_dev > *mlxfw_dev, > if (comp->data_size > comp_max_size) { > pr_err("Component %d is of size %d which is bigger than limit > %d\n", > comp->index, comp->data_size, comp_max_size); > + NL_SET_ERR_MSG_MOD(extack, "Component is which is bigger than > limit"); Need to drop 'is which'. ... > @@ -156,6 +163,7 @@ static int mlxfw_flash_components(struct mlxfw_dev > *mlxfw_dev, u32 fwhandle, > &component_count); > if (err) { > pr_err("Could not find device PSID in MFA2 file\n"); > + NL_SET_ERR_MSG_MOD(extack, "Could not find device PSID in MFA2 > file"); same here, is PSID understood by user?
bonding-devel mail list?
Hello, Noted the old bonding-devel mail list at sourceforge is no more, is there an alternate? Chasing whether a bond of bonds has an issue my testing hasn't revealed. Thanks, -- Bill Carlson Anything is possible, given Time and Money.
Re: [PATCH 6/8] dt-bindings: net: stmmac: Convert the binding to a schemas
On Thu, May 23, 2019 at 4:57 AM Maxime Ripard wrote: > > Switch the STMMAC / Synopsys DesignWare MAC controller binding to a YAML > schema to enable the DT validation. You picked an easy one. ;) > > Signed-off-by: Maxime Ripard > --- > Documentation/devicetree/bindings/net/snps,dwmac.yaml | 344 +++- > Documentation/devicetree/bindings/net/stmmac.txt | 179 +-- > 2 files changed, 345 insertions(+), 178 deletions(-) > create mode 100644 Documentation/devicetree/bindings/net/snps,dwmac.yaml > > diff --git a/Documentation/devicetree/bindings/net/snps,dwmac.yaml > b/Documentation/devicetree/bindings/net/snps,dwmac.yaml > new file mode 100644 > index ..be3ada5121e1 > --- /dev/null > +++ b/Documentation/devicetree/bindings/net/snps,dwmac.yaml > @@ -0,0 +1,344 @@ > +# SPDX-License-Identifier: GPL-2.0 > +%YAML 1.2 > +--- > +$id: http://devicetree.org/schemas/net/snps,dwmac.yaml# > +$schema: http://devicetree.org/meta-schemas/core.yaml# > + > +title: Synopsys DesignWare MAC Device Tree Bindings > + > +maintainers: > + - Alexandre Torgue > + - Giuseppe Cavallaro > + - Jose Abreu > + > +properties: > + compatible: > +oneOf: > + - const: snps,dwmac > + - const: snps,dwmac-3.50a > + - const: snps,dwmac-3.610 > + - const: snps,dwmac-3.70a > + - const: snps,dwmac-3.710 > + - const: snps,dwmac-4.00 > + - const: snps,dwmac-4.10a > + - const: snps,dwxgmac > + - const: snps,dwxgmac-2.10 > + - const: st,spear600-gmac > +description: Deprecated Like the other, just make this an enum. Though, what to do on deprecated things? If we expect dts files to be updated, then we should remove or disallow in the schema (e.g. 'prop: false' for properties). The issue with updating dts files, is it may break old kernels with new dtbs. > + > + reg: > +maxItems: 1 > + > + interrupts: > +minItems: 1 > +maxItems: 3 > +items: > + - description: Combined signal for various interrupt events > + - description: The interrupt to manage the remote wake-up packet > detection > + - description: The interrupt that occurs when Rx exits the LPI state > + > + interrupt-names: > +minItems: 1 > +maxItems: 3 > +items: > + - const: macirq > + - const: eth_wake_irq > + - const: eth_lpi > + > + clocks: > +minItems: 1 > +maxItems: 3 > +items: > + - description: GMAC main clock > + - description: Peripheral registers interface clock > + - description: > + PTP reference clock. This clock is used for programming the > + Timestamp Addend Register. If not passed then the system > + clock will be used and this is fine on some platforms. > + > + clock-names: > +additionalItems: true That's not ideal, but what I'd expect on this one. > +contains: > + enum: > +- stmmaceth > +- pclk > +- ptp_ref > + > + resets: > +maxItems: 1 > +description: > + MAC Reset signal. > + > + reset-names: > +const: stmmaceth > + > + snps,axi-config: > +$ref: /schemas/types.yaml#definitions/phandle > +description: > + AXI BUS Mode parameters. Phandle to a node that can contain the > + following properties > +* snps,lpi_en, enable Low Power Interface > +* snps,xit_frm, unlock on WoL > +* snps,wr_osr_lmt, max write outstanding req. limit > +* snps,rd_osr_lmt, max read outstanding req. limit > +* snps,kbbe, do not cross 1KiB boundary. > +* snps,blen, this is a vector of supported burst length. > +* snps,fb, fixed-burst > +* snps,mb, mixed-burst > +* snps,rb, rebuild INCRx Burst This obviously needs its own schema, but that can come latter. > + > + snps,mtl-rx-config: > +$ref: /schemas/types.yaml#definitions/phandle > +description: > + Multiple RX Queues parameters. Phandle to a node that can > + contain the following properties > +* snps,rx-queues-to-use, number of RX queues to be used in the > + driver > +* Choose one of these RX scheduling algorithms > + * snps,rx-sched-sp, Strict priority > + * snps,rx-sched-wsp, Weighted Strict priority > +* For each RX queue > + * Choose one of these modes > +* snps,dcb-algorithm, Queue to be enabled as DCB > +* snps,avb-algorithm, Queue to be enabled as AVB > + * snps,map-to-dma-channel, Channel to map > + * Specifiy specific packet routing > +* snps,route-avcp, AV Untagged Control packets > +* snps,route-ptp, PTP Packets > +* snps,route-dcbcp, DCB Control Packets > +* snps,route-up, Untagged Packets > +* snps,route-multi-broad, Multicast & Broadcast Packets > + * snps,priority, RX queue priority (Range 0x0 to 0xF) This too. > + > + snps,mtl-tx-config: > +$ref: /schemas/types.yaml#definitions/phan
Re: [RFC] vsock: proposal to support multiple transports at runtime
On Tue, May 14, 2019 at 10:15:43AM +0200, Stefano Garzarella wrote: > Hi guys, > I'm currently interested on implement a multi-transport support for VSOCK in > order to handle nested VMs. > > As Stefan suggested me, I started to look at this discussion: > https://lkml.org/lkml/2017/8/17/551 > Below I tried to summarize a proposal for a discussion, following the ideas > from Dexuan, Jorgen, and Stefan. > > > We can define two types of transport that we have to handle at the same time > (e.g. in a nested VM we would have both types of transport running together): > > - 'host side transport', it runs in the host and it is used to communicate > with > the guests of a specific hypervisor (KVM, VMWare or HyperV) > > Should we support multiple 'host side transport' running at the same time? > > - 'guest side transport'. it runs in the guest and it is used to communicate > with the host transport I find this terminology confusing. Perhaps "host->guest" (your 'host side transport') and "guest->host" (your 'guest side transport') is clearer? Or maybe the nested virtualization terminology of L2 transport (your 'host side transport') and L0 transport (your 'guest side transport')? Here we are the L1 guest and L0 is the host and L2 is our nested guest. > > > The main goal is to find a way to decide what transport use in these cases: > 1. connect() / sendto() > > a. use the 'host side transport', if the destination is the guest > (dest_cid > VMADDR_CID_HOST). > If we want to support multiple 'host side transport' running at the > same time, we should assign CIDs uniquely across all transports. > In this way, a packet generated by the host side will get directed > to the appropriate transport based on the CID The multiple host side transport case is unlikely to be necessary on x86 where only one hypervisor uses VMX at any given time. But eventually it may happen so it's wise to at least allow it in the design. > > b. use the 'guest side transport', if the destination is the host > (dest_cid == VMADDR_CID_HOST) Makes sense to me. > > > 2. listen() / recvfrom() > > a. use the 'host side transport', if the socket is bound to > VMADDR_CID_HOST, or it is bound to VMADDR_CID_ANY and there is no > guest transport. > We could also define a new VMADDR_CID_LISTEN_FROM_GUEST in order to > address this case. > If we want to support multiple 'host side transport' running at the > same time, we should find a way to allow an application to bound a > specific host transport (e.g. adding new VMADDR_CID_LISTEN_FROM_KVM, > VMADDR_CID_LISTEN_FROM_VMWARE, VMADDR_CID_LISTEN_FROM_HYPERV) Hmm...VMADDR_CID_LISTEN_FROM_KVM, VMADDR_CID_LISTEN_FROM_VMWARE, VMADDR_CID_LISTEN_FROM_HYPERV isn't very flexible. What if my service should only be available to a subset of VMware VMs? Instead it might be more appropriate to use network namespaces to create independent AF_VSOCK addressing domains. Then you could have two separate groups of VMware VMs and selectively listen to just one group. > > b. use the 'guest side transport', if the socket is bound to local CID > different from the VMADDR_CID_HOST (guest CID get with > IOCTL_VM_SOCKETS_GET_LOCAL_CID), or it is bound to VMADDR_CID_ANY > (to be backward compatible). > Also in this case, we could define a new VMADDR_CID_LISTEN_FROM_HOST. Two additional topics: 1. How will loading af_vsock.ko change? In particular, can an application create a socket in af_vsock.ko without any loaded transport? Can it enter listen state without any loaded transport (this seems useful with VMADDR_CID_ANY)? 2. Does your proposed behavior match VMware's existing nested vsock semantics? signature.asc Description: PGP signature
Re: [PATCH bpf-next v2 1/3] bpf: implement bpf_send_signal() helper
On 05/22/2019 07:39 AM, Yonghong Song wrote: > This patch tries to solve the following specific use case. > > Currently, bpf program can already collect stack traces > through kernel function get_perf_callchain() > when certain events happens (e.g., cache miss counter or > cpu clock counter overflows). But such stack traces are > not enough for jitted programs, e.g., hhvm (jited php). > To get real stack trace, jit engine internal data structures > need to be traversed in order to get the real user functions. > > bpf program itself may not be the best place to traverse > the jit engine as the traversing logic could be complex and > it is not a stable interface either. > > Instead, hhvm implements a signal handler, > e.g. for SIGALARM, and a set of program locations which > it can dump stack traces. When it receives a signal, it will > dump the stack in next such program location. > > Such a mechanism can be implemented in the following way: > . a perf ring buffer is created between bpf program > and tracing app. > . once a particular event happens, bpf program writes > to the ring buffer and the tracing app gets notified. > . the tracing app sends a signal SIGALARM to the hhvm. > > But this method could have large delays and causing profiling > results skewed. > > This patch implements bpf_send_signal() helper to send > a signal to hhvm in real time, resulting in intended stack traces. > > Signed-off-by: Yonghong Song > --- > include/uapi/linux/bpf.h | 17 +- > kernel/trace/bpf_trace.c | 67 > 2 files changed, 83 insertions(+), 1 deletion(-) > > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > index 63e0cf66f01a..68d4470523a0 100644 > --- a/include/uapi/linux/bpf.h > +++ b/include/uapi/linux/bpf.h > @@ -2672,6 +2672,20 @@ union bpf_attr { > * 0 on success. > * > * **-ENOENT** if the bpf-local-storage cannot be found. > + * > + * int bpf_send_signal(u32 sig) > + * Description > + * Send signal *sig* to the current task. > + * Return > + * 0 on success or successfully queued. > + * > + * **-EBUSY** if work queue under nmi is full. > + * > + * **-EINVAL** if *sig* is invalid. > + * > + * **-EPERM** if no permission to send the *sig*. > + * > + * **-EAGAIN** if bpf program can try again. > */ > #define __BPF_FUNC_MAPPER(FN)\ > FN(unspec), \ > @@ -2782,7 +2796,8 @@ union bpf_attr { > FN(strtol), \ > FN(strtoul),\ > FN(sk_storage_get), \ > - FN(sk_storage_delete), > + FN(sk_storage_delete), \ > + FN(send_signal), > > /* integer value in 'imm' field of BPF_CALL instruction selects which helper > * function eBPF program intends to call > diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c > index f92d6ad5e080..f8cd0db7289f 100644 > --- a/kernel/trace/bpf_trace.c > +++ b/kernel/trace/bpf_trace.c > @@ -567,6 +567,58 @@ static const struct bpf_func_proto > bpf_probe_read_str_proto = { > .arg3_type = ARG_ANYTHING, > }; > > +struct send_signal_irq_work { > + struct irq_work irq_work; > + u32 sig; > +}; > + > +static DEFINE_PER_CPU(struct send_signal_irq_work, send_signal_work); > + > +static void do_bpf_send_signal(struct irq_work *entry) > +{ > + struct send_signal_irq_work *work; > + > + work = container_of(entry, struct send_signal_irq_work, irq_work); > + group_send_sig_info(work->sig, SEND_SIG_PRIV, current, PIDTYPE_TGID); > +} > + > +BPF_CALL_1(bpf_send_signal, u32, sig) > +{ > + struct send_signal_irq_work *work = NULL; > + > + /* Similar to bpf_probe_write_user, task needs to be > + * in a sound condition and kernel memory access be > + * permitted in order to send signal to the current > + * task. > + */ > + if (unlikely(current->flags & (PF_KTHREAD | PF_EXITING))) > + return -EPERM; > + if (unlikely(uaccess_kernel())) > + return -EPERM; > + if (unlikely(!nmi_uaccess_okay())) > + return -EPERM; > + > + if (in_nmi()) { Hm, bit confused, can't this only be done out of process context in general since only there current points to e.g. hhvm? I'm probably missing something. Could you elaborate? > + work = this_cpu_ptr(&send_signal_work); > + if (work->irq_work.flags & IRQ_WORK_BUSY) > + return -EBUSY; > + > + work->sig = sig; > + irq_work_queue(&work->irq_work); > + return 0; > + } > + > + return group_send_sig_info(sig, SEND_SIG_PRIV, current, PIDTYPE_TGID); > + Nit: extra newline slipped in > +} > + > +static const struct bpf_func_proto bpf_send_signal_proto = { > + .func = bpf_send_signal, > + .gpl_only = false, > + .ret_type = RET_INTEGER, > +
Re: [PATCH net-next] selftests: pmtu: Simplify cleanup and namespace names
On 5/23/19 1:58 AM, Stefano Brivio wrote: > Hi David, > > On Wed, 22 May 2019 12:11:06 -0700 > David Ahern wrote: > >> From: David Ahern >> >> The point of the pause-on-fail argument is to leave the setup as is after >> a test fails to allow a user to debug why it failed. Move the cleanup >> after posting the result to the user to make it so. >> >> Random names for the namespaces are not user friendly when trying to >> debug a failure. Make them simpler and more direct for the tests. Run >> cleanup at the beginning to ensure they are cleaned up if they already >> exist. > > The reasons for picking per-instance unique names were: > > - one can run multiple instances of the script in parallel. I > couldn't trigger any bug this way *so far*, though > > - cleanup might fail because of e.g. device reference count leaks (this > happened quite frequently in the past), which are anyway visible in > kernel logs. Unique names avoid the need to reboot > > Sure, it's a trade-off with usability, and I also see the value of > having fixed names, so I'm fine with this too. I just wanted to make > sure you considered these points. > > By the way, the comment to nsname() (that I would keep, it's still > somewhat convenient) is now inconsistent. > I have been using the namespace override for a while now. I did consider impacts to the above, but my thinking is this: exceptions are per FIB entry (per fib6_nh with my latest patch set, but point still holds), FIB entries are per FIB table, FIB tables are per network namespace. Running multiple pmtu.sh sessions in parallel can not trigger an interdependent bug because of that separation. The cleanup within a namespace teardown (reference count leaks) should not be affected. Now that we have good set of functional tests, we do need more complex tests but those will still be contained within the namespace separation. If you look at my current patch set on the list I add an icmp_redirect test script. It actually does redirect, verify, mtu on top of redirect, verify and then resets and inverts the order - going after an exception entry with an update for both use cases. For the pmtu script, perhaps the next step is something as simple as configuring the setup and routing once and then run all of the individual tests (or multiple of them) to generate multiple exceptions within a single FIB table and then tests to generate multiple exceptions with different addresses per entry.
[PATCH] bpf: sockmap, fix use after free from sleep in psock backlog workqueue
Backlog work for psock (sk_psock_backlog) might sleep while waiting for memory to free up when sending packets. However, while sleeping the socket may be closed and removed from the map by the user space side. This breaks an assumption in sk_stream_wait_memory, which expects the wait queue to be still there when it wakes up resulting in a use-after-free shown below. To fix his mark sendmsg as MSG_DONTWAIT to avoid the sleep altogether. We already set the flag for the sendpage case but we missed the case were sendmsg is used. Sockmap is currently the only user of skb_send_sock_locked() so only the sockmap paths should be impacted. == BUG: KASAN: use-after-free in remove_wait_queue+0x31/0x70 Write of size 8 at addr 888069a0c4e8 by task kworker/0:2/110 CPU: 0 PID: 110 Comm: kworker/0:2 Not tainted 5.0.0-rc2-00335-g28f9d1a3d4fe-dirty #14 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-2.fc27 04/01/2014 Workqueue: events sk_psock_backlog Call Trace: print_address_description+0x6e/0x2b0 ? remove_wait_queue+0x31/0x70 kasan_report+0xfd/0x177 ? remove_wait_queue+0x31/0x70 ? remove_wait_queue+0x31/0x70 remove_wait_queue+0x31/0x70 sk_stream_wait_memory+0x4dd/0x5f0 ? sk_stream_wait_close+0x1b0/0x1b0 ? wait_woken+0xc0/0xc0 ? tcp_current_mss+0xc5/0x110 tcp_sendmsg_locked+0x634/0x15d0 ? tcp_set_state+0x2e0/0x2e0 ? __kasan_slab_free+0x1d1/0x230 ? kmem_cache_free+0x70/0x140 ? sk_psock_backlog+0x40c/0x4b0 ? process_one_work+0x40b/0x660 ? worker_thread+0x82/0x680 ? kthread+0x1b9/0x1e0 ? ret_from_fork+0x1f/0x30 ? check_preempt_curr+0xaf/0x130 ? iov_iter_kvec+0x5f/0x70 ? kernel_sendmsg_locked+0xa0/0xe0 skb_send_sock_locked+0x273/0x3c0 ? skb_splice_bits+0x180/0x180 ? start_thread+0xe0/0xe0 ? update_min_vruntime.constprop.27+0x88/0xc0 sk_psock_backlog+0xb3/0x4b0 ? strscpy+0xbf/0x1e0 process_one_work+0x40b/0x660 worker_thread+0x82/0x680 ? process_one_work+0x660/0x660 kthread+0x1b9/0x1e0 ? __kthread_create_on_node+0x250/0x250 ret_from_fork+0x1f/0x30 Fixes: 20bf50de3028c ("skbuff: Function to send an skbuf on a socket") Reported-by: Jakub Sitnicki Tested-by: Jakub Sitnicki Signed-off-by: John Fastabend --- net/core/skbuff.c |1 + 1 file changed, 1 insertion(+) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index e89be62..c3b03c5 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2337,6 +2337,7 @@ int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset, kv.iov_base = skb->data + offset; kv.iov_len = slen; memset(&msg, 0, sizeof(msg)); + msg.flags = MSG_DONTWAIT; ret = kernel_sendmsg_locked(sk, &msg, &kv, 1, slen); if (ret <= 0)
Re: [PATCH 7/8] dt-bindings: net: sun7i-gmac: Convert the binding to a schemas
On Thu, May 23, 2019 at 4:57 AM Maxime Ripard wrote: > > Switch our Allwinner A20 GMAC controller binding to a YAML schema to enable > the DT validation. Since that controller is based on a Synopsys IP, let's > add the validation to that schemas with a bunch of conditionals. > > Signed-off-by: Maxime Ripard > --- > Documentation/devicetree/bindings/net/allwinner,sun7i-a20-gmac.txt | 27 > --- > Documentation/devicetree/bindings/net/snps,dwmac.yaml | 45 > + > 2 files changed, 45 insertions(+), 27 deletions(-) > delete mode 100644 > Documentation/devicetree/bindings/net/allwinner,sun7i-a20-gmac.txt I think it would be better to keep these as separate files and either include snps,dwmac.yaml from it or only add the compatible to snps,dwmac.yaml. Rob
Re: [PATCH net] sk_msg: Keep reference on socket file while psock lives
[...] > > Thanks for taking a look at it. Setting MSG_DONTWAIT works great for > me. No more crashes in sk_stream_wait_memory. I've tested it on top of > current bpf-next (f49aa1de9836). Here's my: > > Tested-by: Jakub Sitnicki > > The actual I've tested is below, for completeness. > > BTW. I've ran into another crash which I haven't seen before while > testing sockmap-echo, but it looks unrelated: > > https://lore.kernel.org/netdev/20190522100142.28925-1-ja...@cloudflare.com/ > > -Jakub > > --- 8< --- > > diff --git a/net/core/skbuff.c b/net/core/skbuff.c > index e89be6282693..4a7c656b195b 100644 > --- a/net/core/skbuff.c > +++ b/net/core/skbuff.c > @@ -2337,6 +2337,7 @@ int skb_send_sock_locked(struct sock *sk, struct > sk_buff *skb, int offset, > kv.iov_base = skb->data + offset; > kv.iov_len = slen; > memset(&msg, 0, sizeof(msg)); > + msg.msg_flags = MSG_DONTWAIT; > > ret = kernel_sendmsg_locked(sk, &msg, &kv, 1, slen); > if (ret <= 0) I went ahead and submitted this feel free to add your signed-off-by. Thanks, John
Re: [PATCH bpf-next v2 1/3] bpf: implement bpf_send_signal() helper
On 5/23/19 8:41 AM, Daniel Borkmann wrote: > On 05/22/2019 07:39 AM, Yonghong Song wrote: >> This patch tries to solve the following specific use case. >> >> Currently, bpf program can already collect stack traces >> through kernel function get_perf_callchain() >> when certain events happens (e.g., cache miss counter or >> cpu clock counter overflows). But such stack traces are >> not enough for jitted programs, e.g., hhvm (jited php). >> To get real stack trace, jit engine internal data structures >> need to be traversed in order to get the real user functions. >> >> bpf program itself may not be the best place to traverse >> the jit engine as the traversing logic could be complex and >> it is not a stable interface either. >> >> Instead, hhvm implements a signal handler, >> e.g. for SIGALARM, and a set of program locations which >> it can dump stack traces. When it receives a signal, it will >> dump the stack in next such program location. >> >> Such a mechanism can be implemented in the following way: >>. a perf ring buffer is created between bpf program >> and tracing app. >>. once a particular event happens, bpf program writes >> to the ring buffer and the tracing app gets notified. >>. the tracing app sends a signal SIGALARM to the hhvm. >> >> But this method could have large delays and causing profiling >> results skewed. >> >> This patch implements bpf_send_signal() helper to send >> a signal to hhvm in real time, resulting in intended stack traces. >> >> Signed-off-by: Yonghong Song >> --- >> include/uapi/linux/bpf.h | 17 +- >> kernel/trace/bpf_trace.c | 67 >> 2 files changed, 83 insertions(+), 1 deletion(-) >> >> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h >> index 63e0cf66f01a..68d4470523a0 100644 >> --- a/include/uapi/linux/bpf.h >> +++ b/include/uapi/linux/bpf.h >> @@ -2672,6 +2672,20 @@ union bpf_attr { >>* 0 on success. >>* >>* **-ENOENT** if the bpf-local-storage cannot be found. >> + * >> + * int bpf_send_signal(u32 sig) >> + * Description >> + * Send signal *sig* to the current task. >> + * Return >> + * 0 on success or successfully queued. >> + * >> + * **-EBUSY** if work queue under nmi is full. >> + * >> + * **-EINVAL** if *sig* is invalid. >> + * >> + * **-EPERM** if no permission to send the *sig*. >> + * >> + * **-EAGAIN** if bpf program can try again. >>*/ >> #define __BPF_FUNC_MAPPER(FN) \ >> FN(unspec), \ >> @@ -2782,7 +2796,8 @@ union bpf_attr { >> FN(strtol), \ >> FN(strtoul),\ >> FN(sk_storage_get), \ >> -FN(sk_storage_delete), >> +FN(sk_storage_delete), \ >> +FN(send_signal), >> >> /* integer value in 'imm' field of BPF_CALL instruction selects which >> helper >>* function eBPF program intends to call >> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c >> index f92d6ad5e080..f8cd0db7289f 100644 >> --- a/kernel/trace/bpf_trace.c >> +++ b/kernel/trace/bpf_trace.c >> @@ -567,6 +567,58 @@ static const struct bpf_func_proto >> bpf_probe_read_str_proto = { >> .arg3_type = ARG_ANYTHING, >> }; >> >> +struct send_signal_irq_work { >> +struct irq_work irq_work; >> +u32 sig; >> +}; >> + >> +static DEFINE_PER_CPU(struct send_signal_irq_work, send_signal_work); >> + >> +static void do_bpf_send_signal(struct irq_work *entry) >> +{ >> +struct send_signal_irq_work *work; >> + >> +work = container_of(entry, struct send_signal_irq_work, irq_work); >> +group_send_sig_info(work->sig, SEND_SIG_PRIV, current, PIDTYPE_TGID); >> +} >> + >> +BPF_CALL_1(bpf_send_signal, u32, sig) >> +{ >> +struct send_signal_irq_work *work = NULL; >> + >> +/* Similar to bpf_probe_write_user, task needs to be >> + * in a sound condition and kernel memory access be >> + * permitted in order to send signal to the current >> + * task. >> + */ >> +if (unlikely(current->flags & (PF_KTHREAD | PF_EXITING))) >> +return -EPERM; >> +if (unlikely(uaccess_kernel())) >> +return -EPERM; >> +if (unlikely(!nmi_uaccess_okay())) >> +return -EPERM; >> + >> +if (in_nmi()) { > > Hm, bit confused, can't this only be done out of process context in > general since only there current points to e.g. hhvm? I'm probably > missing something. Could you elaborate? That is true. If in nmi, it is out of process context and in nmi context, we use an irq_work here since group_send_sig_info() has spinlock inside. The bpf program (e.g., a perf_event program) needs to check it is with right current (e.g., by pid) before calling this helper. Does this address your question? > >> +work = this_cpu_ptr(&send_signal_work); >> +if (work->irq_work.flags & IRQ_WORK_BUSY) >> +
Re: [PATCH bpf-next 06/12] selftests/bpf: add tests for libbpf's hashmap
On 05/22, Andrii Nakryiko wrote: > On Wed, May 22, 2019 at 3:15 PM Andrii Nakryiko > wrote: > > > > On Wed, May 22, 2019 at 1:31 PM Stanislav Fomichev wrote: > > > > > > On 05/22, Andrii Nakryiko wrote: > > > > Test all APIs for internal hashmap implementation. > > > > > > > > Signed-off-by: Andrii Nakryiko > > > > --- > > > > tools/testing/selftests/bpf/.gitignore | 1 + > > > > tools/testing/selftests/bpf/Makefile | 2 +- > > > > tools/testing/selftests/bpf/test_hashmap.c | 382 + > > > > 3 files changed, 384 insertions(+), 1 deletion(-) > > > > create mode 100644 tools/testing/selftests/bpf/test_hashmap.c > > > > > > > > diff --git a/tools/testing/selftests/bpf/.gitignore > > > > b/tools/testing/selftests/bpf/.gitignore > > > > index dd5d69529382..138b6c063916 100644 > > > > --- a/tools/testing/selftests/bpf/.gitignore > > > > +++ b/tools/testing/selftests/bpf/.gitignore > > > > @@ -35,3 +35,4 @@ test_sysctl > > > > alu32 > > > > libbpf.pc > > > > libbpf.so.* > > > > +test_hashmap > > > > diff --git a/tools/testing/selftests/bpf/Makefile > > > > b/tools/testing/selftests/bpf/Makefile > > > > index 66f2dca1dee1..ddae06498a00 100644 > > > > --- a/tools/testing/selftests/bpf/Makefile > > > > +++ b/tools/testing/selftests/bpf/Makefile > > > > @@ -23,7 +23,7 @@ TEST_GEN_PROGS = test_verifier test_tag test_maps > > > > test_lru_map test_lpm_map test > > > > test_align test_verifier_log test_dev_cgroup test_tcpbpf_user \ > > > > test_sock test_btf test_sockmap test_lirc_mode2_user > > > > get_cgroup_id_user \ > > > > test_socket_cookie test_cgroup_storage test_select_reuseport > > > > test_section_names \ > > > > - test_netcnt test_tcpnotify_user test_sock_fields test_sysctl > > > > + test_netcnt test_tcpnotify_user test_sock_fields test_sysctl > > > > test_hashmap > > > > > > > > BPF_OBJ_FILES = $(patsubst %.c,%.o, $(notdir $(wildcard progs/*.c))) > > > > TEST_GEN_FILES = $(BPF_OBJ_FILES) > > > > diff --git a/tools/testing/selftests/bpf/test_hashmap.c > > > > b/tools/testing/selftests/bpf/test_hashmap.c > > > > new file mode 100644 > > > > index ..b64094c981e3 > > > > --- /dev/null > > > [..] > > > > +++ b/tools/testing/selftests/bpf/test_hashmap.c > > > Any reason against putting it in bpf/prog_tests? No need for your own > > > CHECK macro and no Makefile changes. > > > > I didn't know they are special and I assumed they are mostly for > > loading BPF programs and testing them. I'll check that set up and will > > move there, if it makes sense. > > Seems like my assumption was right. I converted these hashmap tests to > progs_test, and each CHECK usage emits either SUCCESS or FAIL message, > spamming output like this: > > test_hashmap_generic:PASS:test_hashmap_generic 0 nsec > test_hashmap_generic:PASS:test_hashmap_generic 0 nsec > test_hashmap_generic:PASS:test_hashmap_generic 0 nsec > test_hashmap_generic:PASS:test_hashmap_generic 0 nsec > test_hashmap_generic:PASS:test_hashmap_generic 0 nsec > test_hashmap_generic:PASS:test_hashmap_generic 0 nsec > test_hashmap_generic:PASS:test_hashmap_generic 0 nsec > test_hashmap_generic:PASS:test_hashmap_generic 0 nsec > test_hashmap_generic:PASS:test_hashmap_generic 0 nsec > test_hashmap_generic:PASS:test_hashmap_generic 0 nsec > > > CHECK also expects __u32 duration to be in the scope, so I need to add > it just to make macro happy. > > It feels like at the moment this "framework" is not yet well-suited > for generic tests, but more towards loading BPF program and checking > (once per BPF program) that everything went fine. I'll keep it as is > for now. It won't be hard to convert it into prog_tests, once we > develop it a bit more. Oh, yeah, it's mostly for the tests which do bpf_prog_test_run. Might not be a good fit if you're not loading any bpf progs, agreed. > > > > @@ -0,0 +1,382 @@ > > > > +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) > > > > + > > > > +/* > > > > + * Tests for libbpf's hashmap. > > > > + * > > > > + * Copyright (c) 2019 Facebook > > > > + */ > > > > +#include > > > > +#include > > > > +#include > > > > +#include "hashmap.h" > > > > + > > > > +#define CHECK(condition, format...) ({ > > > > \ > > > > + int __ret = !!(condition); \ > > > > + if (__ret) {\ > > > > + fprintf(stderr, "%s:%d:FAIL ", __func__, __LINE__); \ > > > > + fprintf(stderr, format);\ > > > > + } \ > > > > + __ret; \ > > > > +}) > > > > + > > > > +size_t hash_fn(const void *k, void *ctx) > > > > +{ > > > > + return (long)k; > > > > +} > > > > + > > > > +bool equal_fn(const void *a, const void *b, void *ctx) > > > > +{ > > > > + return (long)a ==
Re: [PATCH bpf] selftests: bpf: add zero extend checks for ALU32 and/or/xor
On Thu, 23 May 2019 at 16:31, Jiong Wang wrote: > > > > On 23 May 2019, at 15:02, Daniel Borkmann wrote: > > > > On 05/23/2019 08:38 AM, Y Song wrote: > >> On Wed, May 22, 2019 at 1:46 PM Björn Töpel wrote: > >>> On Wed, 22 May 2019 at 20:13, Y Song wrote: > On Wed, May 22, 2019 at 2:25 AM Björn Töpel > wrote: > > > > Add three tests to test_verifier/basic_instr that make sure that the > > high 32-bits of the destination register is cleared after an ALU32 > > and/or/xor. > > > > Signed-off-by: Björn Töpel > > I think the patch intends for bpf-next, right? The patch itself looks > good to me. > Acked-by: Yonghong Song > >>> > >>> Thank you. Actually, it was intended for the bpf tree, as a test > >>> follow up for this [1] fix. > >> Then maybe you want to add a Fixes tag and resubmit? > > > > Why would the test case need a fixes tag? It's common practice that we have > > BPF fixes that we queue to bpf tree along with kselftest test cases related > > to them. Therefore, applied as well, thanks for following up! > > > > Björn, in my email from the fix, I mentioned we should have test snippets > > ideally for all of the alu32 insns to not miss something falling through the > > cracks when JITs get added or changed. If you have some cycles to add the > > remaining missing ones, that would be much appreciated. > > Björn, > > If you don’t have time, I can take this alu32 test case follow up as well. > Jiong, that would be great. Thank you. I'd guess it would take much longer for me to do it (gmail.com time vs intel.com time ;-)). Björn > Regards, > Jiong > > > > > Thanks, > > Daniel >
[PATCH net-next v4] net: sched: Introduce act_ctinfo action
ctinfo is a new tc filter action module. It is designed to restore information contained in conntrack marks to other places. At present it can restore DSCP values to IPv4/6 diffserv fields and also copy conntrack marks to skb marks. As such the 2nd function effectively replaces the existing act_connmark module The DSCP restoration is intended for use and has been found useful for restoring ingress classifications based on egress classifications across links that bleach or otherwise change DSCP, typically home ISP Internet links. Restoring DSCP on ingress on the WAN link allows qdiscs such as CAKE to shape inbound packets according to policies that are easier to indicate on egress. Ingress classification is traditionally a challenging task since iptables rules haven't yet run and tc filter/eBPF programs are pre-NAT lookups, hence are unable to see internal IPv4 addresses as used on the typical home masquerading gateway. ctinfo understands the following parameters: dscp dscpmask[/statemask] dscpmask - a 32 bit mask of at least 6 contiguous bits and indicates where ctinfo will find the DSCP bits stored in the conntrack mark. statemask - a 32 bit mask of (usually) 1 bit length, outside the area specified by dscpmask. This represents a conditional operation flag whereby the DSCP is only restored if the flag is set. This is useful to implement a 'one shot' iptables based classification where the 'complicated' iptables rules are only run once to classify the connection on initial (egress) packet and subsequent packets are all marked/restored with the same DSCP. A mask of zero disables the conditional behaviour ie. the conntrack mark DSCP bits are always restored to the ip diffserv field (assuming the conntrack entry is found & the skb is an ipv4/ipv6 type) mark [mask] mark - enables copying the conntrack connmark value to the skb mark mask - a 32 bit mask applied to the mark to mask out bit unwanted for restoration. The CAKE qdisc for example understands both DSCP and 'tin' classification stored the mark, thus act_ctinfo may be used to restore both aspects of classification for CAKE in one action. A default mask of 0x is applied if not specified. zone - conntrack zone control - action related control (reclassify | pipe | drop | continue | ok | goto chain ) e.g. dscp 0xfc00/0x0100 |0xFCconntrack mark00---| | Bits 31-26 | bit 25 | bit24 |~~~ Bit 0| | DSCP | unused | flag |unused | |---0x01---00---| | | | | ---| Conditional flag v only restore if set |-ip diffserv-| | 6 bits | |-| e.g. mark 0x00ff |0x00conntrack markff---| | Bits 31-24 | | | DSCP & flag| | |---| | | v |skb mark---| | | | | |---| Signed-off-by: Kevin Darbyshire-Bryant --- v2 - add equivalent connmark functionality with an enhancement to accept a mask pass statistics for each sub-function as individual netlink attributes and stop (ab)using overlimits, drops update the testing config correctly v3 - fix a licensing silly & tidy up GPL boilerplate v4 - drop stray copy paste inline reverse christmas tree local vars include/net/tc_act/tc_ctinfo.h| 28 ++ include/uapi/linux/pkt_cls.h | 1 + include/uapi/linux/tc_act/tc_ctinfo.h | 43 +++ net/sched/Kconfig | 17 + net/sched/Makefile| 1 + net/sched/act_ctinfo.c| 402 ++ tools/testing/selftests/tc-testing/config | 1 + 7 files changed, 493 insertions(+) create mode 100644 include/net/tc_act/tc_ctinfo.h create mode 100644 include/uapi/linux/tc_act/tc_ctinfo.h create mode 100644 net/sched/act_ctinfo.c diff --git a/include/net/tc_act/tc_ctinfo.h b/include/net/tc_act/tc_ctinfo.h new file mode 100644 index ..87334120dcb6 --- /dev/null +++ b/include/net/tc_act/tc_ctinfo.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __NET_TC_CTINFO_H +#define __NET_TC_CTINFO_H + +#include + +struct tcf_ctinfo_params { + struct net *net; + u32 dscpmask; + u32 dscpstatemask; + u32 markmask; + u16 zone; + u8 mode; + u8 dscpmaskshift; + struct rcu_head rcu; +}; + +struct tcf_ctinfo { + struct tc_action common; + struct tcf_ctinfo_params __rcu *params; + u64 stats_dscp_set; + u64 stats_dscp_error; + u64 stats_mark_set; +}; + +#define to_ctinfo(a) ((struct tcf_ctinfo *)a) + +#endif /* __NET_TC_CTINFO_H */ diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt
Re: [RESEND][PATCH] Fix MACsec kernel panics, oopses and bugs
From: Andreas Steinmetz Date: Thu, 23 May 2019 09:46:15 +0200 > MACsec causes oopses followed by a kernel panic when attached directly or > indirectly to a bridge. It causes erroneous > checksum messages when attached to vxlan. When I did investigate I did find > skb leaks, apparent skb mis-handling and > superfluous code. The attached patch fixes all MACsec misbehaviour I could > find. As I am no kernel developer somebody > with sufficient kernel network knowledge should verify and correct the patch > where necessary. > > Signed-off-by: Andreas Steinmetz Subject lines should be of the form: [PATCH $DST_TREE] $subsystem_prefix: Description. Where $DST_TREE here would be "net" and $subsystem_prefix would be "macsec". > + /* FIXME: any better way to prevent calls to netdev_rx_csum_fault? */ > + skb->csum_complete_sw = 1; Create a helper for this in linux/skbuff.h with very clear and clean comments explaining what is going on.
Re: [PATCH v3 net-next 0/3] flow_offload: Re-add per-action statistics
On Thu, 23 May 2019 09:19:49 -0400, Jamal Hadi Salim wrote: > On 2019-05-22 6:20 p.m., Jakub Kicinski wrote: > > On Wed, 22 May 2019 22:37:16 +0100, Edward Cree wrote: > >> * removed RFC tags > > > > Why? There is still no upstream user for this (my previous > > objections of this being only partially correct aside). > > > IIRC your point was to get the dumping to work with RTM_GETACTION. Yup. > That would still work here, no? There will be some latency > based on the frequency of hardware->kernel stats updates. I don't think so, I think the stats are only updated on classifier dumps in Ed's code. But we can't be 100% sure without seeing driver code.
Re: [patch net-next v2] devlink: add warning in case driver does not set port type
From: Jiri Pirko Date: Thu, 23 May 2019 10:43:35 +0200 > From: Jiri Pirko > > Prevent misbehavior of drivers who would not set port type for longer > period of time. Drivers should always set port type. Do WARN if that > happens. > > Note that it is perfectly fine to temporarily not have the type set, > during initialization and port type change. > > Signed-off-by: Jiri Pirko > --- > v1->v2: > - Don't warn on DSA and CPU ports. Applied.
Re: [PATCH v7 bpf-next 01/16] bpf: verifier: mark verified-insn with sub-register zext flag
On Thu, May 23, 2019 at 03:28:15PM +0100, Jiong Wang wrote: > > > On 23 May 2019, at 03:07, Alexei Starovoitov > > wrote: > > > > On Wed, May 22, 2019 at 07:54:57PM +0100, Jiong Wang wrote: > >> eBPF ISA specification requires high 32-bit cleared when low 32-bit > >> sub-register is written. This applies to destination register of ALU32 etc. > >> JIT back-ends must guarantee this semantic when doing code-gen. x86_64 and > >> AArch64 ISA has the same semantics, so the corresponding JIT back-end > >> doesn't need to do extra work. > >> > >> However, 32-bit arches (arm, x86, nfp etc.) and some other 64-bit arches > >> (PowerPC, SPARC etc) need to do explicit zero extension to meet this > >> requirement, otherwise code like the following will fail. > >> > >> u64_value = (u64) u32_value > >> ... other uses of u64_value > >> > >> This is because compiler could exploit the semantic described above and > >> save those zero extensions for extending u32_value to u64_value, these JIT > >> back-ends are expected to guarantee this through inserting extra zero > >> extensions which however could be a significant increase on the code size. > >> Some benchmarks show there could be ~40% sub-register writes out of total > >> insns, meaning at least ~40% extra code-gen. > >> > >> One observation is these extra zero extensions are not always necessary. > >> Take above code snippet for example, it is possible u32_value will never be > >> casted into a u64, the value of high 32-bit of u32_value then could be > >> ignored and extra zero extension could be eliminated. > >> > >> This patch implements this idea, insns defining sub-registers will be > >> marked when the high 32-bit of the defined sub-register matters. For > >> those unmarked insns, it is safe to eliminate high 32-bit clearnace for > >> them. > >> > >> Algo: > >> - Split read flags into READ32 and READ64. > >> > >> - Record index of insn that does sub-register write. Keep the index inside > >> reg state and update it during verifier insn walking. > >> > >> - A full register read on a sub-register marks its definition insn as > >> needing zero extension on dst register. > >> > >> A new sub-register write overrides the old one. > >> > >> - When propagating read64 during path pruning, also mark any insn defining > >> a sub-register that is read in the pruned path as full-register. > >> > >> Reviewed-by: Jakub Kicinski > >> Signed-off-by: Jiong Wang > >> --- > >> include/linux/bpf_verifier.h | 14 +++- > >> kernel/bpf/verifier.c| 175 > >> +++ > >> 2 files changed, 173 insertions(+), 16 deletions(-) > >> > >> diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h > >> index 1305ccb..60fb54e 100644 > >> --- a/include/linux/bpf_verifier.h > >> +++ b/include/linux/bpf_verifier.h > >> @@ -36,9 +36,11 @@ > >> */ > >> enum bpf_reg_liveness { > >>REG_LIVE_NONE = 0, /* reg hasn't been read or written this branch */ > >> - REG_LIVE_READ, /* reg was read, so we're sensitive to initial value */ > >> - REG_LIVE_WRITTEN, /* reg was written first, screening off later reads */ > >> - REG_LIVE_DONE = 4, /* liveness won't be updating this register anymore > >> */ > >> + REG_LIVE_READ32 = 0x1, /* reg was read, so we're sensitive to initial > >> value */ > >> + REG_LIVE_READ64 = 0x2, /* likewise, but full 64-bit content matters */ > >> + REG_LIVE_READ = REG_LIVE_READ32 | REG_LIVE_READ64, > >> + REG_LIVE_WRITTEN = 0x4, /* reg was written first, screening off later > >> reads */ > >> + REG_LIVE_DONE = 0x8, /* liveness won't be updating this register > >> anymore */ > >> }; > >> > >> struct bpf_reg_state { > >> @@ -131,6 +133,11 @@ struct bpf_reg_state { > >> * pointing to bpf_func_state. > >> */ > >>u32 frameno; > >> + /* Tracks subreg definition. The stored value is the insn_idx of the > >> + * writing insn. This is safe because subreg_def is used before any insn > >> + * patching which only happens after main verification finished. > >> + */ > >> + s32 subreg_def; > >>enum bpf_reg_liveness live; > >> }; > >> > >> @@ -232,6 +239,7 @@ struct bpf_insn_aux_data { > >>int ctx_field_size; /* the ctx field size for load insn, maybe 0 */ > >>int sanitize_stack_off; /* stack slot to be cleared */ > >>bool seen; /* this insn was processed by the verifier */ > >> + bool zext_dst; /* this insn zero extends dst reg */ > >>u8 alu_state; /* used in combination with alu_limit */ > >>unsigned int orig_idx; /* original instruction index */ > >> }; > >> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c > >> index 95f93544..0efccf8 100644 > >> --- a/kernel/bpf/verifier.c > >> +++ b/kernel/bpf/verifier.c > >> @@ -981,6 +981,7 @@ static void mark_reg_not_init(struct bpf_verifier_env > >> *env, > >>__mark_reg_not_init(regs + regno); > >> } > >> > >> +#define DEF_NOT_SUBREG(-1) > >> static void init_reg_state(struct bpf_verifier_env *env,
Re: [PATCH net-next] selftests: pmtu: Simplify cleanup and namespace names
On Thu, 23 May 2019 09:41:59 -0600 David Ahern wrote: > I have been using the namespace override for a while now. I did consider > impacts to the above, but my thinking is this: exceptions are per FIB > entry (per fib6_nh with my latest patch set, but point still holds), FIB > entries are per FIB table, FIB tables are per network namespace. Running > multiple pmtu.sh sessions in parallel can not trigger an interdependent > bug because of that separation. The cleanup within a namespace teardown > (reference count leaks) should not be affected. I see, I guess it makes sense. > Now that we have good set of functional tests, we do need more complex > tests but those will still be contained within the namespace separation. > If you look at my current patch set on the list I add an icmp_redirect > test script. It actually does redirect, verify, mtu on top of redirect, > verify and then resets and inverts the order - going after an exception > entry with an update for both use cases. > > For the pmtu script, perhaps the next step is something as simple as > configuring the setup and routing once and then run all of the > individual tests (or multiple of them) to generate multiple exceptions > within a single FIB table and then tests to generate multiple exceptions > with different addresses per entry. I think, especially given your new icmp_redirect test script, that another sensible next step would be turning the setup part in pmtu.sh into some kind of library (also including the VRF setup) that could be sourced from both scripts. Right now, that looks like a lot of duplication. -- Stefano
Re: [PATCH bpf-next v2 2/3] libbpf: add bpf_object__load_xattr() API function to pass log_level
On 5/23/19 3:54 AM, Quentin Monnet wrote: > libbpf was recently made aware of the log_level attribute for programs, > used to specify the level of information expected to be dumped by the > verifier. > > Create an API function to pass additional attributes when loading a > bpf_object, so we can set this log_level value in programs when loading > them, and so that so that applications relying on libbpf but not calling "so that so that" => "so that" > bpf_prog_load_xattr() can also use that feature. Do not fully understand the above statement. From the code below, I did not see how the non-zero log_level can be set for bpf_program without bpf_prog_load_xattr(). Maybe I miss something? > > v2: > - We are in a new cycle, bump libbpf extraversion number. > > Signed-off-by: Quentin Monnet > Reviewed-by: Jakub Kicinski > --- > tools/lib/bpf/Makefile | 2 +- > tools/lib/bpf/libbpf.c | 20 +--- > tools/lib/bpf/libbpf.h | 6 ++ > tools/lib/bpf/libbpf.map | 5 + > 4 files changed, 29 insertions(+), 4 deletions(-) > > diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile > index a2aceadf68db..9312066a1ae3 100644 > --- a/tools/lib/bpf/Makefile > +++ b/tools/lib/bpf/Makefile > @@ -3,7 +3,7 @@ > > BPF_VERSION = 0 > BPF_PATCHLEVEL = 0 > -BPF_EXTRAVERSION = 3 > +BPF_EXTRAVERSION = 4 > > MAKEFLAGS += --no-print-directory > > diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c > index 197b574406b3..1c6fb7a3201e 100644 > --- a/tools/lib/bpf/libbpf.c > +++ b/tools/lib/bpf/libbpf.c > @@ -,7 +,7 @@ static bool bpf_program__is_function_storage(struct > bpf_program *prog, > } > > static int > -bpf_object__load_progs(struct bpf_object *obj) > +bpf_object__load_progs(struct bpf_object *obj, int log_level) > { > size_t i; > int err; > @@ -2230,6 +2230,7 @@ bpf_object__load_progs(struct bpf_object *obj) > for (i = 0; i < obj->nr_programs; i++) { > if (bpf_program__is_function_storage(&obj->programs[i], obj)) > continue; > + obj->programs[i].log_level = log_level; > err = bpf_program__load(&obj->programs[i], > obj->license, > obj->kern_version); > @@ -2381,10 +2382,14 @@ int bpf_object__unload(struct bpf_object *obj) > return 0; > } > > -int bpf_object__load(struct bpf_object *obj) > +int bpf_object__load_xattr(struct bpf_object_load_attr *attr) > { > + struct bpf_object *obj; > int err; > > + if (!attr) > + return -EINVAL; > + obj = attr->obj; > if (!obj) > return -EINVAL; > > @@ -2397,7 +2402,7 @@ int bpf_object__load(struct bpf_object *obj) > > CHECK_ERR(bpf_object__create_maps(obj), err, out); > CHECK_ERR(bpf_object__relocate(obj), err, out); > - CHECK_ERR(bpf_object__load_progs(obj), err, out); > + CHECK_ERR(bpf_object__load_progs(obj, attr->log_level), err, out); > > return 0; > out: > @@ -2406,6 +2411,15 @@ int bpf_object__load(struct bpf_object *obj) > return err; > } > > +int bpf_object__load(struct bpf_object *obj) > +{ > + struct bpf_object_load_attr attr = { > + .obj = obj, > + }; > + > + return bpf_object__load_xattr(&attr); > +} > + > static int check_path(const char *path) > { > char *cp, errmsg[STRERR_BUFSIZE]; > diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h > index c5ff00515ce7..e1c748db44f6 100644 > --- a/tools/lib/bpf/libbpf.h > +++ b/tools/lib/bpf/libbpf.h > @@ -89,8 +89,14 @@ LIBBPF_API int bpf_object__unpin_programs(struct > bpf_object *obj, > LIBBPF_API int bpf_object__pin(struct bpf_object *object, const char *path); > LIBBPF_API void bpf_object__close(struct bpf_object *object); > > +struct bpf_object_load_attr { > + struct bpf_object *obj; > + int log_level; > +}; > + > /* Load/unload object into/from kernel */ > LIBBPF_API int bpf_object__load(struct bpf_object *obj); > +LIBBPF_API int bpf_object__load_xattr(struct bpf_object_load_attr *attr); > LIBBPF_API int bpf_object__unload(struct bpf_object *obj); > LIBBPF_API const char *bpf_object__name(struct bpf_object *obj); > LIBBPF_API unsigned int bpf_object__kversion(struct bpf_object *obj); > diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map > index 673001787cba..6ce61fa0baf3 100644 > --- a/tools/lib/bpf/libbpf.map > +++ b/tools/lib/bpf/libbpf.map > @@ -164,3 +164,8 @@ LIBBPF_0.0.3 { > bpf_map_freeze; > btf__finalize_data; > } LIBBPF_0.0.2; > + > +LIBBPF_0.0.4 { > + global: > + bpf_object__load_xattr; > +} LIBBPF_0.0.3; >
Re: [PATCH bpf-next v2 1/3] tools: bpftool: add -d option to get debug output from libbpf
On Thu, May 23, 2019 at 3:54 AM Quentin Monnet wrote: > > libbpf has three levels of priority for output messages: warn, info, > debug. By default, debug output is not printed to the console. > > Add a new "--debug" (short name: "-d") option to bpftool to print libbpf > logs for all three levels. > > Internally, we simply use the function provided by libbpf to replace the > default printing function by one that prints logs regardless of their > level. > > v2: > - Remove the possibility to select the log-levels to use (v1 offered a > combination of "warn", "info" and "debug"). > - Rename option and offer a short name: -d|--debug. Such and option in CLI tools is usually called -v|--verbose, I'm wondering if it might be a better name choice? Btw, some tools also use -v, -vv and -vvv to define different levels of verbosity, which is something we can consider in the future, as it's backwards compatible. > - Add option description to all bpftool manual pages (instead of > bpftool-prog.rst only), as all commands use libbpf. > > Signed-off-by: Quentin Monnet > Reviewed-by: Jakub Kicinski > --- > tools/bpf/bpftool/Documentation/bpftool-btf.rst| 4 > tools/bpf/bpftool/Documentation/bpftool-cgroup.rst | 4 > .../bpf/bpftool/Documentation/bpftool-feature.rst | 4 > tools/bpf/bpftool/Documentation/bpftool-map.rst| 4 > tools/bpf/bpftool/Documentation/bpftool-net.rst| 4 > tools/bpf/bpftool/Documentation/bpftool-perf.rst | 4 > tools/bpf/bpftool/Documentation/bpftool-prog.rst | 4 > tools/bpf/bpftool/Documentation/bpftool.rst| 3 +++ > tools/bpf/bpftool/bash-completion/bpftool | 2 +- > tools/bpf/bpftool/main.c | 14 +- > 10 files changed, 45 insertions(+), 2 deletions(-) > > diff --git a/tools/bpf/bpftool/Documentation/bpftool-btf.rst > b/tools/bpf/bpftool/Documentation/bpftool-btf.rst > index 2dbc1413fabd..00668df1bf7a 100644 > --- a/tools/bpf/bpftool/Documentation/bpftool-btf.rst > +++ b/tools/bpf/bpftool/Documentation/bpftool-btf.rst > @@ -67,6 +67,10 @@ OPTIONS > -p, --pretty > Generate human-readable JSON output. Implies **-j**. > > + -d, --debug > + Print all logs available from libbpf, including debug-level > + information. > + > EXAMPLES > > **# bpftool btf dump id 1226** > diff --git a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst > b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst > index ac26876389c2..36807735e2a5 100644 > --- a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst > +++ b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst > @@ -113,6 +113,10 @@ OPTIONS > -f, --bpffs > Show file names of pinned programs. > > + -d, --debug > + Print all logs available from libbpf, including debug-level > + information. > + > EXAMPLES > > | > diff --git a/tools/bpf/bpftool/Documentation/bpftool-feature.rst > b/tools/bpf/bpftool/Documentation/bpftool-feature.rst > index 14180e887082..4d08f35034a2 100644 > --- a/tools/bpf/bpftool/Documentation/bpftool-feature.rst > +++ b/tools/bpf/bpftool/Documentation/bpftool-feature.rst > @@ -73,6 +73,10 @@ OPTIONS > -p, --pretty > Generate human-readable JSON output. Implies **-j**. > > + -d, --debug > + Print all logs available from libbpf, including debug-level > + information. > + > SEE ALSO > > **bpf**\ (2), > diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst > b/tools/bpf/bpftool/Documentation/bpftool-map.rst > index 13ef27b39f20..490b4501cb6e 100644 > --- a/tools/bpf/bpftool/Documentation/bpftool-map.rst > +++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst > @@ -152,6 +152,10 @@ OPTIONS > Do not automatically attempt to mount any virtual file > system > (such as tracefs or BPF virtual file system) when necessary. > > + -d, --debug > + Print all logs available from libbpf, including debug-level > + information. > + > EXAMPLES > > **# bpftool map show** > diff --git a/tools/bpf/bpftool/Documentation/bpftool-net.rst > b/tools/bpf/bpftool/Documentation/bpftool-net.rst > index 934580850f42..d8e5237a2085 100644 > --- a/tools/bpf/bpftool/Documentation/bpftool-net.rst > +++ b/tools/bpf/bpftool/Documentation/bpftool-net.rst > @@ -65,6 +65,10 @@ OPTIONS > -p, --pretty > Generate human-readable JSON output. Implies **-j**. > > + -d, --debug > + Print all logs available from libbpf, including debug-level > + information. > + > EXAMPLES > > > diff --git a/tools/bpf/bpftool/Documentation/bpftool-perf.rst > b/tools/bpf/bpftool/Documentation/bpftool-perf.rst > index 0c7576523a21..e252bd0bc434 100644 > --- a/tools/bpf/bpftool/Documentation/bpftool-
Re: [PATCH v3 net-next 0/3] flow_offload: Re-add per-action statistics
On 22/05/2019 23:20, Jakub Kicinski wrote: > On Wed, 22 May 2019 22:37:16 +0100, Edward Cree wrote: >> * removed RFC tags > Why? There is still no upstream user for this Well, patch #2 updates drivers to the changed API, which is kinda an "upstream user" if you squint... admittedly patch #1 is a bit dubious in that regard; I was kinda hoping someone on one of the existing drivers would either drop in a patch or at least point me at the info needed to write one for their HW, but no luck :-( My defence re patch #1 is that I'm not really adding a 'new feature' to the kernel, just restoring something that used to be there. It's a grey area and I'm waiting for the maintainer(s) to say yea or nay (Jiri noped v1 but that had a much bigger unused interface (the TC_CLSFLOWER_STATS_BYINDEX callback) so I'm still hoping). > (my previous objections of this being only partially correct aside). I didn't realise you still had outstanding objections, sorry. Regarding RTM_GETACTION dumping, it should be possible to add an offload callback based on this, which would just pass the action cookie to the driver. Then the driver just has to look the cookie up in its stats tables to find the counter. That would deal with the 'stale stats' issue. But right now, that really _would_ be an API without a user; still, I might be able to do that as an RFC patch to prove it's possible, would that help this series to go in as-is? -Ed
Re: [PATCH bpf-next 10/12] bpftool: add C output format option to btf dump subcommand
On Wed, 22 May 2019 21:43:43 -0700, Andrii Nakryiko wrote: > On Wed, May 22, 2019 at 6:23 PM Jakub Kicinski wrote: > > On Wed, 22 May 2019 17:58:23 -0700, Andrii Nakryiko wrote: > > > On Wed, May 22, 2019 at 5:25 PM Jakub Kicinski wrote: > > > > On Wed, 22 May 2019 12:50:51 -0700, Andrii Nakryiko wrote: > > > > > + * Copyright (C) 2019 Facebook > > > > > + */ > > > > > > > > > > #include > > > > > #include > > > > > @@ -340,11 +347,48 @@ static int dump_btf_raw(const struct btf *btf, > > > > > return 0; > > > > > } > > > > > > > > > > +static void btf_dump_printf(void *ctx, const char *fmt, va_list args) > > > > > +{ > > > > > + vfprintf(stdout, fmt, args); > > > > > +} > > > > > + > > > > > +static int dump_btf_c(const struct btf *btf, > > > > > + __u32 *root_type_ids, int root_type_cnt) > > > > > > > > Please break the line after static int. > > > > > > I don't mind, but it seems that prevalent formatting for such cases > > > (at least in bpftool code base) is aligning arguments and not break > > > static into separate line: > > > > > > // multi-line function definitions with static on the same line > > > $ rg '^static \w+.*\([^\)]*$' | wc -l > > > 45 > > > // multi-line function definitions with static on separate line > > > $ rg '^static \w+[^\(\{;]*$' | wc -l > > > 12 > > > > > > So I don't mind changing, but which one is canonical way of formatting? > > > > Not really, just my preference :) > > I'll stick to majority :) I feel like it's also a preferred style in > libbpf, so I'd rather converge to that. Majority is often wrong or at least lazy. But yeah, this is a waste of time. Do whatever. You also use inline keyword in C files in your libbpf patches.. I think kernel style rules should apply. > > In my experience having the return type on a separate line if its > > longer than a few chars is the simplest rule for consistent and good > > looking code. > > > > > > > + d = btf_dump__new(btf, NULL, NULL, btf_dump_printf); > > > > > + if (IS_ERR(d)) > > > > > + return PTR_ERR(d); > > > > > + > > > > > + if (root_type_cnt) { > > > > > + for (i = 0; i < root_type_cnt; i++) { > > > > > + err = btf_dump__dump_type(d, root_type_ids[i]); > > > > > + if (err) > > > > > + goto done; > > > > > + } > > > > > + } else { > > > > > + int cnt = btf__get_nr_types(btf); > > > > > + > > > > > + for (id = 1; id <= cnt; id++) { > > > > > + err = btf_dump__dump_type(d, id); > > > > > + if (err) > > > > > + goto done; > > > > > + } > > > > > + } > > > > > + > > > > > +done: > > > > > + btf_dump__free(d); > > > > > + return err; > > > > > > > > What do we do for JSON output? > > > > > > Still dump C syntax. What do you propose? Error out if json enabled? > > > > I wonder. Letting it just print C is going to confuse anything that > > just feeds the output into a JSON parser. I'd err on the side of > > returning an error, we can always relax that later if we find a use > > case of returning C syntax via JSON. > > Ok, I'll emit error (seems like pr_err automatically handles JSON > output, which is very nice). Thanks > > > > > if (!btf) { > > > > > err = btf__get_from_id(btf_id, &btf); > > > > > if (err) { > > > > > @@ -444,7 +498,10 @@ static int do_dump(int argc, char **argv) > > > > > } > > > > > } > > > > > > > > > > - dump_btf_raw(btf, root_type_ids, root_type_cnt); > > > > > + if (dump_c) > > > > > + dump_btf_c(btf, root_type_ids, root_type_cnt); > > > > > + else > > > > > + dump_btf_raw(btf, root_type_ids, root_type_cnt); > > > > > > > > > > done: > > > > > close(fd); > > > > > @@ -460,7 +517,7 @@ static int do_help(int argc, char **argv) > > > > > } > > > > > > > > > > fprintf(stderr, > > > > > - "Usage: %s btf dump BTF_SRC\n" > > > > > + "Usage: %s btf dump BTF_SRC [c]\n" > > > > > > > > bpftool generally uses formats. So perhaps we could do > > > > something like "[format raw|c]" here for consistency, defaulting to > > > > raw? > > > > > > That's not true for options, though. I see that at cgroup, prog, and > > > some map subcommands (haven't checked all other) just accept a list of > > > options without extra identifying key. > > > > Yeah, we weren't 100% enforcing this rule and it's a bit messy now :/ > > Unless you feel very strongly about this, it seems ok to me to allow > "boolean options" (similarly to boolean --flag args) as a stand-alone > set of tags. bpftool invocations are already very verbose, no need to > add to that. Plus it also makes bash-completion simpler, it's always > good not to complicate bash script unnecessarily :) It's more of a question if we're goin
Re: [PATCH bpf-next v2 2/3] libbpf: add bpf_object__load_xattr() API function to pass log_level
On 5/23/19 9:19 AM, Yonghong Song wrote: > > > On 5/23/19 3:54 AM, Quentin Monnet wrote: >> libbpf was recently made aware of the log_level attribute for programs, >> used to specify the level of information expected to be dumped by the >> verifier. >> >> Create an API function to pass additional attributes when loading a >> bpf_object, so we can set this log_level value in programs when loading >> them, and so that so that applications relying on libbpf but not calling > "so that so that" => "so that" >> bpf_prog_load_xattr() can also use that feature. > > Do not fully understand the above statement. From the code below, > I did not see how the non-zero log_level can be set for bpf_program > without bpf_prog_load_xattr(). Maybe I miss something? Looks like next patch uses it when -d is specified. Probably commit message can be made more clear. > >> >> v2: >> - We are in a new cycle, bump libbpf extraversion number. >> >> Signed-off-by: Quentin Monnet >> Reviewed-by: Jakub Kicinski >> --- >> tools/lib/bpf/Makefile | 2 +- >> tools/lib/bpf/libbpf.c | 20 +--- >> tools/lib/bpf/libbpf.h | 6 ++ >> tools/lib/bpf/libbpf.map | 5 + >> 4 files changed, 29 insertions(+), 4 deletions(-) >> >> diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile >> index a2aceadf68db..9312066a1ae3 100644 >> --- a/tools/lib/bpf/Makefile >> +++ b/tools/lib/bpf/Makefile >> @@ -3,7 +3,7 @@ >> BPF_VERSION = 0 >> BPF_PATCHLEVEL = 0 >> -BPF_EXTRAVERSION = 3 >> +BPF_EXTRAVERSION = 4 >> MAKEFLAGS += --no-print-directory >> diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c >> index 197b574406b3..1c6fb7a3201e 100644 >> --- a/tools/lib/bpf/libbpf.c >> +++ b/tools/lib/bpf/libbpf.c >> @@ -,7 +,7 @@ static bool >> bpf_program__is_function_storage(struct bpf_program *prog, >> } >> static int >> -bpf_object__load_progs(struct bpf_object *obj) >> +bpf_object__load_progs(struct bpf_object *obj, int log_level) >> { >> size_t i; >> int err; >> @@ -2230,6 +2230,7 @@ bpf_object__load_progs(struct bpf_object *obj) >> for (i = 0; i < obj->nr_programs; i++) { >> if (bpf_program__is_function_storage(&obj->programs[i], obj)) >> continue; >> + obj->programs[i].log_level = log_level; >> err = bpf_program__load(&obj->programs[i], >> obj->license, >> obj->kern_version); >> @@ -2381,10 +2382,14 @@ int bpf_object__unload(struct bpf_object *obj) >> return 0; >> } >> -int bpf_object__load(struct bpf_object *obj) >> +int bpf_object__load_xattr(struct bpf_object_load_attr *attr) >> { >> + struct bpf_object *obj; >> int err; >> + if (!attr) >> + return -EINVAL; >> + obj = attr->obj; >> if (!obj) >> return -EINVAL; >> @@ -2397,7 +2402,7 @@ int bpf_object__load(struct bpf_object *obj) >> CHECK_ERR(bpf_object__create_maps(obj), err, out); >> CHECK_ERR(bpf_object__relocate(obj), err, out); >> - CHECK_ERR(bpf_object__load_progs(obj), err, out); >> + CHECK_ERR(bpf_object__load_progs(obj, attr->log_level), err, out); >> return 0; >> out: >> @@ -2406,6 +2411,15 @@ int bpf_object__load(struct bpf_object *obj) >> return err; >> } >> +int bpf_object__load(struct bpf_object *obj) >> +{ >> + struct bpf_object_load_attr attr = { >> + .obj = obj, >> + }; >> + >> + return bpf_object__load_xattr(&attr); >> +} >> + >> static int check_path(const char *path) >> { >> char *cp, errmsg[STRERR_BUFSIZE]; >> diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h >> index c5ff00515ce7..e1c748db44f6 100644 >> --- a/tools/lib/bpf/libbpf.h >> +++ b/tools/lib/bpf/libbpf.h >> @@ -89,8 +89,14 @@ LIBBPF_API int bpf_object__unpin_programs(struct >> bpf_object *obj, >> LIBBPF_API int bpf_object__pin(struct bpf_object *object, const char >> *path); >> LIBBPF_API void bpf_object__close(struct bpf_object *object); >> +struct bpf_object_load_attr { >> + struct bpf_object *obj; >> + int log_level; >> +}; >> + >> /* Load/unload object into/from kernel */ >> LIBBPF_API int bpf_object__load(struct bpf_object *obj); >> +LIBBPF_API int bpf_object__load_xattr(struct bpf_object_load_attr >> *attr); >> LIBBPF_API int bpf_object__unload(struct bpf_object *obj); >> LIBBPF_API const char *bpf_object__name(struct bpf_object *obj); >> LIBBPF_API unsigned int bpf_object__kversion(struct bpf_object *obj); >> diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map >> index 673001787cba..6ce61fa0baf3 100644 >> --- a/tools/lib/bpf/libbpf.map >> +++ b/tools/lib/bpf/libbpf.map >> @@ -164,3 +164,8 @@ LIBBPF_0.0.3 { >> bpf_map_freeze; >> btf__finalize_data; >> } LIBBPF_0.0.2; >> + >> +LIBBPF_0.0.4 { >> + global: >> + bpf_object__load_xattr; >> +} LIBBPF_0.0.3; >>
Re: [PATCH v3 net-next 0/3] flow_offload: Re-add per-action statistics
On Thu, 23 May 2019 17:21:49 +0100, Edward Cree wrote: > On 22/05/2019 23:20, Jakub Kicinski wrote: > > On Wed, 22 May 2019 22:37:16 +0100, Edward Cree wrote: > >> * removed RFC tags > > Why? There is still no upstream user for this > Well, patch #2 updates drivers to the changed API, which is kinda an > "upstream user" if you squint... admittedly patch #1 is a bit dubious > in that regard; Both 1 and 2 are dubious if you ask me. Complexity added for no upstream gain. 3 is a good patch, perhaps worth posting it separately rather than keeping it hostage to the rest of the series? :) Side note - it's not clear why open code the loop in every driver rather than having flow_stats_update() handle the looping?
Re: [PATCH net-next] cxgb4: use firmware API for validating filter spec
From: Raju Rangoju Date: Thu, 23 May 2019 19:21:21 +0530 > Adds support for validating hardware filter spec configured in firmware > before offloading exact match flows. > > Use the new fw api FW_PARAM_DEV_FILTER_MODE_MASK to read the filter mode > and mask from firmware. If the api isn't supported, then fall-back to > older way of reading just the mode from indirect register. > > Signed-off-by: Raju Rangoju Applied.
Re: [PATCH net-next] Revert "dpaa2-eth: configure the cache stashing amount on a queue"
From: Ioana Radulescu Date: Thu, 23 May 2019 17:38:22 +0300 > This reverts commit f8b995853444aba9c16c1ccdccdd397527fde96d. > > The reverted change instructed the QMan hardware block to fetch > RX frame annotation and beginning of frame data to cache before > the core would read them. > > It turns out that in rare cases, it's possible that a QMan > stashing transaction is delayed long enough such that, by the time > it gets executed, the frame in question had already been dequeued > by the core and software processing began on it. If the core > manages to unmap the frame buffer _before_ the stashing transaction > is executed, an SMMU exception will be raised. > > Unfortunately there is no easy way to work around this while keeping > the performance advantages brought by QMan stashing, so disable > it altogether. > > Signed-off-by: Ioana Radulescu Applied.
Re: [PATCH bpf-next v2 3/3] tools: bpftool: make -d option print debug output from verifier
On 5/23/19 3:54 AM, Quentin Monnet wrote: > The "-d" option is used to require all logs available for bpftool. So > far it meant telling libbpf to print even debug-level information. But > there is another source of info that can be made more verbose: when we > attemt to load programs with bpftool, we can pass a log_level parameter > to the verifier in order to control the amount of information that is > printed to the console. > > Reuse the "-d" option to print all information the verifier can tell. At > this time, this means logs related to BPF_LOG_LEVEL1, BPF_LOG_LEVEL2 and > BPF_LOG_STATS. As mentioned in the discussion on the first version of > this set, these macros are internal to the kernel > (include/linux/bpf_verifier.h) and are not meant to be part of the > stable user API, therefore we simply use the related constants to print > whatever we can at this time, without trying to tell users what is > log_level1 or what is statistics. > > Verifier logs are only used when loading programs for now (in the > future: for loading BTF objects with bpftool?), so bpftool.rst and > bpftool-prog.rst are the only man pages to get the update. The current BTF error log print out at warning level. So by default, it should print out error logs unless it is suppressed explicitly. int btf__load(struct btf *btf) { __u32 log_buf_size = BPF_LOG_BUF_SIZE; char *log_buf = NULL; int err = 0; if (btf->fd >= 0) return -EEXIST; log_buf = malloc(log_buf_size); if (!log_buf) return -ENOMEM; *log_buf = 0; btf->fd = bpf_load_btf(btf->data, btf->data_size, log_buf, log_buf_size, false); if (btf->fd < 0) { err = -errno; pr_warning("Error loading BTF: %s(%d)\n", strerror(errno), errno); if (*log_buf) pr_warning("%s\n", log_buf); goto done; } done: free(log_buf); return err; } > > v2: > - Remove the possibility to select the log levels to use (v1 offered a >combination of "log_level1", "log_level2" and "stats"). > - The macros from kernel header bpf_verifier.h are not used (and >therefore not moved to UAPI header). > - In v1 this was a distinct option, but is now merged in the only "-d" >switch to activate libbpf and verifier debug-level logs all at the >same time. > > Signed-off-by: Quentin Monnet > Reviewed-by: Jakub Kicinski > --- > .../bpftool/Documentation/bpftool-prog.rst| 5 ++-- > tools/bpf/bpftool/Documentation/bpftool.rst | 5 ++-- > tools/bpf/bpftool/main.c | 2 ++ > tools/bpf/bpftool/main.h | 1 + > tools/bpf/bpftool/prog.c | 27 --- > 5 files changed, 26 insertions(+), 14 deletions(-) > > diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst > b/tools/bpf/bpftool/Documentation/bpftool-prog.rst > index 9a92614569e6..228a5c863cc7 100644 > --- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst > +++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst > @@ -175,8 +175,9 @@ OPTIONS > (such as tracefs or BPF virtual file system) when necessary. > > -d, --debug > - Print all logs available from libbpf, including debug-level > - information. > + Print all logs available, even debug-level information. This > + includes logs from libbpf as well as from the verifier, when > + attempting to load programs. > > EXAMPLES > > diff --git a/tools/bpf/bpftool/Documentation/bpftool.rst > b/tools/bpf/bpftool/Documentation/bpftool.rst > index 43dba0717953..6a9c52ef84a9 100644 > --- a/tools/bpf/bpftool/Documentation/bpftool.rst > +++ b/tools/bpf/bpftool/Documentation/bpftool.rst > @@ -67,8 +67,9 @@ OPTIONS > (such as tracefs or BPF virtual file system) when necessary. > > -d, --debug > - Print all logs available from libbpf, including debug-level > - information. > + Print all logs available, even debug-level information. This > + includes logs from libbpf as well as from the verifier, when > + attempting to load programs. > > SEE ALSO > > diff --git a/tools/bpf/bpftool/main.c b/tools/bpf/bpftool/main.c > index d74293938a05..4879f6395c7e 100644 > --- a/tools/bpf/bpftool/main.c > +++ b/tools/bpf/bpftool/main.c > @@ -26,6 +26,7 @@ bool pretty_output; > bool json_output; > bool show_pinned; > bool block_mount; > +bool verifier_logs; > int bpf_flags; > struct pinned_obj_table prog_table; > struct pinned_obj_table map_table; > @@ -373,6 +374,7 @@ int main(int argc, char **argv) > break; > case 'd': > libbpf_set_print(print_all_levels); > + ver
Re: [PATCH net] cxgb4: offload VLAN flows regardless of VLAN ethtype
From: Raju Rangoju Date: Thu, 23 May 2019 20:41:44 +0530 > VLAN flows never get offloaded unless ivlan_vld is set in filter spec. > It's not compulsory for vlan_ethtype to be set. > > So, always enable ivlan_vld bit for offloading VLAN flows regardless of > vlan_ethtype is set or not. > > Fixes: ad9af3e09c (cxgb4: add tc flower match support for vlan) > Signed-off-by: Raju Rangoju Applied and queued up for -stable.
Re: [PATCH v3 net-next 0/3] flow_offload: Re-add per-action statistics
On 23/05/2019 17:11, Jakub Kicinski wrote: > On Thu, 23 May 2019 09:19:49 -0400, Jamal Hadi Salim wrote: >> That would still work here, no? There will be some latency >> based on the frequency of hardware->kernel stats updates. > I don't think so, I think the stats are only updated on classifier > dumps in Ed's code. Yep currently that's the case, but not as an inherent restriction (see my other mail). > But we can't be 100% sure without seeing driver code. Would it help if I posted my driver code to the list? It's gonna be upstream eventually anyway, it's just that the driver as a whole isn't really in a shape to be merged just yet (mainly 'cos the hardware folks are planning some breaking changes). But I can post my TC handling code, or even the whole driver, if demonstrating how these interfaces can be used will help matters. -Ed
[PATCH net-next] selftests/net: SO_TXTIME with ETF and FQ
The SO_TXTIME API enables packet tranmission with delayed delivery. This is currently supported by the ETF and FQ packet schedulers. Evaluate the interface with both schedulers. Install the scheduler and send a variety of packets streams: without delay, with one delayed packet, with multiple ordered delays and with reordering. Verify that packets are released by the scheduler in expected order. The ETF qdisc requires a timestamp in the future on every packet. It needs a delay on the qdisc else the packet is dropped on dequeue for having a delivery time in the past. The test value is experimentally derived. ETF requires clock_id CLOCK_TAI. It checks this base and drops for non-conformance. The FQ qdisc expects clock_id CLOCK_MONOTONIC, the base used by TCP as of commit fb420d5d91c1 ("tcp/fq: move back to CLOCK_MONOTONIC"). Within a flow there is an expecation of ordered delivery, as shown by delivery times of test 4. The FQ qdisc does not require all packets to have timestamps and does not drop for non-conformance. The large (msec) delays are chosen to avoid flakiness. Output: SO_TXTIME ipv6 clock monolithic payload:a delay:33 expected:0 (us) SO_TXTIME ipv4 clock monolithic payload:a delay:44 expected:0 (us) SO_TXTIME ipv6 clock monolithic payload:a delay:10049 expected:1 (us) SO_TXTIME ipv4 clock monolithic payload:a delay:10105 expected:1 (us) [.. etc ..] OK. All tests passed Signed-off-by: Willem de Bruijn --- tools/testing/selftests/net/.gitignore | 1 + tools/testing/selftests/net/Makefile | 3 +- tools/testing/selftests/net/config | 2 + tools/testing/selftests/net/so_txtime.c | 297 +++ tools/testing/selftests/net/so_txtime.sh | 31 +++ 5 files changed, 333 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/net/so_txtime.c create mode 100755 tools/testing/selftests/net/so_txtime.sh diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore index 6f81130605d7d..27ef4d07ac915 100644 --- a/tools/testing/selftests/net/.gitignore +++ b/tools/testing/selftests/net/.gitignore @@ -17,3 +17,4 @@ tcp_inq tls txring_overwrite ip_defrag +so_txtime diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 1e6d14d2825cc..8af7869e0f1c8 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -9,12 +9,13 @@ TEST_PROGS := run_netsocktests run_afpackettests test_bpf.sh netdevice.sh \ TEST_PROGS += fib_tests.sh fib-onlink-tests.sh pmtu.sh udpgso.sh ip_defrag.sh TEST_PROGS += udpgso_bench.sh fib_rule_tests.sh msg_zerocopy.sh psock_snd.sh TEST_PROGS += udpgro_bench.sh udpgro.sh test_vxlan_under_vrf.sh reuseport_addr_any.sh -TEST_PROGS += test_vxlan_fdb_changelink.sh +TEST_PROGS += test_vxlan_fdb_changelink.sh so_txtime.sh TEST_PROGS_EXTENDED := in_netns.sh TEST_GEN_FILES = socket TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy reuseport_addr_any TEST_GEN_FILES += tcp_mmap tcp_inq psock_snd txring_overwrite TEST_GEN_FILES += udpgso udpgso_bench_tx udpgso_bench_rx ip_defrag +TEST_GEN_FILES += so_txtime TEST_GEN_PROGS = reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa TEST_GEN_PROGS += reuseport_dualstack reuseaddr_conflict tls diff --git a/tools/testing/selftests/net/config b/tools/testing/selftests/net/config index 4740404486018..89f84b5118bfa 100644 --- a/tools/testing/selftests/net/config +++ b/tools/testing/selftests/net/config @@ -25,3 +25,5 @@ CONFIG_NF_TABLES_IPV6=y CONFIG_NF_TABLES_IPV4=y CONFIG_NFT_CHAIN_NAT_IPV6=m CONFIG_NFT_CHAIN_NAT_IPV4=m +CONFIG_NET_SCH_FQ=m +CONFIG_NET_SCH_ETF=m diff --git a/tools/testing/selftests/net/so_txtime.c b/tools/testing/selftests/net/so_txtime.c new file mode 100644 index 0..d5e9f6807be43 --- /dev/null +++ b/tools/testing/selftests/net/so_txtime.c @@ -0,0 +1,297 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Test the SO_TXTIME API + * + * Takes two streams of { payload, delivery time }[], one input and one output. + * Sends the input stream and verifies arrival matches the output stream. + * The two streams can differ due to out-of-order delivery and drops. + */ + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int cfg_clockid = CLOCK_TAI; +static boolcfg_do_ipv4; +static boolcfg_do_ipv6; +static uint16_tcfg_port= 8000; +static int cfg_variance_us = 2000; + +static uint64_t glob_tstart; + +/* encode one timed transmission (of a 1B payload) */ +struct timed_send { + chardata; + int64_t delay_us; +}; + +#define MAX_NUM_PKT8 +static struct timed_send cfg_in[MAX_NUM_PKT]; +static struct timed_send cfg_out[MAX_NUM_PKT]; +static int cfg_num_pkt; + +static uint64_t gett