date:20190523

[PATCH v2] samples: bpf: fix style in bpf_load

2019-05-23 Thread Daniel T. Lee

This commit fixes style problem in samples/bpf/bpf_load.c

Styles that have been changed are:
 - Magic string use of 'DEBUGFS'
 - Useless zero initialization of a global variable
 - Minor style fix with whitespace

Signed-off-by: Daniel T. Lee 
---
Changes in v2:
  - Fix string concatenation from build-time to compile-time.

 samples/bpf/bpf_load.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index eae7b635343d..1734ade04f7f 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -40,7 +40,7 @@ int prog_cnt;
 int prog_array_fd = -1;
 
 struct bpf_map_data map_data[MAX_MAPS];
-int map_data_count = 0;
+int map_data_count;
 
 static int populate_prog_array(const char *event, int prog_fd)
 {
@@ -65,7 +65,7 @@ static int write_kprobe_events(const char *val)
else
flags = O_WRONLY | O_APPEND;
 
-   fd = open("/sys/kernel/debug/tracing/kprobe_events", flags);
+   fd = open(DEBUGFS "kprobe_events", flags);
 
ret = write(fd, val, strlen(val));
close(fd);
@@ -490,8 +490,8 @@ static int load_elf_maps_section(struct bpf_map_data *maps, 
int maps_shndx,
 
/* Verify no newer features were requested */
if (validate_zero) {
-   addr = (unsigned char*) def + map_sz_copy;
-   end  = (unsigned char*) def + map_sz_elf;
+   addr = (unsigned char *) def + map_sz_copy;
+   end  = (unsigned char *) def + map_sz_elf;
for (; addr < end; addr++) {
if (*addr != 0) {
free(sym);
-- 
2.17.1

[RESEND][PATCH] Fix MACsec kernel panics, oopses and bugs

2019-05-23 Thread Andreas Steinmetz

MACsec causes oopses followed by a kernel panic when attached directly or 
indirectly to a bridge. It causes erroneous
checksum messages when attached to vxlan. When I did investigate I did find skb 
leaks, apparent skb mis-handling and
superfluous code. The attached patch fixes all MACsec misbehaviour I could 
find. As I am no kernel developer somebody
with sufficient kernel network knowledge should verify and correct the patch 
where necessary.

Signed-off-by: Andreas Steinmetz 

--- linux.orig/drivers/net/macsec.c 2019-05-17 11:00:13.631121950 +0200
+++ linux/drivers/net/macsec.c  2019-05-17 18:41:41.333119772 +0200
@@ -911,6 +911,9 @@ static void macsec_decrypt_done(struct c
macsec_extra_len(macsec_skb_cb(skb)->has_sci));
macsec_reset_skb(skb, macsec->secy.netdev);
 
+   /* FIXME: any better way to prevent calls to netdev_rx_csum_fault? */
+   skb->csum_complete_sw = 1;
+
len = skb->len;
if (gro_cells_receive(&macsec->gro_cells, skb) == NET_RX_SUCCESS)
count_rx(dev, len);
@@ -938,9 +941,6 @@ static struct sk_buff *macsec_decrypt(st
u16 icv_len = secy->icv_len;
 
macsec_skb_cb(skb)->valid = false;
-   skb = skb_share_check(skb, GFP_ATOMIC);
-   if (!skb)
-   return ERR_PTR(-ENOMEM);
 
ret = skb_cow_data(skb, 0, &trailer);
if (unlikely(ret < 0)) {
@@ -972,11 +972,6 @@ static struct sk_buff *macsec_decrypt(st
 
aead_request_set_crypt(req, sg, sg, len, iv);
aead_request_set_ad(req, 
macsec_hdr_len(macsec_skb_cb(skb)->has_sci));
-   skb = skb_unshare(skb, GFP_ATOMIC);
-   if (!skb) {
-   aead_request_free(req);
-   return ERR_PTR(-ENOMEM);
-   }
} else {
/* integrity only: all headers + data authenticated */
aead_request_set_crypt(req, sg, sg, icv_len, iv);
@@ -1102,20 +1097,12 @@ static rx_handler_result_t macsec_handle
return RX_HANDLER_PASS;
}
 
-   skb = skb_unshare(skb, GFP_ATOMIC);
-   if (!skb) {
-   *pskb = NULL;
-   return RX_HANDLER_CONSUMED;
-   }
-
pulled_sci = pskb_may_pull(skb, macsec_extra_len(true));
if (!pulled_sci) {
if (!pskb_may_pull(skb, macsec_extra_len(false)))
goto drop_direct;
}
 
-   hdr = macsec_ethhdr(skb);
-
/* Frames with a SecTAG that has the TCI E bit set but the C
 * bit clear are discarded, as this reserved encoding is used
 * to identify frames with a SecTAG that are not to be
@@ -1130,6 +1117,12 @@ static rx_handler_result_t macsec_handle
goto drop_direct;
}
 
+   skb = skb_unshare(skb, GFP_ATOMIC);
+   if (!skb)
+   return RX_HANDLER_CONSUMED;
+
+   hdr = macsec_ethhdr(skb);
+
/* ethernet header is part of crypto processing */
skb_push(skb, ETH_HLEN);
 
@@ -1213,22 +1206,22 @@ static rx_handler_result_t macsec_handle
 
/* Disabled && !changed text => skip validation */
if (hdr->tci_an & MACSEC_TCI_C ||
-   secy->validate_frames != MACSEC_VALIDATE_DISABLED)
+   secy->validate_frames != MACSEC_VALIDATE_DISABLED) {
skb = macsec_decrypt(skb, dev, rx_sa, sci, secy);
 
-   if (IS_ERR(skb)) {
-   /* the decrypt callback needs the reference */
-   if (PTR_ERR(skb) != -EINPROGRESS) {
-   macsec_rxsa_put(rx_sa);
-   macsec_rxsc_put(rx_sc);
+   if (IS_ERR(skb)) {
+   /* the decrypt callback needs the reference */
+   if (PTR_ERR(skb) != -EINPROGRESS) {
+   macsec_rxsa_put(rx_sa);
+   macsec_rxsc_put(rx_sc);
+   }
+   rcu_read_unlock();
+   return RX_HANDLER_CONSUMED;
}
-   rcu_read_unlock();
-   *pskb = NULL;
-   return RX_HANDLER_CONSUMED;
-   }
 
-   if (!macsec_post_decrypt(skb, secy, pn))
-   goto drop;
+   if (!macsec_post_decrypt(skb, secy, pn))
+   goto drop;
+   }
 
 deliver:
macsec_finalize_skb(skb, secy->icv_len,
@@ -1239,6 +1232,9 @@ deliver:
macsec_rxsa_put(rx_sa);
macsec_rxsc_put(rx_sc);
 
+   /* FIXME: any better way to prevent calls to netdev_rx_csum_fault? */
+   skb->csum_complete_sw = 1;
+
ret = gro_cells_receive(&macsec->gro_cells, skb);
if (ret == NET_RX_SUCCESS)
count_rx(dev, skb->len);
@@ -1247,7 +1243,6 @@ deliver:
 
rcu_read_unlock();
 
-   *pskb = NULL;
return RX_HANDLER_CONSUMED;
 
 drop:
@@ -1257,7 +1252,6 @@ drop_nosa:
rcu_read_unlock();
 drop_direct:
kfree_skb(skb);
-

Re: [PATCH bpf] selftests: bpf: add zero extend checks for ALU32 and/or/xor

2019-05-23 Thread Björn Töpel

On Thu, 23 May 2019 at 08:39, Y Song  wrote:
>
> On Wed, May 22, 2019 at 1:46 PM Björn Töpel  wrote:
> >
> > On Wed, 22 May 2019 at 20:13, Y Song  wrote:
> > >
> > > On Wed, May 22, 2019 at 2:25 AM Björn Töpel  wrote:
> > > >
> > > > Add three tests to test_verifier/basic_instr that make sure that the
> > > > high 32-bits of the destination register is cleared after an ALU32
> > > > and/or/xor.
> > > >
> > > > Signed-off-by: Björn Töpel 
> > >
> > > I think the patch intends for bpf-next, right? The patch itself looks
> > > good to me.
> > > Acked-by: Yonghong Song 
> > >
> >
> > Thank you. Actually, it was intended for the bpf tree, as a test
> > follow up for this [1] fix.
> Then maybe you want to add a Fixes tag and resubmit?

Hmm, I thought that adding tests were OK for non-next. Should the
Fixes: tag for the test reflex the corresponding fixed code (in this
case the RV JIT)?

> >
> >
> > Cheers,
> > Björn
> >
> > [1] 
> > https://lore.kernel.org/bpf/caj+hfnifkxkz8df7glbuqwa6+t6awrrrk6ow6m1nayetjd+...@mail.gmail.com/
> >
> > > > ---
> > > >  .../selftests/bpf/verifier/basic_instr.c  | 39 +++
> > > >  1 file changed, 39 insertions(+)
> > > >
> > > > diff --git a/tools/testing/selftests/bpf/verifier/basic_instr.c 
> > > > b/tools/testing/selftests/bpf/verifier/basic_instr.c
> > > > index ed91a7b9a456..4d844089938e 100644
> > > > --- a/tools/testing/selftests/bpf/verifier/basic_instr.c
> > > > +++ b/tools/testing/selftests/bpf/verifier/basic_instr.c
> > > > @@ -132,3 +132,42 @@
> > > > .prog_type = BPF_PROG_TYPE_SCHED_CLS,
> > > > .result = ACCEPT,
> > > >  },
> > > > +{
> > > > +   "and32 reg zero extend check",
> > > > +   .insns = {
> > > > +   BPF_MOV64_IMM(BPF_REG_0, -1),
> > > > +   BPF_MOV64_IMM(BPF_REG_2, -2),
> > > > +   BPF_ALU32_REG(BPF_AND, BPF_REG_0, BPF_REG_2),
> > > > +   BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, 32),
> > > > +   BPF_EXIT_INSN(),
> > > > +   },
> > > > +   .prog_type = BPF_PROG_TYPE_SCHED_CLS,
> > > > +   .result = ACCEPT,
> > > > +   .retval = 0,
> > > > +},
> > > > +{
> > > > +   "or32 reg zero extend check",
> > > > +   .insns = {
> > > > +   BPF_MOV64_IMM(BPF_REG_0, -1),
> > > > +   BPF_MOV64_IMM(BPF_REG_2, -2),
> > > > +   BPF_ALU32_REG(BPF_OR, BPF_REG_0, BPF_REG_2),
> > > > +   BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, 32),
> > > > +   BPF_EXIT_INSN(),
> > > > +   },
> > > > +   .prog_type = BPF_PROG_TYPE_SCHED_CLS,
> > > > +   .result = ACCEPT,
> > > > +   .retval = 0,
> > > > +},
> > > > +{
> > > > +   "xor32 reg zero extend check",
> > > > +   .insns = {
> > > > +   BPF_MOV64_IMM(BPF_REG_0, -1),
> > > > +   BPF_MOV64_IMM(BPF_REG_2, 0),
> > > > +   BPF_ALU32_REG(BPF_XOR, BPF_REG_0, BPF_REG_2),
> > > > +   BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, 32),
> > > > +   BPF_EXIT_INSN(),
> > > > +   },
> > > > +   .prog_type = BPF_PROG_TYPE_SCHED_CLS,
> > > > +   .result = ACCEPT,
> > > > +   .retval = 0,
> > > > +},
> > > > --
> > > > 2.20.1
> > > >

Re: [PATCH net-next] selftests: pmtu: Simplify cleanup and namespace names

2019-05-23 Thread Stefano Brivio

Hi David,

On Wed, 22 May 2019 12:11:06 -0700
David Ahern  wrote:

> From: David Ahern 
> 
> The point of the pause-on-fail argument is to leave the setup as is after
> a test fails to allow a user to debug why it failed. Move the cleanup
> after posting the result to the user to make it so.
> 
> Random names for the namespaces are not user friendly when trying to
> debug a failure. Make them simpler and more direct for the tests. Run
> cleanup at the beginning to ensure they are cleaned up if they already
> exist.

The reasons for picking per-instance unique names were:

- one can run multiple instances of the script in parallel. I
  couldn't trigger any bug this way *so far*, though

- cleanup might fail because of e.g. device reference count leaks (this
  happened quite frequently in the past), which are anyway visible in
  kernel logs. Unique names avoid the need to reboot

Sure, it's a trade-off with usability, and I also see the value of
having fixed names, so I'm fine with this too. I just wanted to make
sure you considered these points.

By the way, the comment to nsname() (that I would keep, it's still
somewhat convenient) is now inconsistent.

-- 
Stefano

Re: [PATCH net,stable 1/1] net: fec: fix the clk mismatch in failed_reset path

2019-05-23 Thread Baruch Siach

Hi Andy,

On Thu, May 23, 2019 at 01:55:28AM +, Andy Duan wrote:
> Fix the clk mismatch in the error path "failed_reset" because
> below error path will disable clk_ahb and clk_ipg directly, it
> should use pm_runtime_put_noidle() instead of pm_runtime_put()
> to avoid to call runtime resume callback.
> 
> Reported-by: Baruch Siach 
> Signed-off-by: Fugang Duan 

Tested-by: Baruch Siach 

Tested on SolidRun Hummingboard Pulse.

Thanks.

But please avoid sending patched in base64 encoded emails. Plaintext is much 
easier when dealing with 'git am'.

baruch

> ---
>  drivers/net/ethernet/freescale/fec_main.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/net/ethernet/freescale/fec_main.c 
> b/drivers/net/ethernet/freescale/fec_main.c
> index f63eb2b..848defa 100644
> --- a/drivers/net/ethernet/freescale/fec_main.c
> +++ b/drivers/net/ethernet/freescale/fec_main.c
> @@ -3555,7 +3555,7 @@ fec_probe(struct platform_device *pdev)
>   if (fep->reg_phy)
>   regulator_disable(fep->reg_phy);
>  failed_reset:
> - pm_runtime_put(&pdev->dev);
> + pm_runtime_put_noidle(&pdev->dev);
>   pm_runtime_disable(&pdev->dev);
>  failed_regulator:
>   clk_disable_unprepare(fep->clk_ahb);

-- 
 http://baruch.siach.name/blog/  ~. .~   Tk Open Systems
=}ooO--U--Ooo{=
   - bar...@tkos.co.il - tel: +972.2.679.5364, http://www.tkos.co.il -

[patch net-next v2] devlink: add warning in case driver does not set port type

2019-05-23 Thread Jiri Pirko

From: Jiri Pirko 

Prevent misbehavior of drivers who would not set port type for longer
period of time. Drivers should always set port type. Do WARN if that
happens.

Note that it is perfectly fine to temporarily not have the type set,
during initialization and port type change.

Signed-off-by: Jiri Pirko 
---
v1->v2:
- Don't warn on DSA and CPU ports.
---
 include/net/devlink.h |  2 ++
 net/core/devlink.c| 38 ++
 2 files changed, 40 insertions(+)

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 1c4adfb4195a..151eb930d329 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -17,6 +17,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -64,6 +65,7 @@ struct devlink_port {
enum devlink_port_type desired_type;
void *type_dev;
struct devlink_port_attrs attrs;
+   struct delayed_work type_warn_dw;
 };
 
 struct devlink_sb_pool_info {
diff --git a/net/core/devlink.c b/net/core/devlink.c
index d43bc52b8840..9716a7f382cb 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -21,6 +21,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -5390,6 +5391,38 @@ void devlink_free(struct devlink *devlink)
 }
 EXPORT_SYMBOL_GPL(devlink_free);
 
+static void devlink_port_type_warn(struct work_struct *work)
+{
+   WARN(true, "Type was not set for devlink port.");
+}
+
+static bool devlink_port_type_should_warn(struct devlink_port *devlink_port)
+{
+   /* Ignore CPU and DSA flavours. */
+   return devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_CPU &&
+  devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_DSA;
+}
+
+#define DEVLINK_PORT_TYPE_WARN_TIMEOUT (HZ * 30)
+
+static void devlink_port_type_warn_schedule(struct devlink_port *devlink_port)
+{
+   if (!devlink_port_type_should_warn(devlink_port))
+   return;
+   /* Schedule a work to WARN in case driver does not set port
+* type within timeout.
+*/
+   schedule_delayed_work(&devlink_port->type_warn_dw,
+ DEVLINK_PORT_TYPE_WARN_TIMEOUT);
+}
+
+static void devlink_port_type_warn_cancel(struct devlink_port *devlink_port)
+{
+   if (!devlink_port_type_should_warn(devlink_port))
+   return;
+   cancel_delayed_work_sync(&devlink_port->type_warn_dw);
+}
+
 /**
  * devlink_port_register - Register devlink port
  *
@@ -5419,6 +5452,8 @@ int devlink_port_register(struct devlink *devlink,
list_add_tail(&devlink_port->list, &devlink->port_list);
INIT_LIST_HEAD(&devlink_port->param_list);
mutex_unlock(&devlink->lock);
+   INIT_DELAYED_WORK(&devlink_port->type_warn_dw, &devlink_port_type_warn);
+   devlink_port_type_warn_schedule(devlink_port);
devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
return 0;
 }
@@ -5433,6 +5468,7 @@ void devlink_port_unregister(struct devlink_port 
*devlink_port)
 {
struct devlink *devlink = devlink_port->devlink;
 
+   devlink_port_type_warn_cancel(devlink_port);
devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL);
mutex_lock(&devlink->lock);
list_del(&devlink_port->list);
@@ -5446,6 +5482,7 @@ static void __devlink_port_type_set(struct devlink_port 
*devlink_port,
 {
if (WARN_ON(!devlink_port->registered))
return;
+   devlink_port_type_warn_cancel(devlink_port);
spin_lock(&devlink_port->type_lock);
devlink_port->type = type;
devlink_port->type_dev = type_dev;
@@ -5519,6 +5556,7 @@ EXPORT_SYMBOL_GPL(devlink_port_type_ib_set);
 void devlink_port_type_clear(struct devlink_port *devlink_port)
 {
__devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_NOTSET, NULL);
+   devlink_port_type_warn_schedule(devlink_port);
 }
 EXPORT_SYMBOL_GPL(devlink_port_type_clear);
 
-- 
2.17.2

Re: [PATCH bpf-next 1/2] net: xdp: refactor XDP_QUERY_PROG{,_HW} to netdev

2019-05-23 Thread Toke Høiland-Jørgensen

Jakub Kicinski  writes:

> On Wed, 22 May 2019 22:54:44 +0200, Björn Töpel wrote:
>> > > Now, the same commands give:
>> > >
>> > >   # ip link set dev eth0 xdp obj foo.o sec main
>> > >   # ip link set dev eth0 xdpgeneric off
>> > >   Error: native and generic XDP can't be active at the same time.  
>> >
>> > I'm not clear why this change is necessary? It is a change in
>> > behaviour, and if anything returning ENOENT would seem cleaner
>> > in this case.
>> 
>> To me, the existing behavior was non-intuitive. If most people *don't*
>> agree, I'll remove this change. So, what do people think about this?
>> :-)
>
> Having things start to fail after they were successful/ignored
> is one of those ABI breakage types Linux and netdev usually takes
> pretty seriously, unfortunately.  Especially when motivation is 
> "it's more intuitive" :)
>
> If nobody chimes in please break out this behaviour change into 
> a commit of its own.

Björn and I already had this discussion off-list. I think we ended up
concluding that since it's not changing kernel *behaviour*, but just
making an existing error explicit, it might be acceptable from an ABI
breakage PoV. And I'd generally prefer explicit errors over silent
failures.

But yeah, can totally see why it could also be considered a breaking
change...

-Toke

[patch net-next 2/7] mlx5: Move firmware flash implementation to devlink

2019-05-23 Thread Jiri Pirko

From: Jiri Pirko 

Benefit from the devlink flash update implementation and ethtool
fallback to it and move firmware flash implementation there.

Signed-off-by: Jiri Pirko 
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h  |  2 --
 .../ethernet/mellanox/mlx5/core/en_ethtool.c  | 35 ---
 .../mellanox/mlx5/core/ipoib/ethtool.c|  9 -
 .../net/ethernet/mellanox/mlx5/core/main.c| 20 +++
 4 files changed, 20 insertions(+), 46 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 3a183d690e23..4e417dfe4ee5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -1074,8 +1074,6 @@ u32 mlx5e_ethtool_get_rxfh_key_size(struct mlx5e_priv 
*priv);
 u32 mlx5e_ethtool_get_rxfh_indir_size(struct mlx5e_priv *priv);
 int mlx5e_ethtool_get_ts_info(struct mlx5e_priv *priv,
  struct ethtool_ts_info *info);
-int mlx5e_ethtool_flash_device(struct mlx5e_priv *priv,
-  struct ethtool_flash *flash);
 void mlx5e_ethtool_get_pauseparam(struct mlx5e_priv *priv,
  struct ethtool_pauseparam *pauseparam);
 int mlx5e_ethtool_set_pauseparam(struct mlx5e_priv *priv,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
index dd764e0471f2..ea59097dd4f8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -1867,40 +1867,6 @@ static u32 mlx5e_get_priv_flags(struct net_device 
*netdev)
return priv->channels.params.pflags;
 }
 
-int mlx5e_ethtool_flash_device(struct mlx5e_priv *priv,
-  struct ethtool_flash *flash)
-{
-   struct mlx5_core_dev *mdev = priv->mdev;
-   struct net_device *dev = priv->netdev;
-   const struct firmware *fw;
-   int err;
-
-   if (flash->region != ETHTOOL_FLASH_ALL_REGIONS)
-   return -EOPNOTSUPP;
-
-   err = request_firmware_direct(&fw, flash->data, &dev->dev);
-   if (err)
-   return err;
-
-   dev_hold(dev);
-   rtnl_unlock();
-
-   err = mlx5_firmware_flash(mdev, fw);
-   release_firmware(fw);
-
-   rtnl_lock();
-   dev_put(dev);
-   return err;
-}
-
-static int mlx5e_flash_device(struct net_device *dev,
- struct ethtool_flash *flash)
-{
-   struct mlx5e_priv *priv = netdev_priv(dev);
-
-   return mlx5e_ethtool_flash_device(priv, flash);
-}
-
 #ifndef CONFIG_MLX5_EN_RXNFC
 /* When CONFIG_MLX5_EN_RXNFC=n we only support ETHTOOL_GRXRINGS
  * otherwise this function will be defined from en_fs_ethtool.c
@@ -1939,7 +1905,6 @@ const struct ethtool_ops mlx5e_ethtool_ops = {
 #ifdef CONFIG_MLX5_EN_RXNFC
.set_rxnfc = mlx5e_set_rxnfc,
 #endif
-   .flash_device  = mlx5e_flash_device,
.get_tunable   = mlx5e_get_tunable,
.set_tunable   = mlx5e_set_tunable,
.get_pauseparam= mlx5e_get_pauseparam,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c 
b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c
index 90cb50fe17fd..ebd81f6b556e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c
@@ -122,14 +122,6 @@ static int mlx5i_get_ts_info(struct net_device *netdev,
return mlx5e_ethtool_get_ts_info(priv, info);
 }
 
-static int mlx5i_flash_device(struct net_device *netdev,
- struct ethtool_flash *flash)
-{
-   struct mlx5e_priv *priv = mlx5i_epriv(netdev);
-
-   return mlx5e_ethtool_flash_device(priv, flash);
-}
-
 enum mlx5_ptys_width {
MLX5_PTYS_WIDTH_1X  = 1 << 0,
MLX5_PTYS_WIDTH_2X  = 1 << 1,
@@ -241,7 +233,6 @@ const struct ethtool_ops mlx5i_ethtool_ops = {
.get_ethtool_stats  = mlx5i_get_ethtool_stats,
.get_ringparam  = mlx5i_get_ringparam,
.set_ringparam  = mlx5i_set_ringparam,
-   .flash_device   = mlx5i_flash_device,
.get_channels   = mlx5i_get_channels,
.set_channels   = mlx5i_set_channels,
.get_coalesce   = mlx5i_get_coalesce,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c 
b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 61fa1d162d28..36acd0267a13 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -1210,6 +1210,25 @@ static int mlx5_unload_one(struct mlx5_core_dev *dev, 
bool cleanup)
return err;
 }
 
+static int mlx5_devlink_flash_update(struct devlink *devlink,
+const char *file_name,
+const char *component,
+struct netlink_ext_ack *extack)
+{
+   struct mlx5_core_dev *dev = devlink

[patch net-next 0/7] expose flash update status to user

2019-05-23 Thread Jiri Pirko

From: Jiri Pirko 

When user is flashing device using devlink, he currenly does not see any
information about what is going on, percentages, etc.
Drivers, for example mlxsw and mlx5, have notion about the progress
and what is happening. This patchset exposes this progress
information to userspace.

See this console recording which shows flashing FW on a Mellanox
Spectrum device:
https://asciinema.org/a/247926

Jiri Pirko (7):
  mlxsw: Move firmware flash implementation to devlink
  mlx5: Move firmware flash implementation to devlink
  mlxfw: Propagate error messages through extack
  devlink: allow driver to update progress of flash update
  mlxfw: Introduce status_notify op and call it to notify about the
status
  mlxsw: Implement flash update status notifications
  netdevsim: implement fake flash updating with notifications

 drivers/net/ethernet/mellanox/mlx5/core/en.h  |   2 -
 .../ethernet/mellanox/mlx5/core/en_ethtool.c  |  35 --
 drivers/net/ethernet/mellanox/mlx5/core/fw.c  |   6 +-
 .../mellanox/mlx5/core/ipoib/ethtool.c|   9 --
 .../net/ethernet/mellanox/mlx5/core/main.c|  20 
 .../ethernet/mellanox/mlx5/core/mlx5_core.h   |   3 +-
 drivers/net/ethernet/mellanox/mlxfw/mlxfw.h   |  11 +-
 .../net/ethernet/mellanox/mlxfw/mlxfw_fsm.c   |  57 --
 drivers/net/ethernet/mellanox/mlxsw/core.c|  15 +++
 drivers/net/ethernet/mellanox/mlxsw/core.h|   3 +
 .../net/ethernet/mellanox/mlxsw/spectrum.c|  75 +++--
 drivers/net/netdevsim/dev.c   |  35 ++
 include/net/devlink.h |   8 ++
 include/uapi/linux/devlink.h  |   5 +
 net/core/devlink.c| 102 ++
 15 files changed, 295 insertions(+), 91 deletions(-)

-- 
2.17.2

[patch net-next 1/7] mlxsw: Move firmware flash implementation to devlink

2019-05-23 Thread Jiri Pirko

From: Jiri Pirko 

Benefit from the devlink flash update implementation and ethtool
fallback to it and move firmware flash implementation there.

Signed-off-by: Jiri Pirko 
---
 drivers/net/ethernet/mellanox/mlxsw/core.c| 15 ++
 drivers/net/ethernet/mellanox/mlxsw/core.h|  3 ++
 .../net/ethernet/mellanox/mlxsw/spectrum.c| 49 +--
 3 files changed, 41 insertions(+), 26 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c 
b/drivers/net/ethernet/mellanox/mlxsw/core.c
index 6ee6de7f0160..a992a7c69b45 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.c
@@ -1003,6 +1003,20 @@ static int mlxsw_devlink_core_bus_device_reload(struct 
devlink *devlink,
return err;
 }
 
+static int mlxsw_devlink_flash_update(struct devlink *devlink,
+ const char *file_name,
+ const char *component,
+ struct netlink_ext_ack *extack)
+{
+   struct mlxsw_core *mlxsw_core = devlink_priv(devlink);
+   struct mlxsw_driver *mlxsw_driver = mlxsw_core->driver;
+
+   if (!mlxsw_driver->flash_update)
+   return -EOPNOTSUPP;
+   return mlxsw_driver->flash_update(mlxsw_core, file_name,
+ component, extack);
+}
+
 static const struct devlink_ops mlxsw_devlink_ops = {
.reload = mlxsw_devlink_core_bus_device_reload,
.port_type_set  = mlxsw_devlink_port_type_set,
@@ -1019,6 +1033,7 @@ static const struct devlink_ops mlxsw_devlink_ops = {
.sb_occ_port_pool_get   = mlxsw_devlink_sb_occ_port_pool_get,
.sb_occ_tc_port_bind_get= mlxsw_devlink_sb_occ_tc_port_bind_get,
.info_get   = mlxsw_devlink_info_get,
+   .flash_update   = mlxsw_devlink_flash_update,
 };
 
 static int
diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.h 
b/drivers/net/ethernet/mellanox/mlxsw/core.h
index e3832cb5bdda..a44ad0fb9477 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.h
@@ -284,6 +284,9 @@ struct mlxsw_driver {
   unsigned int sb_index, u16 tc_index,
   enum devlink_sb_pool_type pool_type,
   u32 *p_cur, u32 *p_max);
+   int (*flash_update)(struct mlxsw_core *mlxsw_core,
+   const char *file_name, const char *component,
+   struct netlink_ext_ack *extack);
void (*txhdr_construct)(struct sk_buff *skb,
const struct mlxsw_tx_info *tx_info);
int (*resources_register)(struct mlxsw_core *mlxsw_core);
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c 
b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index dbb425717f5e..861a77538859 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -388,6 +388,27 @@ static int mlxsw_sp_fw_rev_validate(struct mlxsw_sp 
*mlxsw_sp)
return 0;
 }
 
+static int mlxsw_sp_flash_update(struct mlxsw_core *mlxsw_core,
+const char *file_name, const char *component,
+struct netlink_ext_ack *extack)
+{
+   struct mlxsw_sp *mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core);
+   const struct firmware *firmware;
+   int err;
+
+   if (component)
+   return -EOPNOTSUPP;
+
+   err = request_firmware_direct(&firmware, file_name,
+ mlxsw_sp->bus_info->dev);
+   if (err)
+   return err;
+   err = mlxsw_sp_firmware_flash(mlxsw_sp, firmware);
+   release_firmware(firmware);
+
+   return err;
+}
+
 int mlxsw_sp_flow_counter_get(struct mlxsw_sp *mlxsw_sp,
  unsigned int counter_index, u64 *packets,
  u64 *bytes)
@@ -3155,31 +3176,6 @@ mlxsw_sp_port_set_link_ksettings(struct net_device *dev,
return 0;
 }
 
-static int mlxsw_sp_flash_device(struct net_device *dev,
-struct ethtool_flash *flash)
-{
-   struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev);
-   struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
-   const struct firmware *firmware;
-   int err;
-
-   if (flash->region != ETHTOOL_FLASH_ALL_REGIONS)
-   return -EOPNOTSUPP;
-
-   dev_hold(dev);
-   rtnl_unlock();
-
-   err = request_firmware_direct(&firmware, flash->data, &dev->dev);
-   if (err)
-   goto out;
-   err = mlxsw_sp_firmware_flash(mlxsw_sp, firmware);
-   release_firmware(firmware);
-out:
-   rtnl_lock();
-   dev_put(dev);
-   return err;
-}
-
 static int mlxsw_sp_get_module_info(struct net_device *netdev,

[patch net-next 3/7] mlxfw: Propagate error messages through extack

2019-05-23 Thread Jiri Pirko

From: Jiri Pirko 

Currently the error messages are printed to dmesg. Propagate them also
to directly to user doing the flashing through extack.

Signed-off-by: Jiri Pirko 
---
 drivers/net/ethernet/mellanox/mlx5/core/fw.c  |  6 ++--
 .../net/ethernet/mellanox/mlx5/core/main.c|  2 +-
 .../ethernet/mellanox/mlx5/core/mlx5_core.h   |  3 +-
 drivers/net/ethernet/mellanox/mlxfw/mlxfw.h   |  7 ++--
 .../net/ethernet/mellanox/mlxfw/mlxfw_fsm.c   | 33 +--
 .../net/ethernet/mellanox/mlxsw/spectrum.c| 10 +++---
 6 files changed, 41 insertions(+), 20 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw.c 
b/drivers/net/ethernet/mellanox/mlx5/core/fw.c
index 1ab6f7e3bec6..e8fedb307b2c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fw.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fw.c
@@ -552,7 +552,8 @@ static const struct mlxfw_dev_ops mlx5_mlxfw_dev_ops = {
 };
 
 int mlx5_firmware_flash(struct mlx5_core_dev *dev,
-   const struct firmware *firmware)
+   const struct firmware *firmware,
+   struct netlink_ext_ack *extack)
 {
struct mlx5_mlxfw_dev mlx5_mlxfw_dev = {
.mlxfw_dev = {
@@ -571,5 +572,6 @@ int mlx5_firmware_flash(struct mlx5_core_dev *dev,
return -EOPNOTSUPP;
}
 
-   return mlxfw_firmware_flash(&mlx5_mlxfw_dev.mlxfw_dev, firmware);
+   return mlxfw_firmware_flash(&mlx5_mlxfw_dev.mlxfw_dev,
+   firmware, extack);
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c 
b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 36acd0267a13..52fcfc17fcc6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -1226,7 +1226,7 @@ static int mlx5_devlink_flash_update(struct devlink 
*devlink,
if (err)
return err;
 
-   return mlx5_firmware_flash(dev, fw);
+   return mlx5_firmware_flash(dev, fw, extack);
 }
 
 static const struct devlink_ops mlx5_devlink_ops = {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h 
b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
index 22e69d4813e4..d4dd8c1ae55c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
@@ -184,7 +184,8 @@ int mlx5_set_mtppse(struct mlx5_core_dev *mdev, u8 pin, u8 
arm, u8 mode);
MLX5_CAP_MCAM_FEATURE((mdev), mtpps_fs) &&  \
MLX5_CAP_MCAM_FEATURE((mdev), 
mtpps_enh_out_per_adj))
 
-int mlx5_firmware_flash(struct mlx5_core_dev *dev, const struct firmware *fw);
+int mlx5_firmware_flash(struct mlx5_core_dev *dev, const struct firmware *fw,
+   struct netlink_ext_ack *extack);
 
 void mlx5e_init(void);
 void mlx5e_cleanup(void);
diff --git a/drivers/net/ethernet/mellanox/mlxfw/mlxfw.h 
b/drivers/net/ethernet/mellanox/mlxfw/mlxfw.h
index 14c0c62f8e73..83286b90593f 100644
--- a/drivers/net/ethernet/mellanox/mlxfw/mlxfw.h
+++ b/drivers/net/ethernet/mellanox/mlxfw/mlxfw.h
@@ -5,6 +5,7 @@
 #define _MLXFW_H
 
 #include 
+#include 
 
 enum mlxfw_fsm_state {
MLXFW_FSM_STATE_IDLE,
@@ -67,11 +68,13 @@ struct mlxfw_dev {
 
 #if IS_REACHABLE(CONFIG_MLXFW)
 int mlxfw_firmware_flash(struct mlxfw_dev *mlxfw_dev,
-const struct firmware *firmware);
+const struct firmware *firmware,
+struct netlink_ext_ack *extack);
 #else
 static inline
 int mlxfw_firmware_flash(struct mlxfw_dev *mlxfw_dev,
-const struct firmware *firmware)
+const struct firmware *firmware,
+struct netlink_ext_ack *extack)
 {
return -EOPNOTSUPP;
 }
diff --git a/drivers/net/ethernet/mellanox/mlxfw/mlxfw_fsm.c 
b/drivers/net/ethernet/mellanox/mlxfw/mlxfw_fsm.c
index 240c027e5f07..588d9a9c08c9 100644
--- a/drivers/net/ethernet/mellanox/mlxfw/mlxfw_fsm.c
+++ b/drivers/net/ethernet/mellanox/mlxfw/mlxfw_fsm.c
@@ -40,7 +40,8 @@ static const char * const mlxfw_fsm_state_err_str[] = {
 };
 
 static int mlxfw_fsm_state_wait(struct mlxfw_dev *mlxfw_dev, u32 fwhandle,
-   enum mlxfw_fsm_state fsm_state)
+   enum mlxfw_fsm_state fsm_state,
+   struct netlink_ext_ack *extack)
 {
enum mlxfw_fsm_state_err fsm_state_err;
enum mlxfw_fsm_state curr_fsm_state;
@@ -57,11 +58,13 @@ static int mlxfw_fsm_state_wait(struct mlxfw_dev 
*mlxfw_dev, u32 fwhandle,
if (fsm_state_err != MLXFW_FSM_STATE_ERR_OK) {
pr_err("Firmware flash failed: %s\n",
   mlxfw_fsm_state_err_str[fsm_state_err]);
+   NL_SET_ERR_MSG_MOD(extack, "Firmware flash failed");
return -EINVAL;
}
if (curr_fsm_state != fsm_state) {
if (--times == 0) {

[patch net-next 4/7] devlink: allow driver to update progress of flash update

2019-05-23 Thread Jiri Pirko

From: Jiri Pirko 

Introduce a function to be called from drivers during flash. It sends
notification to userspace about flash update progress.

Signed-off-by: Jiri Pirko 
---
 include/net/devlink.h|   8 +++
 include/uapi/linux/devlink.h |   5 ++
 net/core/devlink.c   | 102 +++
 3 files changed, 115 insertions(+)

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 151eb930d329..8f65356132be 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -741,6 +741,14 @@ void
 devlink_health_reporter_state_update(struct devlink_health_reporter *reporter,
 enum devlink_health_reporter_state state);
 
+void devlink_flash_update_begin_notify(struct devlink *devlink);
+void devlink_flash_update_end_notify(struct devlink *devlink);
+void devlink_flash_update_status_notify(struct devlink *devlink,
+   const char *status_msg,
+   const char *component,
+   unsigned long done,
+   unsigned long total);
+
 #if IS_ENABLED(CONFIG_NET_DEVLINK)
 
 void devlink_compat_running_version(struct net_device *dev,
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index 5bb4ea67d84f..5287b42c181f 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -104,6 +104,8 @@ enum devlink_command {
DEVLINK_CMD_HEALTH_REPORTER_DUMP_CLEAR,
 
DEVLINK_CMD_FLASH_UPDATE,
+   DEVLINK_CMD_FLASH_UPDATE_END,   /* notification only */
+   DEVLINK_CMD_FLASH_UPDATE_STATUS,/* notification only */
 
/* add new commands above here */
__DEVLINK_CMD_MAX,
@@ -331,6 +333,9 @@ enum devlink_attr {
 
DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME,/* string */
DEVLINK_ATTR_FLASH_UPDATE_COMPONENT,/* string */
+   DEVLINK_ATTR_FLASH_UPDATE_STATUS_MSG,   /* string */
+   DEVLINK_ATTR_FLASH_UPDATE_STATUS_DONE,  /* u64 */
+   DEVLINK_ATTR_FLASH_UPDATE_STATUS_TOTAL, /* u64 */
 
/* add new attributes above here, update the policy in devlink.c */
 
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 9716a7f382cb..963178d32dda 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -2673,6 +2673,108 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, 
struct genl_info *info)
return devlink->ops->reload(devlink, info->extack);
 }
 
+static int devlink_nl_flash_update_fill(struct sk_buff *msg,
+   struct devlink *devlink,
+   enum devlink_command cmd,
+   const char *status_msg,
+   const char *component,
+   unsigned long done, unsigned long total)
+{
+   void *hdr;
+
+   hdr = genlmsg_put(msg, 0, 0, &devlink_nl_family, 0, cmd);
+   if (!hdr)
+   return -EMSGSIZE;
+
+   if (devlink_nl_put_handle(msg, devlink))
+   goto nla_put_failure;
+
+   if (cmd != DEVLINK_CMD_FLASH_UPDATE_STATUS)
+   goto out;
+
+   if (status_msg &&
+   nla_put_string(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_MSG,
+  status_msg))
+   goto nla_put_failure;
+   if (component &&
+   nla_put_string(msg, DEVLINK_ATTR_FLASH_UPDATE_COMPONENT,
+  component))
+   goto nla_put_failure;
+   if (nla_put_u64_64bit(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_DONE,
+ done, DEVLINK_ATTR_PAD))
+   goto nla_put_failure;
+   if (nla_put_u64_64bit(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_TOTAL,
+ total, DEVLINK_ATTR_PAD))
+   goto nla_put_failure;
+
+out:
+   genlmsg_end(msg, hdr);
+   return 0;
+
+nla_put_failure:
+   genlmsg_cancel(msg, hdr);
+   return -EMSGSIZE;
+}
+
+static void __devlink_flash_update_notify(struct devlink *devlink,
+ enum devlink_command cmd,
+ const char *status_msg,
+ const char *component,
+ unsigned long done,
+ unsigned long total)
+{
+   struct sk_buff *msg;
+   int err;
+
+   WARN_ON(cmd != DEVLINK_CMD_FLASH_UPDATE &&
+   cmd != DEVLINK_CMD_FLASH_UPDATE_END &&
+   cmd != DEVLINK_CMD_FLASH_UPDATE_STATUS);
+
+   msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+   if (!msg)
+   return;
+
+   err = devlink_nl_flash_update_fill(msg, devlink, cmd, status_msg,
+  component, done, total);
+   if (err)
+   goto out_free_msg;
+
+   genlmsg_multicast_netns(&devlink_nl_fam

[patch net-next 6/7] mlxsw: Implement flash update status notifications

2019-05-23 Thread Jiri Pirko

From: Jiri Pirko 

Implement mlxfw status_notify op by passing notification down to
devlink. Also notify about flash update begin and end.

Signed-off-by: Jiri Pirko 
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 18 +-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c 
b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index 639bb5778ff3..5ac893d9dd12 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -294,6 +294,19 @@ static void mlxsw_sp_fsm_release(struct mlxfw_dev 
*mlxfw_dev, u32 fwhandle)
mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(mcc), mcc_pl);
 }
 
+static void mlxsw_sp_status_notify(struct mlxfw_dev *mlxfw_dev,
+  const char *msg, const char *comp_name,
+  u32 done_bytes, u32 total_bytes)
+{
+   struct mlxsw_sp_mlxfw_dev *mlxsw_sp_mlxfw_dev =
+   container_of(mlxfw_dev, struct mlxsw_sp_mlxfw_dev, mlxfw_dev);
+   struct mlxsw_sp *mlxsw_sp = mlxsw_sp_mlxfw_dev->mlxsw_sp;
+
+   devlink_flash_update_status_notify(priv_to_devlink(mlxsw_sp->core),
+  msg, comp_name,
+  done_bytes, total_bytes);
+}
+
 static const struct mlxfw_dev_ops mlxsw_sp_mlxfw_dev_ops = {
.component_query= mlxsw_sp_component_query,
.fsm_lock   = mlxsw_sp_fsm_lock,
@@ -303,7 +316,8 @@ static const struct mlxfw_dev_ops mlxsw_sp_mlxfw_dev_ops = {
.fsm_activate   = mlxsw_sp_fsm_activate,
.fsm_query_state= mlxsw_sp_fsm_query_state,
.fsm_cancel = mlxsw_sp_fsm_cancel,
-   .fsm_release= mlxsw_sp_fsm_release
+   .fsm_release= mlxsw_sp_fsm_release,
+   .status_notify  = mlxsw_sp_status_notify,
 };
 
 static int mlxsw_sp_firmware_flash(struct mlxsw_sp *mlxsw_sp,
@@ -321,8 +335,10 @@ static int mlxsw_sp_firmware_flash(struct mlxsw_sp 
*mlxsw_sp,
int err;
 
mlxsw_core_fw_flash_start(mlxsw_sp->core);
+   devlink_flash_update_begin_notify(priv_to_devlink(mlxsw_sp->core));
err = mlxfw_firmware_flash(&mlxsw_sp_mlxfw_dev.mlxfw_dev,
   firmware, extack);
+   devlink_flash_update_end_notify(priv_to_devlink(mlxsw_sp->core));
mlxsw_core_fw_flash_end(mlxsw_sp->core);
 
return err;
-- 
2.17.2

[patch net-next 5/7] mlxfw: Introduce status_notify op and call it to notify about the status

2019-05-23 Thread Jiri Pirko

From: Jiri Pirko 

Add new op status_notify which is called to update the user about
flashing status.

Signed-off-by: Jiri Pirko 
---
 drivers/net/ethernet/mellanox/mlxfw/mlxfw.h   |  4 
 .../net/ethernet/mellanox/mlxfw/mlxfw_fsm.c   | 24 +++
 2 files changed, 28 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlxfw/mlxfw.h 
b/drivers/net/ethernet/mellanox/mlxfw/mlxfw.h
index 83286b90593f..c50e74ab02c4 100644
--- a/drivers/net/ethernet/mellanox/mlxfw/mlxfw.h
+++ b/drivers/net/ethernet/mellanox/mlxfw/mlxfw.h
@@ -58,6 +58,10 @@ struct mlxfw_dev_ops {
void (*fsm_cancel)(struct mlxfw_dev *mlxfw_dev, u32 fwhandle);
 
void (*fsm_release)(struct mlxfw_dev *mlxfw_dev, u32 fwhandle);
+
+   void (*status_notify)(struct mlxfw_dev *mlxfw_dev,
+ const char *msg, const char *comp_name,
+ u32 done_bytes, u32 total_bytes);
 };
 
 struct mlxfw_dev {
diff --git a/drivers/net/ethernet/mellanox/mlxfw/mlxfw_fsm.c 
b/drivers/net/ethernet/mellanox/mlxfw/mlxfw_fsm.c
index 588d9a9c08c9..6a18ec05181a 100644
--- a/drivers/net/ethernet/mellanox/mlxfw/mlxfw_fsm.c
+++ b/drivers/net/ethernet/mellanox/mlxfw/mlxfw_fsm.c
@@ -39,6 +39,16 @@ static const char * const mlxfw_fsm_state_err_str[] = {
"unknown error"
 };
 
+static void mlxfw_status_notify(struct mlxfw_dev *mlxfw_dev,
+   const char *msg, const char *comp_name,
+   u32 done_bytes, u32 total_bytes)
+{
+   if (!mlxfw_dev->ops->status_notify)
+   return;
+   mlxfw_dev->ops->status_notify(mlxfw_dev, msg, comp_name,
+ done_bytes, total_bytes);
+}
+
 static int mlxfw_fsm_state_wait(struct mlxfw_dev *mlxfw_dev, u32 fwhandle,
enum mlxfw_fsm_state fsm_state,
struct netlink_ext_ack *extack)
@@ -85,11 +95,14 @@ static int mlxfw_flash_component(struct mlxfw_dev 
*mlxfw_dev,
u16 comp_max_write_size;
u8 comp_align_bits;
u32 comp_max_size;
+   char comp_name[8];
u16 block_size;
u8 *block_ptr;
u32 offset;
int err;
 
+   sprintf(comp_name, "%u", comp->index);
+
err = mlxfw_dev->ops->component_query(mlxfw_dev, comp->index,
  &comp_max_size, &comp_align_bits,
  &comp_max_write_size);
@@ -108,6 +121,7 @@ static int mlxfw_flash_component(struct mlxfw_dev 
*mlxfw_dev,
   comp_align_bits);
 
pr_debug("Component update\n");
+   mlxfw_status_notify(mlxfw_dev, "Updating component", comp_name, 0, 0);
err = mlxfw_dev->ops->fsm_component_update(mlxfw_dev, fwhandle,
   comp->index,
   comp->data_size);
@@ -120,6 +134,8 @@ static int mlxfw_flash_component(struct mlxfw_dev 
*mlxfw_dev,
goto err_out;
 
pr_debug("Component download\n");
+   mlxfw_status_notify(mlxfw_dev, "Downloading component",
+   comp_name, 0, comp->data_size);
for (offset = 0;
 offset < MLXFW_ALIGN_UP(comp->data_size, comp_align_bits);
 offset += comp_max_write_size) {
@@ -131,9 +147,13 @@ static int mlxfw_flash_component(struct mlxfw_dev 
*mlxfw_dev,
 offset);
if (err)
goto err_out;
+   mlxfw_status_notify(mlxfw_dev, "Downloading component",
+   comp_name, offset + block_size,
+   comp->data_size);
}
 
pr_debug("Component verify\n");
+   mlxfw_status_notify(mlxfw_dev, "Verifying component", comp_name, 0, 0);
err = mlxfw_dev->ops->fsm_component_verify(mlxfw_dev, fwhandle,
   comp->index);
if (err)
@@ -203,6 +223,8 @@ int mlxfw_firmware_flash(struct mlxfw_dev *mlxfw_dev,
return PTR_ERR(mfa2_file);
 
pr_info("Initialize firmware flash process\n");
+   mlxfw_status_notify(mlxfw_dev, "Initializing firmware flash process",
+   NULL, 0, 0);
err = mlxfw_dev->ops->fsm_lock(mlxfw_dev, &fwhandle);
if (err) {
pr_err("Could not lock the firmware FSM\n");
@@ -220,6 +242,7 @@ int mlxfw_firmware_flash(struct mlxfw_dev *mlxfw_dev,
goto err_flash_components;
 
pr_debug("Activate image\n");
+   mlxfw_status_notify(mlxfw_dev, "Activating image", NULL, 0, 0);
err = mlxfw_dev->ops->fsm_activate(mlxfw_dev, fwhandle);
if (err) {
pr_err("Could not activate the downloaded image\n");
@@ -236,6 +259,7 @@ int mlxfw_firmware_flash(struct mlxfw_dev *mlxfw_dev,
mlxfw_dev->ops->fsm_rele

[patch net-next 7/7] netdevsim: implement fake flash updating with notifications

2019-05-23 Thread Jiri Pirko

From: Jiri Pirko 

Signed-off-by: Jiri Pirko 
---
 drivers/net/netdevsim/dev.c | 35 +++
 1 file changed, 35 insertions(+)

diff --git a/drivers/net/netdevsim/dev.c b/drivers/net/netdevsim/dev.c
index b509b941d5ca..c15b86f9cd2b 100644
--- a/drivers/net/netdevsim/dev.c
+++ b/drivers/net/netdevsim/dev.c
@@ -220,8 +220,43 @@ static int nsim_dev_reload(struct devlink *devlink,
return 0;
 }
 
+#define NSIM_DEV_FLASH_SIZE 50
+#define NSIM_DEV_FLASH_CHUNK_SIZE 1000
+#define NSIM_DEV_FLASH_CHUNK_TIME_MS 10
+
+static int nsim_dev_flash_update(struct devlink *devlink, const char 
*file_name,
+const char *component,
+struct netlink_ext_ack *extack)
+{
+   int i;
+
+   devlink_flash_update_begin_notify(devlink);
+
+   devlink_flash_update_status_notify(devlink, "Preparing to flash",
+  component, 0, 0);
+   for (i = 0; i < NSIM_DEV_FLASH_SIZE / NSIM_DEV_FLASH_CHUNK_SIZE; i++) {
+   devlink_flash_update_status_notify(devlink, "Flashing",
+  component,
+  i * 
NSIM_DEV_FLASH_CHUNK_SIZE,
+  NSIM_DEV_FLASH_SIZE);
+   msleep(NSIM_DEV_FLASH_CHUNK_TIME_MS);
+   }
+   devlink_flash_update_status_notify(devlink, "Flashing",
+  component,
+  NSIM_DEV_FLASH_SIZE,
+  NSIM_DEV_FLASH_SIZE);
+
+   devlink_flash_update_status_notify(devlink, "Flashing done",
+  component, 0, 0);
+
+   devlink_flash_update_end_notify(devlink);
+
+   return 0;
+}
+
 static const struct devlink_ops nsim_dev_devlink_ops = {
.reload = nsim_dev_reload,
+   .flash_update = nsim_dev_flash_update,
 };
 
 static struct nsim_dev *
-- 
2.17.2

[patch iproute2 2/3] devlink: implement flash update status monitoring

2019-05-23 Thread Jiri Pirko

From: Jiri Pirko 

Kernel sends notifications about flash update status, so implement these
messages for monitoring.

Signed-off-by: Jiri Pirko 
---
 devlink/devlink.c | 43 +++
 1 file changed, 43 insertions(+)

diff --git a/devlink/devlink.c b/devlink/devlink.c
index 436935f88bda..55cbc01189db 100644
--- a/devlink/devlink.c
+++ b/devlink/devlink.c
@@ -414,6 +414,10 @@ static const enum mnl_attr_data_type 
devlink_policy[DEVLINK_ATTR_MAX + 1] = {
[DEVLINK_ATTR_HEALTH_REPORTER_RECOVER_COUNT] = MNL_TYPE_U64,
[DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS] = MNL_TYPE_U64,
[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] = MNL_TYPE_U64,
+   [DEVLINK_ATTR_FLASH_UPDATE_COMPONENT] = MNL_TYPE_STRING,
+   [DEVLINK_ATTR_FLASH_UPDATE_STATUS_MSG] = MNL_TYPE_STRING,
+   [DEVLINK_ATTR_FLASH_UPDATE_STATUS_DONE] = MNL_TYPE_U64,
+   [DEVLINK_ATTR_FLASH_UPDATE_STATUS_TOTAL] = MNL_TYPE_U64,
 };
 
 static int attr_cb(const struct nlattr *attr, void *data)
@@ -3712,6 +3716,9 @@ static const char *cmd_name(uint8_t cmd)
case DEVLINK_CMD_REGION_SET: return "set";
case DEVLINK_CMD_REGION_NEW: return "new";
case DEVLINK_CMD_REGION_DEL: return "del";
+   case DEVLINK_CMD_FLASH_UPDATE: return "begin";
+   case DEVLINK_CMD_FLASH_UPDATE_END: return "end";
+   case DEVLINK_CMD_FLASH_UPDATE_STATUS: return "status";
default: return "";
}
 }
@@ -3740,6 +3747,10 @@ static const char *cmd_obj(uint8_t cmd)
case DEVLINK_CMD_REGION_NEW:
case DEVLINK_CMD_REGION_DEL:
return "region";
+   case DEVLINK_CMD_FLASH_UPDATE:
+   case DEVLINK_CMD_FLASH_UPDATE_END:
+   case DEVLINK_CMD_FLASH_UPDATE_STATUS:
+   return "flash";
default: return "";
}
 }
@@ -3764,6 +3775,29 @@ static bool cmd_filter_check(struct dl *dl, uint8_t cmd)
return false;
 }
 
+static void pr_out_flash_update(struct dl *dl, struct nlattr **tb)
+{
+   __pr_out_handle_start(dl, tb, true, false);
+
+   if (tb[DEVLINK_ATTR_FLASH_UPDATE_STATUS_MSG])
+   pr_out_str(dl, "msg",
+  
mnl_attr_get_str(tb[DEVLINK_ATTR_FLASH_UPDATE_STATUS_MSG]));
+
+   if (tb[DEVLINK_ATTR_FLASH_UPDATE_COMPONENT])
+   pr_out_str(dl, "component",
+  
mnl_attr_get_str(tb[DEVLINK_ATTR_FLASH_UPDATE_COMPONENT]));
+
+   if (tb[DEVLINK_ATTR_FLASH_UPDATE_STATUS_DONE])
+   pr_out_u64(dl, "done",
+  
mnl_attr_get_u64(tb[DEVLINK_ATTR_FLASH_UPDATE_STATUS_DONE]));
+
+   if (tb[DEVLINK_ATTR_FLASH_UPDATE_STATUS_TOTAL])
+   pr_out_u64(dl, "total",
+  
mnl_attr_get_u64(tb[DEVLINK_ATTR_FLASH_UPDATE_STATUS_TOTAL]));
+
+   pr_out_handle_end(dl);
+}
+
 static void pr_out_region(struct dl *dl, struct nlattr **tb);
 
 static int cmd_mon_show_cb(const struct nlmsghdr *nlh, void *data)
@@ -3820,6 +3854,15 @@ static int cmd_mon_show_cb(const struct nlmsghdr *nlh, 
void *data)
pr_out_mon_header(genl->cmd);
pr_out_region(dl, tb);
break;
+   case DEVLINK_CMD_FLASH_UPDATE: /* fall through */
+   case DEVLINK_CMD_FLASH_UPDATE_END: /* fall through */
+   case DEVLINK_CMD_FLASH_UPDATE_STATUS:
+   mnl_attr_parse(nlh, sizeof(*genl), attr_cb, tb);
+   if (!tb[DEVLINK_ATTR_BUS_NAME] || !tb[DEVLINK_ATTR_DEV_NAME])
+   return MNL_CB_ERROR;
+   pr_out_mon_header(genl->cmd);
+   pr_out_flash_update(dl, tb);
+   break;
}
return MNL_CB_OK;
 }
-- 
2.17.2

[patch iproute2 3/3] devlink: implement flash status monitoring

2019-05-23 Thread Jiri Pirko

From: Jiri Pirko 

Listen to status notifications coming from kernel during flashing and
put them on stdout to inform user about the status.

Signed-off-by: Jiri Pirko 
---
 devlink/devlink.c | 209 +-
 devlink/mnlg.c|   5 ++
 devlink/mnlg.h|   1 +
 3 files changed, 211 insertions(+), 4 deletions(-)

diff --git a/devlink/devlink.c b/devlink/devlink.c
index 55cbc01189db..8078e8801e98 100644
--- a/devlink/devlink.c
+++ b/devlink/devlink.c
@@ -24,6 +24,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "SNAPSHOT.h"
 #include "list.h"
@@ -68,6 +69,12 @@ static int g_new_line_count;
g_new_line_count = 0;   \
} while (0)
 
+#define pr_out_tty(args...)\
+   do {\
+   if (isatty(STDOUT_FILENO))  \
+   fprintf(stdout, ##args);\
+   } while (0)
+
 static int g_indent_level;
 static bool g_indent_newline;
 #define INDENT_STR_STEP 2
@@ -113,9 +120,8 @@ static int _mnlg_socket_recv_run(struct mnlg_socket *nlg,
return 0;
 }
 
-static int _mnlg_socket_sndrcv(struct mnlg_socket *nlg,
-  const struct nlmsghdr *nlh,
-  mnl_cb_t data_cb, void *data)
+static int _mnlg_socket_send(struct mnlg_socket *nlg,
+const struct nlmsghdr *nlh)
 {
int err;
 
@@ -124,6 +130,18 @@ static int _mnlg_socket_sndrcv(struct mnlg_socket *nlg,
pr_err("Failed to call mnlg_socket_send\n");
return -errno;
}
+   return 0;
+}
+
+static int _mnlg_socket_sndrcv(struct mnlg_socket *nlg,
+  const struct nlmsghdr *nlh,
+  mnl_cb_t data_cb, void *data)
+{
+   int err;
+
+   err = _mnlg_socket_send(nlg, nlh);
+   if (err)
+   return err;
return _mnlg_socket_recv_run(nlg, data_cb, data);
 }
 
@@ -2697,9 +2715,151 @@ static void cmd_dev_flash_help(void)
pr_err("Usage: devlink dev flash DEV file PATH [ component NAME ]\n");
 }
 
+
+struct cmd_dev_flash_status_ctx {
+   struct dl *dl;
+   char *last_msg;
+   char *last_component;
+   uint8_t not_first:1,
+   last_pc:1,
+   received_end:1,
+   flash_done:1;
+};
+
+static int nullstrcmp(const char *str1, const char *str2)
+{
+   if (str1 && str2)
+   return strcmp(str1, str2);
+   if (!str1 && !str2)
+   return 0;
+   return str1 ? 1 : -1;
+}
+
+static int cmd_dev_flash_status_cb(const struct nlmsghdr *nlh, void *data)
+{
+   struct cmd_dev_flash_status_ctx *ctx = data;
+   struct dl_opts *opts = &ctx->dl->opts;
+   struct genlmsghdr *genl = mnl_nlmsg_get_payload(nlh);
+   struct nlattr *tb[DEVLINK_ATTR_MAX + 1] = {};
+   const char *component = NULL;
+   uint64_t done = 0, total = 0;
+   const char *msg = NULL;
+   const char *bus_name;
+   const char *dev_name;
+
+   if (genl->cmd != DEVLINK_CMD_FLASH_UPDATE_STATUS &&
+   genl->cmd != DEVLINK_CMD_FLASH_UPDATE_END)
+   return MNL_CB_STOP;
+
+   mnl_attr_parse(nlh, sizeof(*genl), attr_cb, tb);
+   if (!tb[DEVLINK_ATTR_BUS_NAME] || !tb[DEVLINK_ATTR_DEV_NAME])
+   return MNL_CB_ERROR;
+   bus_name = mnl_attr_get_str(tb[DEVLINK_ATTR_BUS_NAME]);
+   dev_name = mnl_attr_get_str(tb[DEVLINK_ATTR_DEV_NAME]);
+   if (strcmp(bus_name, opts->bus_name) ||
+   strcmp(dev_name, opts->dev_name))
+   return MNL_CB_ERROR;
+
+   if (genl->cmd == DEVLINK_CMD_FLASH_UPDATE_END && ctx->not_first) {
+   pr_out("\n");
+   free(ctx->last_msg);
+   free(ctx->last_component);
+   ctx->received_end = 1;
+   return MNL_CB_STOP;
+   }
+
+   if (tb[DEVLINK_ATTR_FLASH_UPDATE_STATUS_MSG])
+   msg = 
mnl_attr_get_str(tb[DEVLINK_ATTR_FLASH_UPDATE_STATUS_MSG]);
+   if (tb[DEVLINK_ATTR_FLASH_UPDATE_COMPONENT])
+   component = 
mnl_attr_get_str(tb[DEVLINK_ATTR_FLASH_UPDATE_COMPONENT]);
+   if (tb[DEVLINK_ATTR_FLASH_UPDATE_STATUS_DONE])
+   done = 
mnl_attr_get_u64(tb[DEVLINK_ATTR_FLASH_UPDATE_STATUS_DONE]);
+   if (tb[DEVLINK_ATTR_FLASH_UPDATE_STATUS_TOTAL])
+   total = 
mnl_attr_get_u64(tb[DEVLINK_ATTR_FLASH_UPDATE_STATUS_TOTAL]);
+
+   if (!nullstrcmp(msg, ctx->last_msg) &&
+   !nullstrcmp(component, ctx->last_component) &&
+   ctx->last_pc && ctx->not_first) {
+   pr_out_tty("\b\b\b\b\b"); /* clean percentage */
+   } else {
+   if (ctx->not_first)
+   pr_out("\n");
+   if (component) {
+   pr_out("[%s] ", component);
+   free(ctx->last_component);
+

[patch iproute2 1/3] header update

2019-05-23 Thread Jiri Pirko

From: Jiri Pirko 

Signed-off-by: Jiri Pirko 
---
 include/uapi/linux/devlink.h | 5 +
 1 file changed, 5 insertions(+)

diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index 3b6a9e6be3ac..6544824a0b97 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -104,6 +104,8 @@ enum devlink_command {
DEVLINK_CMD_HEALTH_REPORTER_DUMP_CLEAR,
 
DEVLINK_CMD_FLASH_UPDATE,
+   DEVLINK_CMD_FLASH_UPDATE_END,   /* notification only */
+   DEVLINK_CMD_FLASH_UPDATE_STATUS,/* notification only */
 
/* add new commands above here */
__DEVLINK_CMD_MAX,
@@ -331,6 +333,9 @@ enum devlink_attr {
 
DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME,/* string */
DEVLINK_ATTR_FLASH_UPDATE_COMPONENT,/* string */
+   DEVLINK_ATTR_FLASH_UPDATE_STATUS_MSG,   /* string */
+   DEVLINK_ATTR_FLASH_UPDATE_STATUS_DONE,  /* u64 */
+   DEVLINK_ATTR_FLASH_UPDATE_STATUS_TOTAL, /* u64 */
 
/* add new attributes above here, update the policy in devlink.c */
 
-- 
2.17.2

[PATCH 2/8] dt-bindings: net: Add a YAML schemas for the generic PHY options

2019-05-23 Thread Maxime Ripard

The networking PHYs have a number of available device tree properties that
can be used in their device tree node. Add a YAML schemas for those.

Signed-off-by: Maxime Ripard 
---
 Documentation/devicetree/bindings/net/ethernet-phy.yaml | 148 +-
 Documentation/devicetree/bindings/net/phy.txt   |  80 +-
 2 files changed, 149 insertions(+), 79 deletions(-)
 create mode 100644 Documentation/devicetree/bindings/net/ethernet-phy.yaml

diff --git a/Documentation/devicetree/bindings/net/ethernet-phy.yaml 
b/Documentation/devicetree/bindings/net/ethernet-phy.yaml
new file mode 100644
index ..eb79ee6db977
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/ethernet-phy.yaml
@@ -0,0 +1,148 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/net/ethernet-phy.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Ethernet PHY Generic Binding
+
+maintainers:
+  - David S. Miller 
+
+properties:
+  $nodename:
+pattern: "^ethernet-phy(@[a-f0-9])?$"
+
+  compatible:
+oneOf:
+  - const: ethernet-phy-ieee802.3-c22
+description: PHYs that implement IEEE802.3 clause 22
+  - const: ethernet-phy-ieee802.3-c45
+description: PHYs that implement IEEE802.3 clause 45
+  - pattern: "^ethernet-phy-id[a-f0-9]{4}\\.[a-f0-9]{4}$"
+description:
+  The first group of digits is the 16 bit Phy Identifier 1
+  register, this is the chip vendor OUI bits 3:18. The
+  second group of digits is the Phy Identifier 2 register,
+  this is the chip vendor OUI bits 19:24, followed by 10
+  bits of a vendor specific ID.
+
+  reg:
+maxItems: 1
+minimum: 0
+maximum: 31
+description:
+  The ID number for the PHY.
+
+  interrupts:
+maxItems: 1
+
+  max-speed:
+enum:
+  - 10
+  - 100
+  - 1000
+description:
+  Maximum PHY supported speed in Mbits / seconds.
+
+  broken-turn-around:
+$ref: /schemas/types.yaml#definitions/flag
+description:
+  If set, indicates the PHY device does not correctly release
+  the turn around line low at the end of a MDIO transaction.
+
+  enet-phy-lane-swap:
+$ref: /schemas/types.yaml#definitions/flag
+description:
+  If set, indicates the PHY will swap the TX/RX lanes to
+  compensate for the board being designed with the lanes
+  swapped.
+
+  eee-broken-100tx:
+$ref: /schemas/types.yaml#definitions/flag
+description:
+  Mark the corresponding energy efficient ethernet mode as
+  broken and request the ethernet to stop advertising it.
+
+  eee-broken-1000t:
+$ref: /schemas/types.yaml#definitions/flag
+description:
+  Mark the corresponding energy efficient ethernet mode as
+  broken and request the ethernet to stop advertising it.
+
+  eee-broken-10gt:
+$ref: /schemas/types.yaml#definitions/flag
+description:
+  Mark the corresponding energy efficient ethernet mode as
+  broken and request the ethernet to stop advertising it.
+
+  eee-broken-1000kx:
+$ref: /schemas/types.yaml#definitions/flag
+description:
+  Mark the corresponding energy efficient ethernet mode as
+  broken and request the ethernet to stop advertising it.
+
+  eee-broken-10gkx4:
+$ref: /schemas/types.yaml#definitions/flag
+description:
+  Mark the corresponding energy efficient ethernet mode as
+  broken and request the ethernet to stop advertising it.
+
+  eee-broken-10gkr:
+$ref: /schemas/types.yaml#definitions/flag
+description:
+  Mark the corresponding energy efficient ethernet mode as
+  broken and request the ethernet to stop advertising it.
+
+  phy-is-integrated:
+$ref: /schemas/types.yaml#definitions/flag
+description:
+  If set, indicates that the PHY is integrated into the same
+  physical package as the Ethernet MAC. If needed, muxers
+  should be configured to ensure the integrated PHY is
+  used. The absence of this property indicates the muxers
+  should be configured so that the external PHY is used.
+
+  resets:
+maxItems: 1
+
+  reset-names:
+const: phy
+
+  reset-gpios:
+description:
+  The GPIO phandle and specifier for the PHY reset signal.
+
+  reset-assert-us:
+description:
+  Delay after the reset was asserted in microseconds. If this
+  property is missing the delay will be skipped.
+
+  reset-deassert-us:
+description:
+  Delay after the reset was deasserted in microseconds. If
+  this property is missing the delay will be skipped.
+
+required:
+  - reg
+  - interrupts
+
+examples:
+  - |
+ethernet {
+#address-cells = <1>;
+#size-cells = <0>;
+
+ethernet-phy@0 {
+compatible = "ethernet-phy-id0141.0e90", 
"ethernet-phy-ieee802.3-c22";
+interrupt-parent = <&PIC>;
+interrupts = <35 1>;
+reg = <0>;
+
+resets = <&rst 8>;
+r

[PATCH 3/8] dt-bindings: net: phy: The interrupt property is not mandatory

2019-05-23 Thread Maxime Ripard

Unlike what was initially claimed in the PHY binding, the interrupt
property of a PHY can be omitted, and the OS will turn to polling instead.

Document that.

Signed-off-by: Maxime Ripard 
---
 Documentation/devicetree/bindings/net/ethernet-phy.yaml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/Documentation/devicetree/bindings/net/ethernet-phy.yaml 
b/Documentation/devicetree/bindings/net/ethernet-phy.yaml
index eb79ee6db977..d2cc4b46f6dc 100644
--- a/Documentation/devicetree/bindings/net/ethernet-phy.yaml
+++ b/Documentation/devicetree/bindings/net/ethernet-phy.yaml
@@ -125,7 +125,6 @@ properties:
 
 required:
   - reg
-  - interrupts
 
 examples:
   - |
-- 
git-series 0.9.1

[PATCH 1/8] dt-bindings: net: Add YAML schemas for the generic Ethernet options

2019-05-23 Thread Maxime Ripard

The Ethernet controllers have a good number of generic options that can be
needed in a device tree. Add a YAML schemas for those.

Signed-off-by: Maxime Ripard 
---
 Documentation/devicetree/bindings/net/ethernet-controller.yaml | 197 +++-
 Documentation/devicetree/bindings/net/ethernet.txt |  68 +--
 Documentation/devicetree/bindings/net/fixed-link.txt   |  55 +--
 3 files changed, 199 insertions(+), 121 deletions(-)
 create mode 100644 
Documentation/devicetree/bindings/net/ethernet-controller.yaml

diff --git a/Documentation/devicetree/bindings/net/ethernet-controller.yaml 
b/Documentation/devicetree/bindings/net/ethernet-controller.yaml
new file mode 100644
index ..1c6e9e755481
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/ethernet-controller.yaml
@@ -0,0 +1,197 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/net/ethernet-controller.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Ethernet Controller Generic Binding
+
+maintainers:
+  - David S. Miller 
+
+properties:
+  $nodename:
+pattern: "^ethernet(@.*)?$"
+
+  local-mac-address:
+allOf:
+  - $ref: /schemas/types.yaml#definitions/uint8-array
+  - minItems: 6
+maxItems: 6
+description:
+  Specifies the MAC address that was assigned to the network device.
+
+  mac-address:
+allOf:
+  - $ref: /schemas/types.yaml#definitions/uint8-array
+  - minItems: 6
+maxItems: 6
+description:
+  Specifies the MAC address that was last used by the boot
+  program; should be used in cases where the MAC address assigned
+  to the device by the boot program is different from the
+  local-mac-address property.
+
+  max-frame-size:
+$ref: /schemas/types.yaml#definitions/uint32
+description:
+  Maximum transfer unit (IEEE defined MTU), rather than the
+  maximum frame size (there\'s contradiction in the Devicetree
+  Specification).
+
+  max-speed:
+$ref: /schemas/types.yaml#definitions/uint32
+description:
+  Specifies maximum speed in Mbit/s supported by the device.
+
+  nvmem-cells:
+maxItems: 1
+description:
+  Reference to an nvmem node for the MAC address
+
+  nvmem-cells-names:
+const: mac-address
+
+  phy-connection-type:
+description:
+  Operation mode of the PHY interface
+oneOf:
+  - const: internal
+description:
+  there is not a standard bus between the MAC and the PHY,
+  something proprietary is being used to embed the PHY in the
+  MAC.
+  - const: mii
+  - const: gmii
+  - const: sgmii
+  - const: qsgmii
+  - const: tbi
+  - const: rev-mii
+  - const: rmii
+  - const: rgmii
+description:
+  RX and TX delays are added by the MAC when required
+  - const: rgmii-id
+description:
+  RGMII with internal RX and TX delays provided by the PHY,
+  the MAC should not add the RX or TX delays in this case
+  - const: rgmii-rxid
+description:
+  RGMII with internal RX delay provided by the PHY, the MAC
+  should not add an RX delay in this case
+  - const: rgmii-txid
+description:
+  RGMII with internal TX delay provided by the PHY, the MAC
+  should not add an TX delay in this case
+  - const: rtbi
+  - const: smii
+  - const: xgmii
+  - const: trgmii
+  - const: 1000base-x
+  - const: 2500base-x
+  - const: rxaui
+  - const: xaui
+  - const: 10gbase-kr
+description:
+  10GBASE-KR, XFI, SFI
+
+  phy-mode:
+$ref: "#/properties/phy-connection-type"
+description:
+  Deprecated in favor of phy-connection-type
+
+  phy-handle:
+$ref: /schemas/types.yaml#definitions/phandle
+description:
+  Specifies a reference to a node representing a PHY device.
+
+  phy:
+$ref: "#/properties/phy-handle"
+description:
+  Deprecated in favor of phy-handle
+
+  phy-device:
+$ref: "#/properties/phy-handle"
+description:
+  Deprecated in favor of phy-handle
+
+  rx-fifo-depth:
+$ref: /schemas/types.yaml#definitions/uint32
+description:
+  The size of the controller\'s receive fifo in bytes. This is used
+  for components that can have configurable receive fifo sizes,
+  and is useful for determining certain configuration settings
+  such as flow control thresholds.
+
+  tx-fifo-depth:
+$ref: /schemas/types.yaml#definitions/uint32
+description:
+  The size of the controller\'s transmit fifo in bytes. This
+  is used for components that can have configurable fifo sizes.
+
+  managed:
+allOf:
+  - $ref: /schemas/types.yaml#definitions/string
+  - default: auto
+enum:
+  - auto
+  - in-band-status
+description:
+  Specifies the PHY management type. If auto is set and fixed-link
+  is not specified, it us

[PATCH 4/8] dt-bindings: net: sun4i-emac: Convert the binding to a schemas

2019-05-23 Thread Maxime Ripard

Switch our Allwinner A10 EMAC controller binding to a YAML schema to enable
the DT validation.

Signed-off-by: Maxime Ripard 
---
 Documentation/devicetree/bindings/net/allwinner,sun4i-a10-emac.yaml | 55 
+++
 Documentation/devicetree/bindings/net/allwinner,sun4i-emac.txt  | 19 
---
 2 files changed, 55 insertions(+), 19 deletions(-)
 create mode 100644 
Documentation/devicetree/bindings/net/allwinner,sun4i-a10-emac.yaml
 delete mode 100644 
Documentation/devicetree/bindings/net/allwinner,sun4i-emac.txt

diff --git 
a/Documentation/devicetree/bindings/net/allwinner,sun4i-a10-emac.yaml 
b/Documentation/devicetree/bindings/net/allwinner,sun4i-a10-emac.yaml
new file mode 100644
index ..b5d82d0a59d8
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/allwinner,sun4i-a10-emac.yaml
@@ -0,0 +1,55 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/net/allwinner,sun4i-a10-emac.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner A10 EMAC Ethernet Controller Device Tree Bindings
+
+allOf:
+  - $ref: "ethernet-controller.yaml#"
+
+maintainers:
+  - Chen-Yu Tsai 
+  - Maxime Ripard 
+
+properties:
+  compatible:
+const: allwinner,sun4i-a10-emac
+
+  reg:
+maxItems: 1
+
+  interrupts:
+maxItems: 1
+
+  clocks:
+maxItems: 1
+
+  allwinner,sram:
+description: Phandle to the device SRAM
+$ref: /schemas/types.yaml#/definitions/phandle-array
+
+required:
+  - compatible
+  - reg
+  - interrupts
+  - clocks
+  - phy
+  - allwinner,sram
+
+examples:
+  - |
+emac: ethernet@1c0b000 {
+compatible = "allwinner,sun4i-a10-emac";
+reg = <0x01c0b000 0x1000>;
+interrupts = <55>;
+clocks = <&ahb_gates 17>;
+phy = <&phy0>;
+};
+
+# FIXME: We should set it, but it would report all the generic
+# properties as additional properties.
+# additionalProperties: false
+
+...
diff --git a/Documentation/devicetree/bindings/net/allwinner,sun4i-emac.txt 
b/Documentation/devicetree/bindings/net/allwinner,sun4i-emac.txt
deleted file mode 100644
index e98118aef5f6..
--- a/Documentation/devicetree/bindings/net/allwinner,sun4i-emac.txt
+++ /dev/null
@@ -1,19 +0,0 @@
-* Allwinner EMAC ethernet controller
-
-Required properties:
-- compatible: should be "allwinner,sun4i-a10-emac" (Deprecated:
-  "allwinner,sun4i-emac")
-- reg: address and length of the register set for the device.
-- interrupts: interrupt for the device
-- phy: see ethernet.txt file in the same directory.
-- clocks: A phandle to the reference clock for this device
-
-Example:
-
-emac: ethernet@1c0b000 {
-   compatible = "allwinner,sun4i-a10-emac";
-   reg = <0x01c0b000 0x1000>;
-   interrupts = <55>;
-   clocks = <&ahb_gates 17>;
-   phy = <&phy0>;
-};
-- 
git-series 0.9.1

[PATCH 5/8] dt-bindings: net: sun4i-mdio: Convert the binding to a schemas

2019-05-23 Thread Maxime Ripard

Switch our Allwinner A10 MDIO controller binding to a YAML schema to enable
the DT validation.

Signed-off-by: Maxime Ripard 
---
 Documentation/devicetree/bindings/net/allwinner,sun4i-a10-mdio.yaml | 55 
+++
 Documentation/devicetree/bindings/net/allwinner,sun4i-mdio.txt  | 27 
---
 2 files changed, 55 insertions(+), 27 deletions(-)
 create mode 100644 
Documentation/devicetree/bindings/net/allwinner,sun4i-a10-mdio.yaml
 delete mode 100644 
Documentation/devicetree/bindings/net/allwinner,sun4i-mdio.txt

diff --git 
a/Documentation/devicetree/bindings/net/allwinner,sun4i-a10-mdio.yaml 
b/Documentation/devicetree/bindings/net/allwinner,sun4i-a10-mdio.yaml
new file mode 100644
index ..32c0fdc57d35
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/allwinner,sun4i-a10-mdio.yaml
@@ -0,0 +1,55 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/net/allwinner,sun4i-a10-mdio.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner A10 MDIO Controller Device Tree Bindings
+
+maintainers:
+  - Chen-Yu Tsai 
+  - Maxime Ripard 
+
+properties:
+  "#address-cells":
+const: 1
+
+  "#size-cells":
+const: 0
+
+  compatible:
+enum:
+  - allwinner,sun4i-a10-mdio
+
+  # Deprecated
+  - allwinner,sun4i-mdio
+
+  reg:
+maxItems: 1
+
+  phy-supply:
+description: PHY regulator
+
+required:
+  - compatible
+  - reg
+
+examples:
+  - |
+mdio@1c0b080 {
+compatible = "allwinner,sun4i-a10-mdio";
+reg = <0x01c0b080 0x14>;
+#address-cells = <1>;
+#size-cells = <0>;
+phy-supply = <®_emac_3v3>;
+
+phy0: ethernet-phy@0 {
+reg = <0>;
+};
+};
+
+# FIXME: We should set it, but it would report all the generic
+# properties as additional properties.
+# additionalProperties: false
+
+...
diff --git a/Documentation/devicetree/bindings/net/allwinner,sun4i-mdio.txt 
b/Documentation/devicetree/bindings/net/allwinner,sun4i-mdio.txt
deleted file mode 100644
index ab5b8613b0ef..
--- a/Documentation/devicetree/bindings/net/allwinner,sun4i-mdio.txt
+++ /dev/null
@@ -1,27 +0,0 @@
-* Allwinner A10 MDIO Ethernet Controller interface
-
-Required properties:
-- compatible: should be "allwinner,sun4i-a10-mdio"
-  (Deprecated: "allwinner,sun4i-mdio").
-- reg: address and length of the register set for the device.
-
-Optional properties:
-- phy-supply: phandle to a regulator if the PHY needs one
-
-Example at the SoC level:
-mdio@1c0b080 {
-   compatible = "allwinner,sun4i-a10-mdio";
-   reg = <0x01c0b080 0x14>;
-   #address-cells = <1>;
-   #size-cells = <0>;
-};
-
-And at the board level:
-
-mdio@1c0b080 {
-   phy-supply = <®_emac_3v3>;
-
-   phy0: ethernet-phy@0 {
-   reg = <0>;
-   };
-};
-- 
git-series 0.9.1

[PATCH 6/8] dt-bindings: net: stmmac: Convert the binding to a schemas

2019-05-23 Thread Maxime Ripard

Switch the STMMAC / Synopsys DesignWare MAC controller binding to a YAML
schema to enable the DT validation.

Signed-off-by: Maxime Ripard 
---
 Documentation/devicetree/bindings/net/snps,dwmac.yaml | 344 +++-
 Documentation/devicetree/bindings/net/stmmac.txt  | 179 +--
 2 files changed, 345 insertions(+), 178 deletions(-)
 create mode 100644 Documentation/devicetree/bindings/net/snps,dwmac.yaml

diff --git a/Documentation/devicetree/bindings/net/snps,dwmac.yaml 
b/Documentation/devicetree/bindings/net/snps,dwmac.yaml
new file mode 100644
index ..be3ada5121e1
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/snps,dwmac.yaml
@@ -0,0 +1,344 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/net/snps,dwmac.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Synopsys DesignWare MAC Device Tree Bindings
+
+maintainers:
+  - Alexandre Torgue 
+  - Giuseppe Cavallaro 
+  - Jose Abreu 
+
+properties:
+  compatible:
+oneOf:
+  - const: snps,dwmac
+  - const: snps,dwmac-3.50a
+  - const: snps,dwmac-3.610
+  - const: snps,dwmac-3.70a
+  - const: snps,dwmac-3.710
+  - const: snps,dwmac-4.00
+  - const: snps,dwmac-4.10a
+  - const: snps,dwxgmac
+  - const: snps,dwxgmac-2.10
+  - const: st,spear600-gmac
+description: Deprecated
+
+  reg:
+maxItems: 1
+
+  interrupts:
+minItems: 1
+maxItems: 3
+items:
+  - description: Combined signal for various interrupt events
+  - description: The interrupt to manage the remote wake-up packet 
detection
+  - description: The interrupt that occurs when Rx exits the LPI state
+
+  interrupt-names:
+minItems: 1
+maxItems: 3
+items:
+  - const: macirq
+  - const: eth_wake_irq
+  - const: eth_lpi
+
+  clocks:
+minItems: 1
+maxItems: 3
+items:
+  - description: GMAC main clock
+  - description: Peripheral registers interface clock
+  - description:
+  PTP reference clock. This clock is used for programming the
+  Timestamp Addend Register. If not passed then the system
+  clock will be used and this is fine on some platforms.
+
+  clock-names:
+additionalItems: true
+contains:
+  enum:
+- stmmaceth
+- pclk
+- ptp_ref
+
+  resets:
+maxItems: 1
+description:
+  MAC Reset signal.
+
+  reset-names:
+const: stmmaceth
+
+  snps,axi-config:
+$ref: /schemas/types.yaml#definitions/phandle
+description:
+  AXI BUS Mode parameters. Phandle to a node that can contain the
+  following properties
+* snps,lpi_en, enable Low Power Interface
+* snps,xit_frm, unlock on WoL
+* snps,wr_osr_lmt, max write outstanding req. limit
+* snps,rd_osr_lmt, max read outstanding req. limit
+* snps,kbbe, do not cross 1KiB boundary.
+* snps,blen, this is a vector of supported burst length.
+* snps,fb, fixed-burst
+* snps,mb, mixed-burst
+* snps,rb, rebuild INCRx Burst
+
+  snps,mtl-rx-config:
+$ref: /schemas/types.yaml#definitions/phandle
+description:
+  Multiple RX Queues parameters. Phandle to a node that can
+  contain the following properties
+* snps,rx-queues-to-use, number of RX queues to be used in the
+  driver
+* Choose one of these RX scheduling algorithms
+  * snps,rx-sched-sp, Strict priority
+  * snps,rx-sched-wsp, Weighted Strict priority
+* For each RX queue
+  * Choose one of these modes
+* snps,dcb-algorithm, Queue to be enabled as DCB
+* snps,avb-algorithm, Queue to be enabled as AVB
+  * snps,map-to-dma-channel, Channel to map
+  * Specifiy specific packet routing
+* snps,route-avcp, AV Untagged Control packets
+* snps,route-ptp, PTP Packets
+* snps,route-dcbcp, DCB Control Packets
+* snps,route-up, Untagged Packets
+* snps,route-multi-broad, Multicast & Broadcast Packets
+  * snps,priority, RX queue priority (Range 0x0 to 0xF)
+
+  snps,mtl-tx-config:
+$ref: /schemas/types.yaml#definitions/phandle
+description:
+  Multiple TX Queues parameters. Phandle to a node that can
+  contain the following properties
+* snps,tx-queues-to-use, number of TX queues to be used in the
+  driver
+* Choose one of these TX scheduling algorithms
+  * snps,tx-sched-wrr, Weighted Round Robin
+  * snps,tx-sched-wfq, Weighted Fair Queuing
+  * snps,tx-sched-dwrr, Deficit Weighted Round Robin
+  * snps,tx-sched-sp, Strict priority
+* For each TX queue
+  * snps,weight, TX queue weight (if using a DCB weight
+algorithm)
+  * Choose one of these modes
+* snps,dcb-algorithm, TX queue will be working in DCB
+* snps,avb-algorithm, TX queue will

[PATCH 8/8] dt-bindings: net: sun8i-emac: Convert the binding to a schemas

2019-05-23 Thread Maxime Ripard

Switch our Allwinner H3 EMAC controller binding to a YAML schema to enable
the DT validation. Since that controller is based on a Synopsys IP, let's
add the validation to that schemas with a bunch of conditionals.

Signed-off-by: Maxime Ripard 
---
 Documentation/devicetree/bindings/net/dwmac-sun8i.txt | 201 +--
 Documentation/devicetree/bindings/net/snps,dwmac.yaml | 342 +++-
 2 files changed, 342 insertions(+), 201 deletions(-)
 delete mode 100644 Documentation/devicetree/bindings/net/dwmac-sun8i.txt

diff --git a/Documentation/devicetree/bindings/net/dwmac-sun8i.txt 
b/Documentation/devicetree/bindings/net/dwmac-sun8i.txt
deleted file mode 100644
index 54c66d0611cb..
--- a/Documentation/devicetree/bindings/net/dwmac-sun8i.txt
+++ /dev/null
@@ -1,201 +0,0 @@
-* Allwinner sun8i GMAC ethernet controller
-
-This device is a platform glue layer for stmmac.
-Please see stmmac.txt for the other unchanged properties.
-
-Required properties:
-- compatible: must be one of the following string:
-   "allwinner,sun8i-a83t-emac"
-   "allwinner,sun8i-h3-emac"
-   "allwinner,sun8i-r40-gmac"
-   "allwinner,sun8i-v3s-emac"
-   "allwinner,sun50i-a64-emac"
-   "allwinner,sun50i-h6-emac", "allwinner-sun50i-a64-emac"
-- reg: address and length of the register for the device.
-- interrupts: interrupt for the device
-- interrupt-names: must be "macirq"
-- clocks: A phandle to the reference clock for this device
-- clock-names: must be "stmmaceth"
-- resets: A phandle to the reset control for this device
-- reset-names: must be "stmmaceth"
-- phy-mode: See ethernet.txt
-- phy-handle: See ethernet.txt
-- syscon: A phandle to the device containing the EMAC or GMAC clock register
-
-Optional properties:
-- allwinner,tx-delay-ps: TX clock delay chain value in ps.
-Range is 0-700. Default is 0.
-Unavailable for allwinner,sun8i-r40-gmac
-- allwinner,rx-delay-ps: RX clock delay chain value in ps.
-Range is 0-3100. Default is 0.
-Range is 0-700 for allwinner,sun8i-r40-gmac
-Both delay properties need to be a multiple of 100. They control the
-clock delay for external RGMII PHY. They do not apply to the internal
-PHY or external non-RGMII PHYs.
-
-Optional properties for the following compatibles:
-  - "allwinner,sun8i-h3-emac",
-  - "allwinner,sun8i-v3s-emac":
-- allwinner,leds-active-low: EPHY LEDs are active low
-
-Required child node of emac:
-- mdio bus node: should be named mdio with compatible "snps,dwmac-mdio"
-
-Required properties of the mdio node:
-- #address-cells: shall be 1
-- #size-cells: shall be 0
-
-The device node referenced by "phy" or "phy-handle" must be a child node
-of the mdio node. See phy.txt for the generic PHY bindings.
-
-The following compatibles require that the emac node have a mdio-mux child
-node called "mdio-mux":
-  - "allwinner,sun8i-h3-emac"
-  - "allwinner,sun8i-v3s-emac":
-Required properties for the mdio-mux node:
-  - compatible = "allwinner,sun8i-h3-mdio-mux"
-  - mdio-parent-bus: a phandle to EMAC mdio
-  - one child mdio for the integrated mdio with the compatible
-"allwinner,sun8i-h3-mdio-internal"
-  - one child mdio for the external mdio if present (V3s have none)
-Required properties for the mdio-mux children node:
-  - reg: 1 for internal MDIO bus, 2 for external MDIO bus
-
-The following compatibles require a PHY node representing the integrated
-PHY, under the integrated MDIO bus node if an mdio-mux node is used:
-  - "allwinner,sun8i-h3-emac",
-  - "allwinner,sun8i-v3s-emac":
-
-Additional information regarding generic multiplexer properties can be found
-at Documentation/devicetree/bindings/net/mdio-mux.txt
-
-Required properties of the integrated phy node:
-- clocks: a phandle to the reference clock for the EPHY
-- resets: a phandle to the reset control for the EPHY
-- Must be a child of the integrated mdio
-
-Example with integrated PHY:
-emac: ethernet@1c0b000 {
-   compatible = "allwinner,sun8i-h3-emac";
-   syscon = <&syscon>;
-   reg = <0x01c0b000 0x104>;
-   interrupts = ;
-   interrupt-names = "macirq";
-   resets = <&ccu RST_BUS_EMAC>;
-   reset-names = "stmmaceth";
-   clocks = <&ccu CLK_BUS_EMAC>;
-   clock-names = "stmmaceth";
-
-   phy-handle = <&int_mii_phy>;
-   phy-mode = "mii";
-   allwinner,leds-active-low;
-
-   mdio: mdio {
-   #address-cells = <1>;
-   #size-cells = <0>;
-   compatible = "snps,dwmac-mdio";
-   };
-
-   mdio-mux {
-   compatible = "mdio-mux", "allwinner,sun8i-h3-mdio-mux";
-   #address-cells = <1>;
-   #size-cells = <0>;
-
-   mdio-parent-bus = <&mdio>;
-
-   int_mdio: mdio@1 {
-   compatible = "allwinner,sun8i-h3-mdio-internal";
-   reg = <1>;
-

[PATCH 7/8] dt-bindings: net: sun7i-gmac: Convert the binding to a schemas

2019-05-23 Thread Maxime Ripard

Switch our Allwinner A20 GMAC controller binding to a YAML schema to enable
the DT validation. Since that controller is based on a Synopsys IP, let's
add the validation to that schemas with a bunch of conditionals.

Signed-off-by: Maxime Ripard 
---
 Documentation/devicetree/bindings/net/allwinner,sun7i-a20-gmac.txt | 27 
---
 Documentation/devicetree/bindings/net/snps,dwmac.yaml  | 45 
+
 2 files changed, 45 insertions(+), 27 deletions(-)
 delete mode 100644 
Documentation/devicetree/bindings/net/allwinner,sun7i-a20-gmac.txt

diff --git a/Documentation/devicetree/bindings/net/allwinner,sun7i-a20-gmac.txt 
b/Documentation/devicetree/bindings/net/allwinner,sun7i-a20-gmac.txt
deleted file mode 100644
index 8b3f953656e3..
--- a/Documentation/devicetree/bindings/net/allwinner,sun7i-a20-gmac.txt
+++ /dev/null
@@ -1,27 +0,0 @@
-* Allwinner GMAC ethernet controller
-
-This device is a platform glue layer for stmmac.
-Please see stmmac.txt for the other unchanged properties.
-
-Required properties:
- - compatible:  Should be "allwinner,sun7i-a20-gmac"
- - clocks: Should contain the GMAC main clock, and tx clock
-   The tx clock type should be "allwinner,sun7i-a20-gmac-clk"
- - clock-names: Should contain the clock names "stmmaceth",
-   and "allwinner_gmac_tx"
-
-Optional properties:
-- phy-supply: phandle to a regulator if the PHY needs one
-
-Examples:
-
-   gmac: ethernet@1c5 {
-   compatible = "allwinner,sun7i-a20-gmac";
-   reg = <0x01c5 0x1>,
- <0x01c20164 0x4>;
-   interrupts = <0 85 1>;
-   interrupt-names = "macirq";
-   clocks = <&ahb_gates 49>, <&gmac_tx>;
-   clock-names = "stmmaceth", "allwinner_gmac_tx";
-   phy-mode = "mii";
-   };
diff --git a/Documentation/devicetree/bindings/net/snps,dwmac.yaml 
b/Documentation/devicetree/bindings/net/snps,dwmac.yaml
index be3ada5121e1..d213c32ef153 100644
--- a/Documentation/devicetree/bindings/net/snps,dwmac.yaml
+++ b/Documentation/devicetree/bindings/net/snps,dwmac.yaml
@@ -14,6 +14,7 @@ maintainers:
 properties:
   compatible:
 oneOf:
+  - const: allwinner,sun7i-a20-gmac
   - const: snps,dwmac
   - const: snps,dwmac-3.50a
   - const: snps,dwmac-3.610
@@ -232,6 +233,7 @@ allOf:
   properties:
 compatible:
   enum:
+- allwinner,sun7i-a20-gmac
 - snps,dwxgmac
 - snps,dwxgmac-2.10
 - st,spear600-gmac
@@ -273,6 +275,37 @@ allOf:
 Enables the TSO feature otherwise it will be managed by
 MAC HW capability register. Only for GMAC4 and newer.
 
+  - if:
+  properties:
+compatible:
+  const: allwinner,sun7i-a20-gmac
+
+then:
+  properties:
+interrupts:
+  maxItems: 1
+
+interrupt-names:
+  const: macirq
+
+clocks:
+  items:
+- description: GMAC main clock
+- description: TX clock
+
+clock-names:
+  items:
+- const: stmmaceth
+- const: allwinner_gmac_tx
+
+phy-supply:
+  description:
+PHY regulator
+
+  required:
+- clocks
+- clock-names
+
 examples:
   - |
 stmmac_axi_setup: stmmac-axi-config {
@@ -337,6 +370,18 @@ examples:
 };
 };
 
+  - |
+gmac: ethernet@1c5 {
+compatible = "allwinner,sun7i-a20-gmac";
+reg = <0x01c5 0x1>,
+  <0x01c20164 0x4>;
+interrupts = <0 85 1>;
+interrupt-names = "macirq";
+clocks = <&ahb_gates 49>, <&gmac_tx>;
+clock-names = "stmmaceth", "allwinner_gmac_tx";
+phy-mode = "mii";
+};
+
 # FIXME: We should set it, but it would report all the generic
 # properties as additional properties.
 # additionalProperties: false
-- 
git-series 0.9.1

RE: [PATCH 6/8] dt-bindings: net: stmmac: Convert the binding to a schemas

2019-05-23 Thread Jose Abreu

From: Maxime Ripard 
Date: Thu, May 23, 2019 at 10:56:49

> Switch the STMMAC / Synopsys DesignWare MAC controller binding to a YAML
> schema to enable the DT validation.
> 
> Signed-off-by: Maxime Ripard 

How exactly can I see the final results of this ? Do you have any link ? 
(I'm no expert in YAML at all).

Thanks,
Jose Miguel Abreu

[PATCH bpf-next v2 0/3] tools: bpftool: add an option for debug output from libbpf and verifier

2019-05-23 Thread Quentin Monnet

Hi,
This series adds an option to bpftool to make it print additional
information via libbpf and the kernel verifier when attempting to load
programs.

A new API function is added to libbpf in order to pass the log_level from
bpftool with the bpf_object__* part of the API.

Options for a finer control over the log levels to use for libbpf and the
verifier could be added in the future, if desired.

v2:
- Do not add distinct options for libbpf and verifier logs, just keep the
  one that sets all log levels to their maximum. Rename the option.
- Do not offer a way to pick desired log levels. The choice is "use the
  option to print all logs" or "stick with the defaults".
- Do not export BPF_LOG_* flags to user header.
- Update all man pages (most bpftool operations use libbpf and may print
  libbpf logs). Verifier logs are only used when attempting to load
  programs for now, so bpftool-prog.rst and bpftool.rst remain the only
  pages updated in that regard.

Previous discussion available at:
https://lore.kernel.org/bpf/20190429095227.9745-1-quentin.mon...@netronome.com/

Quentin Monnet (3):
  tools: bpftool: add -d option to get debug output from libbpf
  libbpf: add bpf_object__load_xattr() API function to pass log_level
  tools: bpftool: make -d option print debug output from verifier

 .../bpf/bpftool/Documentation/bpftool-btf.rst |  4 +++
 .../bpftool/Documentation/bpftool-cgroup.rst  |  4 +++
 .../bpftool/Documentation/bpftool-feature.rst |  4 +++
 .../bpf/bpftool/Documentation/bpftool-map.rst |  4 +++
 .../bpf/bpftool/Documentation/bpftool-net.rst |  4 +++
 .../bpftool/Documentation/bpftool-perf.rst|  4 +++
 .../bpftool/Documentation/bpftool-prog.rst|  5 
 tools/bpf/bpftool/Documentation/bpftool.rst   |  4 +++
 tools/bpf/bpftool/bash-completion/bpftool |  2 +-
 tools/bpf/bpftool/main.c  | 16 ++-
 tools/bpf/bpftool/main.h  |  1 +
 tools/bpf/bpftool/prog.c  | 27 ---
 tools/lib/bpf/Makefile|  2 +-
 tools/lib/bpf/libbpf.c| 20 +++---
 tools/lib/bpf/libbpf.h|  6 +
 tools/lib/bpf/libbpf.map  |  5 
 16 files changed, 96 insertions(+), 16 deletions(-)

-- 
2.17.1

[PATCH bpf-next v2 1/3] tools: bpftool: add -d option to get debug output from libbpf

2019-05-23 Thread Quentin Monnet

libbpf has three levels of priority for output messages: warn, info,
debug. By default, debug output is not printed to the console.

Add a new "--debug" (short name: "-d") option to bpftool to print libbpf
logs for all three levels.

Internally, we simply use the function provided by libbpf to replace the
default printing function by one that prints logs regardless of their
level.

v2:
- Remove the possibility to select the log-levels to use (v1 offered a
  combination of "warn", "info" and "debug").
- Rename option and offer a short name: -d|--debug.
- Add option description to all bpftool manual pages (instead of
  bpftool-prog.rst only), as all commands use libbpf.

Signed-off-by: Quentin Monnet 
Reviewed-by: Jakub Kicinski 
---
 tools/bpf/bpftool/Documentation/bpftool-btf.rst|  4 
 tools/bpf/bpftool/Documentation/bpftool-cgroup.rst |  4 
 .../bpf/bpftool/Documentation/bpftool-feature.rst  |  4 
 tools/bpf/bpftool/Documentation/bpftool-map.rst|  4 
 tools/bpf/bpftool/Documentation/bpftool-net.rst|  4 
 tools/bpf/bpftool/Documentation/bpftool-perf.rst   |  4 
 tools/bpf/bpftool/Documentation/bpftool-prog.rst   |  4 
 tools/bpf/bpftool/Documentation/bpftool.rst|  3 +++
 tools/bpf/bpftool/bash-completion/bpftool  |  2 +-
 tools/bpf/bpftool/main.c   | 14 +-
 10 files changed, 45 insertions(+), 2 deletions(-)

diff --git a/tools/bpf/bpftool/Documentation/bpftool-btf.rst 
b/tools/bpf/bpftool/Documentation/bpftool-btf.rst
index 2dbc1413fabd..00668df1bf7a 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-btf.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-btf.rst
@@ -67,6 +67,10 @@ OPTIONS
-p, --pretty
  Generate human-readable JSON output. Implies **-j**.
 
+   -d, --debug
+ Print all logs available from libbpf, including debug-level
+ information.
+
 EXAMPLES
 
 **# bpftool btf dump id 1226**
diff --git a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst 
b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
index ac26876389c2..36807735e2a5 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
@@ -113,6 +113,10 @@ OPTIONS
-f, --bpffs
  Show file names of pinned programs.
 
+   -d, --debug
+ Print all logs available from libbpf, including debug-level
+ information.
+
 EXAMPLES
 
 |
diff --git a/tools/bpf/bpftool/Documentation/bpftool-feature.rst 
b/tools/bpf/bpftool/Documentation/bpftool-feature.rst
index 14180e887082..4d08f35034a2 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-feature.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-feature.rst
@@ -73,6 +73,10 @@ OPTIONS
-p, --pretty
  Generate human-readable JSON output. Implies **-j**.
 
+   -d, --debug
+ Print all logs available from libbpf, including debug-level
+ information.
+
 SEE ALSO
 
**bpf**\ (2),
diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst 
b/tools/bpf/bpftool/Documentation/bpftool-map.rst
index 13ef27b39f20..490b4501cb6e 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-map.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst
@@ -152,6 +152,10 @@ OPTIONS
  Do not automatically attempt to mount any virtual file system
  (such as tracefs or BPF virtual file system) when necessary.
 
+   -d, --debug
+ Print all logs available from libbpf, including debug-level
+ information.
+
 EXAMPLES
 
 **# bpftool map show**
diff --git a/tools/bpf/bpftool/Documentation/bpftool-net.rst 
b/tools/bpf/bpftool/Documentation/bpftool-net.rst
index 934580850f42..d8e5237a2085 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-net.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-net.rst
@@ -65,6 +65,10 @@ OPTIONS
-p, --pretty
  Generate human-readable JSON output. Implies **-j**.
 
+   -d, --debug
+ Print all logs available from libbpf, including debug-level
+ information.
+
 EXAMPLES
 
 
diff --git a/tools/bpf/bpftool/Documentation/bpftool-perf.rst 
b/tools/bpf/bpftool/Documentation/bpftool-perf.rst
index 0c7576523a21..e252bd0bc434 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-perf.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-perf.rst
@@ -53,6 +53,10 @@ OPTIONS
-p, --pretty
  Generate human-readable JSON output. Implies **-j**.
 
+   -d, --debug
+ Print all logs available from libbpf, including debug-level
+ information.
+
 EXAMPLES
 
 
diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst 
b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
index e8118544d118..9a92614569e6 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst
+++ b/

[PATCH bpf-next v2 3/3] tools: bpftool: make -d option print debug output from verifier

2019-05-23 Thread Quentin Monnet

The "-d" option is used to require all logs available for bpftool. So
far it meant telling libbpf to print even debug-level information. But
there is another source of info that can be made more verbose: when we
attemt to load programs with bpftool, we can pass a log_level parameter
to the verifier in order to control the amount of information that is
printed to the console.

Reuse the "-d" option to print all information the verifier can tell. At
this time, this means logs related to BPF_LOG_LEVEL1, BPF_LOG_LEVEL2 and
BPF_LOG_STATS. As mentioned in the discussion on the first version of
this set, these macros are internal to the kernel
(include/linux/bpf_verifier.h) and are not meant to be part of the
stable user API, therefore we simply use the related constants to print
whatever we can at this time, without trying to tell users what is
log_level1 or what is statistics.

Verifier logs are only used when loading programs for now (in the
future: for loading BTF objects with bpftool?), so bpftool.rst and
bpftool-prog.rst are the only man pages to get the update.

v2:
- Remove the possibility to select the log levels to use (v1 offered a
  combination of "log_level1", "log_level2" and "stats").
- The macros from kernel header bpf_verifier.h are not used (and
  therefore not moved to UAPI header).
- In v1 this was a distinct option, but is now merged in the only "-d"
  switch to activate libbpf and verifier debug-level logs all at the
  same time.

Signed-off-by: Quentin Monnet 
Reviewed-by: Jakub Kicinski 
---
 .../bpftool/Documentation/bpftool-prog.rst|  5 ++--
 tools/bpf/bpftool/Documentation/bpftool.rst   |  5 ++--
 tools/bpf/bpftool/main.c  |  2 ++
 tools/bpf/bpftool/main.h  |  1 +
 tools/bpf/bpftool/prog.c  | 27 ---
 5 files changed, 26 insertions(+), 14 deletions(-)

diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst 
b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
index 9a92614569e6..228a5c863cc7 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
@@ -175,8 +175,9 @@ OPTIONS
  (such as tracefs or BPF virtual file system) when necessary.
 
-d, --debug
- Print all logs available from libbpf, including debug-level
- information.
+ Print all logs available, even debug-level information. This
+ includes logs from libbpf as well as from the verifier, when
+ attempting to load programs.
 
 EXAMPLES
 
diff --git a/tools/bpf/bpftool/Documentation/bpftool.rst 
b/tools/bpf/bpftool/Documentation/bpftool.rst
index 43dba0717953..6a9c52ef84a9 100644
--- a/tools/bpf/bpftool/Documentation/bpftool.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool.rst
@@ -67,8 +67,9 @@ OPTIONS
  (such as tracefs or BPF virtual file system) when necessary.
 
-d, --debug
- Print all logs available from libbpf, including debug-level
- information.
+ Print all logs available, even debug-level information. This
+ includes logs from libbpf as well as from the verifier, when
+ attempting to load programs.
 
 SEE ALSO
 
diff --git a/tools/bpf/bpftool/main.c b/tools/bpf/bpftool/main.c
index d74293938a05..4879f6395c7e 100644
--- a/tools/bpf/bpftool/main.c
+++ b/tools/bpf/bpftool/main.c
@@ -26,6 +26,7 @@ bool pretty_output;
 bool json_output;
 bool show_pinned;
 bool block_mount;
+bool verifier_logs;
 int bpf_flags;
 struct pinned_obj_table prog_table;
 struct pinned_obj_table map_table;
@@ -373,6 +374,7 @@ int main(int argc, char **argv)
break;
case 'd':
libbpf_set_print(print_all_levels);
+   verifier_logs = true;
break;
default:
p_err("unrecognized option '%s'", argv[optind - 1]);
diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h
index 3d63feb7f852..28a2a5857e14 100644
--- a/tools/bpf/bpftool/main.h
+++ b/tools/bpf/bpftool/main.h
@@ -91,6 +91,7 @@ extern json_writer_t *json_wtr;
 extern bool json_output;
 extern bool show_pinned;
 extern bool block_mount;
+extern bool verifier_logs;
 extern int bpf_flags;
 extern struct pinned_obj_table prog_table;
 extern struct pinned_obj_table map_table;
diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index 26336bad0442..1f209c80d906 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -750,10 +750,11 @@ static int do_detach(int argc, char **argv)
 
 static int load_with_options(int argc, char **argv, bool first_prog_only)
 {
-   enum bpf_attach_type expected_attach_type;
-   struct bpf_object_open_attr attr = {
-   .prog_type  = BPF_PROG_TYPE_UNSPEC,
+   struct bpf_object_load_attr load_attr = { 0 };
+

[PATCH bpf-next v2 2/3] libbpf: add bpf_object__load_xattr() API function to pass log_level

2019-05-23 Thread Quentin Monnet

libbpf was recently made aware of the log_level attribute for programs,
used to specify the level of information expected to be dumped by the
verifier.

Create an API function to pass additional attributes when loading a
bpf_object, so we can set this log_level value in programs when loading
them, and so that so that applications relying on libbpf but not calling
bpf_prog_load_xattr() can also use that feature.

v2:
- We are in a new cycle, bump libbpf extraversion number.

Signed-off-by: Quentin Monnet 
Reviewed-by: Jakub Kicinski 
---
 tools/lib/bpf/Makefile   |  2 +-
 tools/lib/bpf/libbpf.c   | 20 +---
 tools/lib/bpf/libbpf.h   |  6 ++
 tools/lib/bpf/libbpf.map |  5 +
 4 files changed, 29 insertions(+), 4 deletions(-)

diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile
index a2aceadf68db..9312066a1ae3 100644
--- a/tools/lib/bpf/Makefile
+++ b/tools/lib/bpf/Makefile
@@ -3,7 +3,7 @@
 
 BPF_VERSION = 0
 BPF_PATCHLEVEL = 0
-BPF_EXTRAVERSION = 3
+BPF_EXTRAVERSION = 4
 
 MAKEFLAGS += --no-print-directory
 
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 197b574406b3..1c6fb7a3201e 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -,7 +,7 @@ static bool bpf_program__is_function_storage(struct 
bpf_program *prog,
 }
 
 static int
-bpf_object__load_progs(struct bpf_object *obj)
+bpf_object__load_progs(struct bpf_object *obj, int log_level)
 {
size_t i;
int err;
@@ -2230,6 +2230,7 @@ bpf_object__load_progs(struct bpf_object *obj)
for (i = 0; i < obj->nr_programs; i++) {
if (bpf_program__is_function_storage(&obj->programs[i], obj))
continue;
+   obj->programs[i].log_level = log_level;
err = bpf_program__load(&obj->programs[i],
obj->license,
obj->kern_version);
@@ -2381,10 +2382,14 @@ int bpf_object__unload(struct bpf_object *obj)
return 0;
 }
 
-int bpf_object__load(struct bpf_object *obj)
+int bpf_object__load_xattr(struct bpf_object_load_attr *attr)
 {
+   struct bpf_object *obj;
int err;
 
+   if (!attr)
+   return -EINVAL;
+   obj = attr->obj;
if (!obj)
return -EINVAL;
 
@@ -2397,7 +2402,7 @@ int bpf_object__load(struct bpf_object *obj)
 
CHECK_ERR(bpf_object__create_maps(obj), err, out);
CHECK_ERR(bpf_object__relocate(obj), err, out);
-   CHECK_ERR(bpf_object__load_progs(obj), err, out);
+   CHECK_ERR(bpf_object__load_progs(obj, attr->log_level), err, out);
 
return 0;
 out:
@@ -2406,6 +2411,15 @@ int bpf_object__load(struct bpf_object *obj)
return err;
 }
 
+int bpf_object__load(struct bpf_object *obj)
+{
+   struct bpf_object_load_attr attr = {
+   .obj = obj,
+   };
+
+   return bpf_object__load_xattr(&attr);
+}
+
 static int check_path(const char *path)
 {
char *cp, errmsg[STRERR_BUFSIZE];
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index c5ff00515ce7..e1c748db44f6 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -89,8 +89,14 @@ LIBBPF_API int bpf_object__unpin_programs(struct bpf_object 
*obj,
 LIBBPF_API int bpf_object__pin(struct bpf_object *object, const char *path);
 LIBBPF_API void bpf_object__close(struct bpf_object *object);
 
+struct bpf_object_load_attr {
+   struct bpf_object *obj;
+   int log_level;
+};
+
 /* Load/unload object into/from kernel */
 LIBBPF_API int bpf_object__load(struct bpf_object *obj);
+LIBBPF_API int bpf_object__load_xattr(struct bpf_object_load_attr *attr);
 LIBBPF_API int bpf_object__unload(struct bpf_object *obj);
 LIBBPF_API const char *bpf_object__name(struct bpf_object *obj);
 LIBBPF_API unsigned int bpf_object__kversion(struct bpf_object *obj);
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index 673001787cba..6ce61fa0baf3 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -164,3 +164,8 @@ LIBBPF_0.0.3 {
bpf_map_freeze;
btf__finalize_data;
 } LIBBPF_0.0.2;
+
+LIBBPF_0.0.4 {
+   global:
+   bpf_object__load_xattr;
+} LIBBPF_0.0.3;
-- 
2.17.1

[PATCH bpf-next 1/3] xdp: Add bulk XDP_TX queue

2019-05-23 Thread Toshiaki Makita

XDP_TX is similar to XDP_REDIRECT as it essentially redirects packets to
the device itself. XDP_REDIRECT has bulk transmit mechanism to avoid the
heavy cost of indirect call but it also reduces lock acquisition on the
destination device that needs locks like veth and tun.

XDP_TX does not use indirect calls but drivers which require locks can
benefit from the bulk transmit for XDP_TX as well.

This patch adds per-cpu queues which can be used for bulk transmit on
XDP_TX. I did not add functions like enqueue/flush but exposed the queue
directly because we should avoid indirect calls on XDP_TX.

Note that the queue must be flushed, i.e. "count" member needs to be
set to 0, when a NAPI handler which used this queue exits. Otherwise
packets left in the queue will be transmitted from totally unintentional
devices.

Signed-off-by: Toshiaki Makita 
---
 include/net/xdp.h | 7 +++
 net/core/xdp.c| 3 +++
 2 files changed, 10 insertions(+)

diff --git a/include/net/xdp.h b/include/net/xdp.h
index 0f25b36..30b36c8 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -84,6 +84,13 @@ struct xdp_frame {
struct net_device *dev_rx; /* used by cpumap */
 };
 
+#define XDP_TX_BULK_SIZE   16
+struct xdp_tx_bulk_queue {
+   struct xdp_frame *q[XDP_TX_BULK_SIZE];
+   unsigned int count;
+};
+DECLARE_PER_CPU(struct xdp_tx_bulk_queue, xdp_tx_bq);
+
 /* Clear kernel pointers in xdp_frame */
 static inline void xdp_scrub_frame(struct xdp_frame *frame)
 {
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 4b2b194..0622f2d 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -40,6 +40,9 @@ struct xdp_mem_allocator {
struct rcu_head rcu;
 };
 
+DEFINE_PER_CPU(struct xdp_tx_bulk_queue, xdp_tx_bq);
+EXPORT_PER_CPU_SYMBOL_GPL(xdp_tx_bq);
+
 static u32 xdp_mem_id_hashfn(const void *data, u32 len, u32 seed)
 {
const u32 *k = data;
-- 
1.8.3.1

[PATCH bpf-next 0/3] veth: Bulk XDP_TX

2019-05-23 Thread Toshiaki Makita

This adds an infrastructure for bulk XDP_TX and makes veth use it.
Improves XDP_TX performance by approximately 8%. The detailed
performance numbers are shown in patch 3.

Signed-off-by: Toshiaki Makita 

Toshiaki Makita (3):
  xdp: Add bulk XDP_TX queue
  xdp: Add tracepoint for bulk XDP_TX
  veth: Support bulk XDP_TX

 drivers/net/veth.c | 26 +-
 include/net/xdp.h  |  7 +++
 include/trace/events/xdp.h | 25 +
 kernel/bpf/core.c  |  1 +
 net/core/xdp.c |  3 +++
 5 files changed, 61 insertions(+), 1 deletion(-)

-- 
1.8.3.1

[PATCH bpf-next 2/3] xdp: Add tracepoint for bulk XDP_TX

2019-05-23 Thread Toshiaki Makita

This is introduced for admins to check what is happening on XDP_TX when
bulk XDP_TX is in use.

Signed-off-by: Toshiaki Makita 
---
 include/trace/events/xdp.h | 25 +
 kernel/bpf/core.c  |  1 +
 2 files changed, 26 insertions(+)

diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
index e95cb86..e06ea65 100644
--- a/include/trace/events/xdp.h
+++ b/include/trace/events/xdp.h
@@ -50,6 +50,31 @@
  __entry->ifindex)
 );
 
+TRACE_EVENT(xdp_bulk_tx,
+
+   TP_PROTO(const struct net_device *dev,
+int sent, int drops, int err),
+
+   TP_ARGS(dev, sent, drops, err),
+
+   TP_STRUCT__entry(
+   __field(int, ifindex)
+   __field(int, drops)
+   __field(int, sent)
+   __field(int, err)
+   ),
+
+   TP_fast_assign(
+   __entry->ifindex= dev->ifindex;
+   __entry->drops  = drops;
+   __entry->sent   = sent;
+   __entry->err= err;
+   ),
+
+   TP_printk("ifindex=%d sent=%d drops=%d err=%d",
+ __entry->ifindex, __entry->sent, __entry->drops, __entry->err)
+);
+
 DECLARE_EVENT_CLASS(xdp_redirect_template,
 
TP_PROTO(const struct net_device *dev,
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 242a643..7687488 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2108,3 +2108,4 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int 
offset, void *to,
 #include 
 
 EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception);
+EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_bulk_tx);
-- 
1.8.3.1

[PATCH bpf-next 3/3] veth: Support bulk XDP_TX

2019-05-23 Thread Toshiaki Makita

This improves XDP_TX performance by about 8%.

Here are single core XDP_TX test results. CPU consumptions are taken
from "perf report --no-child".

- Before:

  7.26 Mpps

  _raw_spin_lock  7.83%
  veth_xdp_xmit  12.23%

- After:

  7.84 Mpps

  _raw_spin_lock  1.17%
  veth_xdp_xmit   6.45%

Signed-off-by: Toshiaki Makita 
---
 drivers/net/veth.c | 26 +-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index 52110e5..4edc75f 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -442,6 +442,23 @@ static int veth_xdp_xmit(struct net_device *dev, int n,
return ret;
 }
 
+static void veth_xdp_flush_bq(struct net_device *dev)
+{
+   struct xdp_tx_bulk_queue *bq = this_cpu_ptr(&xdp_tx_bq);
+   int sent, i, err = 0;
+
+   sent = veth_xdp_xmit(dev, bq->count, bq->q, 0);
+   if (sent < 0) {
+   err = sent;
+   sent = 0;
+   for (i = 0; i < bq->count; i++)
+   xdp_return_frame(bq->q[i]);
+   }
+   trace_xdp_bulk_tx(dev, sent, bq->count - sent, err);
+
+   bq->count = 0;
+}
+
 static void veth_xdp_flush(struct net_device *dev)
 {
struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
@@ -449,6 +466,7 @@ static void veth_xdp_flush(struct net_device *dev)
struct veth_rq *rq;
 
rcu_read_lock();
+   veth_xdp_flush_bq(dev);
rcv = rcu_dereference(priv->peer);
if (unlikely(!rcv))
goto out;
@@ -466,12 +484,18 @@ static void veth_xdp_flush(struct net_device *dev)
 
 static int veth_xdp_tx(struct net_device *dev, struct xdp_buff *xdp)
 {
+   struct xdp_tx_bulk_queue *bq = this_cpu_ptr(&xdp_tx_bq);
struct xdp_frame *frame = convert_to_xdp_frame(xdp);
 
if (unlikely(!frame))
return -EOVERFLOW;
 
-   return veth_xdp_xmit(dev, 1, &frame, 0);
+   if (unlikely(bq->count == XDP_TX_BULK_SIZE))
+   veth_xdp_flush_bq(dev);
+
+   bq->q[bq->count++] = frame;
+
+   return 0;
 }
 
 static struct sk_buff *veth_xdp_rcv_one(struct veth_rq *rq,
-- 
1.8.3.1

Re: [PATCH 6/8] dt-bindings: net: stmmac: Convert the binding to a schemas

2019-05-23 Thread Maxime Ripard

Hi!

On Thu, May 23, 2019 at 10:11:39AM +, Jose Abreu wrote:
> From: Maxime Ripard 
> Date: Thu, May 23, 2019 at 10:56:49
>
> > Switch the STMMAC / Synopsys DesignWare MAC controller binding to a YAML
> > schema to enable the DT validation.
> >
> > Signed-off-by: Maxime Ripard 
>
> How exactly can I see the final results of this ? Do you have any link ?
> (I'm no expert in YAML at all).

You need some extra tooling, that you can find here:
https://github.com/devicetree-org/dt-schema

You can then run make dtbs_check, and those YAML files will be used to
validate that any devicetree using those properties are doing it
properly. That implies having the right node names, properties, types,
ranges of values when relevant, and so on.

Maxime

--
Maxime Ripard, Bootlin
Embedded Linux and Kernel engineering
https://bootlin.com

Re: [PATCH bpf-next 1/3] xdp: Add bulk XDP_TX queue

2019-05-23 Thread Toke Høiland-Jørgensen

Toshiaki Makita  writes:

> XDP_TX is similar to XDP_REDIRECT as it essentially redirects packets to
> the device itself. XDP_REDIRECT has bulk transmit mechanism to avoid the
> heavy cost of indirect call but it also reduces lock acquisition on the
> destination device that needs locks like veth and tun.
>
> XDP_TX does not use indirect calls but drivers which require locks can
> benefit from the bulk transmit for XDP_TX as well.

XDP_TX happens on the same device, so there's an implicit bulking
happening because of the NAPI cycle. So why is an additional mechanism
needed (in the general case)?

-Toke

RE: [PATCH 6/8] dt-bindings: net: stmmac: Convert the binding to a schemas

2019-05-23 Thread Jose Abreu

From: Maxime Ripard 
Date: Thu, May 23, 2019 at 12:07:15

> You can then run make dtbs_check, and those YAML files will be used to
> validate that any devicetree using those properties are doing it
> properly. That implies having the right node names, properties, types,
> ranges of values when relevant, and so on.

Thanks but how can one that's developing know which bindings it shall use 
? Is this not parsed/prettified and displayed in some kind of webpage ?

Just that now that the TXT is gone its kind of "strange" to look at YAML 
instead of plain text and develop/use the bindings.

Thanks,
Jose Miguel Abreu

Re: [PATCH bpf-next 1/3] xdp: Add bulk XDP_TX queue

2019-05-23 Thread Toshiaki Makita

On 2019/05/23 20:11, Toke Høiland-Jørgensen wrote:
> Toshiaki Makita  writes:
> 
>> XDP_TX is similar to XDP_REDIRECT as it essentially redirects packets to
>> the device itself. XDP_REDIRECT has bulk transmit mechanism to avoid the
>> heavy cost of indirect call but it also reduces lock acquisition on the
>> destination device that needs locks like veth and tun.
>>
>> XDP_TX does not use indirect calls but drivers which require locks can
>> benefit from the bulk transmit for XDP_TX as well.
> 
> XDP_TX happens on the same device, so there's an implicit bulking
> happening because of the NAPI cycle. So why is an additional mechanism
> needed (in the general case)?

Not sure what the implicit bulking you mention is. XDP_TX calls
.ndo_xdp_xmit() for each packet, and it acquires a lock in veth and tun.
To avoid this, we need additional storage for bulking like devmap for
XDP_REDIRECT.

-- 
Toshiaki Makita

Re: [PATCH bpf-next 3/3] veth: Support bulk XDP_TX

2019-05-23 Thread Toke Høiland-Jørgensen

Toshiaki Makita  writes:

> This improves XDP_TX performance by about 8%.
>
> Here are single core XDP_TX test results. CPU consumptions are taken
> from "perf report --no-child".
>
> - Before:
>
>   7.26 Mpps
>
>   _raw_spin_lock  7.83%
>   veth_xdp_xmit  12.23%
>
> - After:
>
>   7.84 Mpps
>
>   _raw_spin_lock  1.17%
>   veth_xdp_xmit   6.45%
>
> Signed-off-by: Toshiaki Makita 
> ---
>  drivers/net/veth.c | 26 +-
>  1 file changed, 25 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/net/veth.c b/drivers/net/veth.c
> index 52110e5..4edc75f 100644
> --- a/drivers/net/veth.c
> +++ b/drivers/net/veth.c
> @@ -442,6 +442,23 @@ static int veth_xdp_xmit(struct net_device *dev, int n,
>   return ret;
>  }
>  
> +static void veth_xdp_flush_bq(struct net_device *dev)
> +{
> + struct xdp_tx_bulk_queue *bq = this_cpu_ptr(&xdp_tx_bq);
> + int sent, i, err = 0;
> +
> + sent = veth_xdp_xmit(dev, bq->count, bq->q, 0);

Wait, veth_xdp_xmit() is just putting frames on a pointer ring. So
you're introducing an additional per-cpu bulk queue, only to avoid lock
contention around the existing pointer ring. But the pointer ring is
per-rq, so if you have lock contention, this means you must have
multiple CPUs servicing the same rq, no? So why not just fix that
instead?

-Toke

Re: [PATCH bpf-next 1/3] xdp: Add bulk XDP_TX queue

2019-05-23 Thread Toke Høiland-Jørgensen

Toshiaki Makita  writes:

> On 2019/05/23 20:11, Toke Høiland-Jørgensen wrote:
>> Toshiaki Makita  writes:
>> 
>>> XDP_TX is similar to XDP_REDIRECT as it essentially redirects packets to
>>> the device itself. XDP_REDIRECT has bulk transmit mechanism to avoid the
>>> heavy cost of indirect call but it also reduces lock acquisition on the
>>> destination device that needs locks like veth and tun.
>>>
>>> XDP_TX does not use indirect calls but drivers which require locks can
>>> benefit from the bulk transmit for XDP_TX as well.
>> 
>> XDP_TX happens on the same device, so there's an implicit bulking
>> happening because of the NAPI cycle. So why is an additional mechanism
>> needed (in the general case)?
>
> Not sure what the implicit bulking you mention is. XDP_TX calls
> .ndo_xdp_xmit() for each packet, and it acquires a lock in veth and
> tun. To avoid this, we need additional storage for bulking like devmap
> for XDP_REDIRECT.

The bulking is in veth_poll(), where veth_xdp_flush() is only called at
the end. But see my other reply to the veth.c patch for the lock
contention issue...

-Toke

Re: [PATCH bpf-next 3/3] veth: Support bulk XDP_TX

2019-05-23 Thread Toshiaki Makita

On 2019/05/23 20:25, Toke Høiland-Jørgensen wrote:
> Toshiaki Makita  writes:
> 
>> This improves XDP_TX performance by about 8%.
>>
>> Here are single core XDP_TX test results. CPU consumptions are taken
>> from "perf report --no-child".
>>
>> - Before:
>>
>>   7.26 Mpps
>>
>>   _raw_spin_lock  7.83%
>>   veth_xdp_xmit  12.23%
>>
>> - After:
>>
>>   7.84 Mpps
>>
>>   _raw_spin_lock  1.17%
>>   veth_xdp_xmit   6.45%
>>
>> Signed-off-by: Toshiaki Makita 
>> ---
>>  drivers/net/veth.c | 26 +-
>>  1 file changed, 25 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/net/veth.c b/drivers/net/veth.c
>> index 52110e5..4edc75f 100644
>> --- a/drivers/net/veth.c
>> +++ b/drivers/net/veth.c
>> @@ -442,6 +442,23 @@ static int veth_xdp_xmit(struct net_device *dev, int n,
>>  return ret;
>>  }
>>  
>> +static void veth_xdp_flush_bq(struct net_device *dev)
>> +{
>> +struct xdp_tx_bulk_queue *bq = this_cpu_ptr(&xdp_tx_bq);
>> +int sent, i, err = 0;
>> +
>> +sent = veth_xdp_xmit(dev, bq->count, bq->q, 0);
> 
> Wait, veth_xdp_xmit() is just putting frames on a pointer ring. So
> you're introducing an additional per-cpu bulk queue, only to avoid lock
> contention around the existing pointer ring. But the pointer ring is
> per-rq, so if you have lock contention, this means you must have
> multiple CPUs servicing the same rq, no?

Yes, it's possible. Not recommended though.

> So why not just fix that
> instead?

The queues are shared with packets from stack sent from peer. That's
because I needed the lock. I have tried to separate the queues, one for
redirect and one for stack, but receiver side got too complicated and it
ended up with worse performance.

-- 
Toshiaki Makita

RE: [RFC PATCH net-next 5/9] net: phylink: Add phylink_create_raw

2019-05-23 Thread Ioana Ciornei


> Subject: Re: [RFC PATCH net-next 5/9] net: phylink: Add phylink_create_raw
> 
> 
> 
> On 5/22/2019 7:25 PM, Florian Fainelli wrote:
> >
> >
> > On 5/22/2019 6:20 PM, Ioana Ciornei wrote:
> >> This adds a new entry point to PHYLINK that does not require a
> >> net_device structure.
> >>
> >> The main intended use are DSA ports that do not have net devices
> >> registered for them (mainly because doing so would be redundant - see
> >> Documentation/networking/dsa/dsa.rst for details). So far DSA has
> >> been using PHYLIB fixed PHYs for these ports, driven manually with
> >> genphy instead of starting a full PHY state machine, but this does
> >> not scale well when there are actual PHYs that need a driver on those
> >> ports, or when a fixed-link is requested in DT that has a speed
> >> unsupported by the fixed PHY C22 emulation (such as SGMII-2500).
> >>
> >> The proposed solution comes in the form of a notifier chain owned by
> >> the PHYLINK instance, and the passing of phylink_notifier_info
> >> structures back to the driver through a blocking notifier call.
> >>
> >> The event API exposed by the new notifier mechanism is a 1:1 mapping
> >> to the existing PHYLINK mac_ops, plus the PHYLINK fixed-link callback.
> >>
> >> Both the standard phylink_create() function, as well as its raw
> >> variant, call the same underlying function which initializes either
> >> the netdev field or the notifier block of the PHYLINK instance.
> >>
> >> All PHYLINK driver callbacks have been extended to call the notifier
> >> chain in case the instance is a raw one.
> >>
> >> Signed-off-by: Ioana Ciornei 
> >> Signed-off-by: Vladimir Oltean 
> >> ---
> >
> > [snip]
> >
> >> +  struct phylink_notifier_info info = {
> >> +  .link_an_mode = pl->link_an_mode,
> >> +  /* Discard const pointer */
> >> +  .state = (struct phylink_link_state *)state,
> >> +  };
> >> +
> >>netdev_dbg(pl->netdev,
> >>   "%s: mode=%s/%s/%s/%s adv=%*pb pause=%02x link=%u
> an=%u\n",
> >>   __func__, phylink_an_mode_str(pl->link_an_mode),
> >> @@ -299,7 +317,12 @@ static void phylink_mac_config(struct phylink *pl,
> >>   __ETHTOOL_LINK_MODE_MASK_NBITS, state->advertising,
> >>   state->pause, state->link, state->an_enabled);
> >
> > Don't you need to guard that netdev_dbg() with an if (pl->ops) to
> > avoid de-referencing a NULL net_device?
> >


The netdev_* print will not dereference a NULL net_device since it has explicit 
checks agains this.
Instead it will just print (net/core/dev.c, __netdev_printk):

printk("%s(NULL net_device): %pV", level, vaf);


> > Another possibility could be to change the signature of the
> > phylink_mac_ops to take an opaque pointer and in the case where we
> > called phylink_create() and passed down a net_device pointer, we
> > somehow remember that for doing any operation that requires a
> > net_device (printing, setting carrier). We lose strict typing in doing
> > that, but we'd have fewer places to patch for a blocking notifier call.
> >
> 
> Or even make those functions part of phylink_mac_ops such that the caller
> could pass an .carrier_ok callback which is netif_carrier_ok() for a 
> net_device,
> else it's NULL, same with printing functions if desired...
> --
> Florian


Let me see if I understood this correctly. I presume that any API that we add 
should not break any current PHYLINK users.

You suggest to change the prototype of the phylink_mac_ops from

void (*validate)(struct net_device *ndev, unsigned long *supported,
 struct phylink_link_state *state);

to something that takes a void pointer:

void (*validate)(void *dev, unsigned long *supported,
 struct phylink_link_state *state);

This would imply that the any function in PHYLINK would have to somehow 
differentiate if the dev provided is indeed a net_device or another structure 
in order to make the decision if netif_carrier_off should be called or not 
(this is so we do not break any drivers using PHYLINK). I cannot see how this 
judgement can be made.

--
Ioana

Re: [PATCH bpf-next 3/3] veth: Support bulk XDP_TX

2019-05-23 Thread Toke Høiland-Jørgensen

Toshiaki Makita  writes:

> On 2019/05/23 20:25, Toke Høiland-Jørgensen wrote:
>> Toshiaki Makita  writes:
>> 
>>> This improves XDP_TX performance by about 8%.
>>>
>>> Here are single core XDP_TX test results. CPU consumptions are taken
>>> from "perf report --no-child".
>>>
>>> - Before:
>>>
>>>   7.26 Mpps
>>>
>>>   _raw_spin_lock  7.83%
>>>   veth_xdp_xmit  12.23%
>>>
>>> - After:
>>>
>>>   7.84 Mpps
>>>
>>>   _raw_spin_lock  1.17%
>>>   veth_xdp_xmit   6.45%
>>>
>>> Signed-off-by: Toshiaki Makita 
>>> ---
>>>  drivers/net/veth.c | 26 +-
>>>  1 file changed, 25 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/net/veth.c b/drivers/net/veth.c
>>> index 52110e5..4edc75f 100644
>>> --- a/drivers/net/veth.c
>>> +++ b/drivers/net/veth.c
>>> @@ -442,6 +442,23 @@ static int veth_xdp_xmit(struct net_device *dev, int n,
>>> return ret;
>>>  }
>>>  
>>> +static void veth_xdp_flush_bq(struct net_device *dev)
>>> +{
>>> +   struct xdp_tx_bulk_queue *bq = this_cpu_ptr(&xdp_tx_bq);
>>> +   int sent, i, err = 0;
>>> +
>>> +   sent = veth_xdp_xmit(dev, bq->count, bq->q, 0);
>> 
>> Wait, veth_xdp_xmit() is just putting frames on a pointer ring. So
>> you're introducing an additional per-cpu bulk queue, only to avoid lock
>> contention around the existing pointer ring. But the pointer ring is
>> per-rq, so if you have lock contention, this means you must have
>> multiple CPUs servicing the same rq, no?
>
> Yes, it's possible. Not recommended though.
>
>> So why not just fix that instead?
>
> The queues are shared with packets from stack sent from peer. That's
> because I needed the lock. I have tried to separate the queues, one for
> redirect and one for stack, but receiver side got too complicated and it
> ended up with worse performance.

I meant fix it with configuration. Now many receive queues are you
running on the veth device in your benchmarks, and how have you
configured the RPS?

-Toke

Re: DSA setup IMX6ULL and Marvell 88E6390 with 2 Ethernet Phys - CPU Port is not working

2019-05-23 Thread Andrew Lunn

> thanks for your reply. You're right, both PHYs are 10/100.
> 
> I already added a fixed-link like this:
> 
>   port@0 {
>   reg = <0>;
>   label = "cpu";
>   ethernet = <&fec1>;
>   phy-mode = "rmii";
>   phy-handle = <&switch0phy0>;
> fixed-link {
> speed = <100>;
> full-duplex;
> };
>   };
> 
> I hope you mean that with fixed-phy? But this doesn't changed anything.

You probably have multiple issues, and it is not going to work until
you have them all solved.

You can get access to the registers etc, using patches from:

https://github.com/vivien/linux.git dsa/debugfs

I've only seen the external MDIO bus on the 6390 used for C45 PHYs. So
there is a chance the driver code for C22 is broken?

  Andrew

Re: [PATCH 1/8] dt-bindings: net: Add YAML schemas for the generic Ethernet options

2019-05-23 Thread Rob Herring

On Thu, May 23, 2019 at 4:57 AM Maxime Ripard  wrote:
>
> The Ethernet controllers have a good number of generic options that can be
> needed in a device tree. Add a YAML schemas for those.
>
> Signed-off-by: Maxime Ripard 
> ---
>  Documentation/devicetree/bindings/net/ethernet-controller.yaml | 197 +++-
>  Documentation/devicetree/bindings/net/ethernet.txt |  68 +--
>  Documentation/devicetree/bindings/net/fixed-link.txt   |  55 +--
>  3 files changed, 199 insertions(+), 121 deletions(-)
>  create mode 100644 
> Documentation/devicetree/bindings/net/ethernet-controller.yaml
>
> diff --git a/Documentation/devicetree/bindings/net/ethernet-controller.yaml 
> b/Documentation/devicetree/bindings/net/ethernet-controller.yaml
> new file mode 100644
> index ..1c6e9e755481
> --- /dev/null
> +++ b/Documentation/devicetree/bindings/net/ethernet-controller.yaml
> @@ -0,0 +1,197 @@
> +# SPDX-License-Identifier: GPL-2.0
> +%YAML 1.2
> +---
> +$id: http://devicetree.org/schemas/net/ethernet-controller.yaml#
> +$schema: http://devicetree.org/meta-schemas/core.yaml#
> +
> +title: Ethernet Controller Generic Binding
> +
> +maintainers:
> +  - David S. Miller 
> +
> +properties:
> +  $nodename:
> +pattern: "^ethernet(@.*)?$"
> +
> +  local-mac-address:
> +allOf:
> +  - $ref: /schemas/types.yaml#definitions/uint8-array
> +  - minItems: 6
> +maxItems: 6
> +description:
> +  Specifies the MAC address that was assigned to the network device.
> +
> +  mac-address:
> +allOf:
> +  - $ref: /schemas/types.yaml#definitions/uint8-array
> +  - minItems: 6
> +maxItems: 6
> +description:
> +  Specifies the MAC address that was last used by the boot
> +  program; should be used in cases where the MAC address assigned
> +  to the device by the boot program is different from the
> +  local-mac-address property.
> +
> +  max-frame-size:
> +$ref: /schemas/types.yaml#definitions/uint32
> +description:
> +  Maximum transfer unit (IEEE defined MTU), rather than the
> +  maximum frame size (there\'s contradiction in the Devicetree
> +  Specification).
> +
> +  max-speed:
> +$ref: /schemas/types.yaml#definitions/uint32
> +description:
> +  Specifies maximum speed in Mbit/s supported by the device.
> +
> +  nvmem-cells:
> +maxItems: 1
> +description:
> +  Reference to an nvmem node for the MAC address
> +
> +  nvmem-cells-names:
> +const: mac-address
> +
> +  phy-connection-type:
> +description:
> +  Operation mode of the PHY interface
> +oneOf:

While yes, this lets us have descriptions, oneOf makes for poor error
messages. I'd just make the descriptions comments instead.

> +  - const: internal
> +description:
> +  there is not a standard bus between the MAC and the PHY,
> +  something proprietary is being used to embed the PHY in the
> +  MAC.
> +  - const: mii
> +  - const: gmii
> +  - const: sgmii
> +  - const: qsgmii
> +  - const: tbi
> +  - const: rev-mii
> +  - const: rmii
> +  - const: rgmii
> +description:
> +  RX and TX delays are added by the MAC when required
> +  - const: rgmii-id
> +description:
> +  RGMII with internal RX and TX delays provided by the PHY,
> +  the MAC should not add the RX or TX delays in this case
> +  - const: rgmii-rxid
> +description:
> +  RGMII with internal RX delay provided by the PHY, the MAC
> +  should not add an RX delay in this case
> +  - const: rgmii-txid
> +description:
> +  RGMII with internal TX delay provided by the PHY, the MAC
> +  should not add an TX delay in this case
> +  - const: rtbi
> +  - const: smii
> +  - const: xgmii
> +  - const: trgmii
> +  - const: 1000base-x
> +  - const: 2500base-x
> +  - const: rxaui
> +  - const: xaui
> +  - const: 10gbase-kr
> +description:
> +  10GBASE-KR, XFI, SFI
> +
> +  phy-mode:
> +$ref: "#/properties/phy-connection-type"
> +description:
> +  Deprecated in favor of phy-connection-type
> +
> +  phy-handle:
> +$ref: /schemas/types.yaml#definitions/phandle
> +description:
> +  Specifies a reference to a node representing a PHY device.
> +
> +  phy:
> +$ref: "#/properties/phy-handle"
> +description:
> +  Deprecated in favor of phy-handle
> +
> +  phy-device:
> +$ref: "#/properties/phy-handle"
> +description:
> +  Deprecated in favor of phy-handle
> +
> +  rx-fifo-depth:
> +$ref: /schemas/types.yaml#definitions/uint32
> +description:
> +  The size of the controller\'s receive fifo in bytes. This is used
> +  for components that can have configurable receive fifo sizes,
> +  and is useful for determining certain configuration settings
> +  such as flow control thresholds.
> +
> +  tx-fifo-depth:
> +$ref:

Re: [PATCH bpf-next 2/3] xdp: Add tracepoint for bulk XDP_TX

2019-05-23 Thread Jesper Dangaard Brouer

On Thu, 23 May 2019 19:56:47 +0900
Toshiaki Makita  wrote:

> This is introduced for admins to check what is happening on XDP_TX when
> bulk XDP_TX is in use.
> 
> Signed-off-by: Toshiaki Makita 
> ---
>  include/trace/events/xdp.h | 25 +
>  kernel/bpf/core.c  |  1 +
>  2 files changed, 26 insertions(+)
> 
> diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
> index e95cb86..e06ea65 100644
> --- a/include/trace/events/xdp.h
> +++ b/include/trace/events/xdp.h
> @@ -50,6 +50,31 @@
> __entry->ifindex)
>  );
>  
> +TRACE_EVENT(xdp_bulk_tx,
> +

You are using this tracepoint like/instead of trace_xdp_devmap_xmit if
I understand correctly?  Or maybe the trace_xdp_redirect tracepoint.

The point is that is will be good if the tracepoints can share the
TP_STRUCT layout beginning, as it allows for attaching and reusing eBPF
code that is only interested in the top part of the struct.

I would also want to see some identifier, that trace programs can use
to group and corrolate these events, you do have ifindex, but most
other XDP tracepoints also have "prog_id".

> + TP_PROTO(const struct net_device *dev,
> +  int sent, int drops, int err),
> +
> + TP_ARGS(dev, sent, drops, err),
> +
> + TP_STRUCT__entry(
> + __field(int, ifindex)
> + __field(int, drops)
> + __field(int, sent)
> + __field(int, err)
> + ),

The xdp_redirect_template have:

TP_STRUCT__entry(
__field(int, prog_id)
__field(u32, act)
__field(int, ifindex)
__field(int, err)
__field(int, to_ifindex)
__field(u32, map_id)
__field(int, map_index)
),

And e.g. TRACE_EVENT xdp_exception have:

TP_STRUCT__entry(
__field(int, prog_id)
__field(u32, act)
__field(int, ifindex)
),


> +
> + TP_fast_assign(
> + __entry->ifindex= dev->ifindex;
> + __entry->drops  = drops;
> + __entry->sent   = sent;
> + __entry->err= err;
> + ),
> +
> + TP_printk("ifindex=%d sent=%d drops=%d err=%d",
> +   __entry->ifindex, __entry->sent, __entry->drops, __entry->err)
> +);
> +
>  DECLARE_EVENT_CLASS(xdp_redirect_template,
>  
>   TP_PROTO(const struct net_device *dev,
> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> index 242a643..7687488 100644
> --- a/kernel/bpf/core.c
> +++ b/kernel/bpf/core.c
> @@ -2108,3 +2108,4 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int 
> offset, void *to,
>  #include 
>  
>  EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception);
> +EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_bulk_tx);



-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

Re: [PATCH v3 net-next 0/3] flow_offload: Re-add per-action statistics

2019-05-23 Thread Jamal Hadi Salim


On 2019-05-22 6:20 p.m., Jakub Kicinski wrote:

On Wed, 22 May 2019 22:37:16 +0100, Edward Cree wrote:

* removed RFC tags


Why?  There is still no upstream user for this (my previous
objections of this being only partially correct aside).



IIRC your point was to get the dumping to work with RTM_GETACTION.

That would still work here, no? There will be some latency
based on the frequency of hardware->kernel stats updates.

cheers,
jamal

[PATCH nf-next] netfilter: add support for matching IPv4 options

2019-05-23 Thread Stephen Suryaputra

This is the kernel change for the overall changes with this description:
Add capability to have rules matching IPv4 options. This is developed
mainly to support dropping of IP packets with loose and/or strict source
route route options. Nevertheless, the implementation include others and
ability to get specific fields in the option.

Signed-off-by: Stephen Suryaputra 
---
 include/net/inet_sock.h  |   2 +-
 include/uapi/linux/netfilter/nf_tables.h |   2 +
 net/ipv4/ip_options.c|   2 +
 net/netfilter/nft_exthdr.c   | 136 +++
 4 files changed, 141 insertions(+), 1 deletion(-)

diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index e8eef85006aa..8db4f8639a33 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -55,7 +55,7 @@ struct ip_options {
ts_needaddr:1;
unsigned char   router_alert;
unsigned char   cipso;
-   unsigned char   __pad2;
+   unsigned char   end;
unsigned char   __data[0];
 };
 
diff --git a/include/uapi/linux/netfilter/nf_tables.h 
b/include/uapi/linux/netfilter/nf_tables.h
index 061bb3eb20c3..81d31f4e4aa3 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -730,10 +730,12 @@ enum nft_exthdr_flags {
  *
  * @NFT_EXTHDR_OP_IPV6: match against ipv6 extension headers
  * @NFT_EXTHDR_OP_TCP: match against tcp options
+ * @NFT_EXTHDR_OP_IPV4: match against ipv4 options
  */
 enum nft_exthdr_op {
NFT_EXTHDR_OP_IPV6,
NFT_EXTHDR_OP_TCPOPT,
+   NFT_EXTHDR_OP_IPV4,
__NFT_EXTHDR_OP_MAX
 };
 #define NFT_EXTHDR_OP_MAX  (__NFT_EXTHDR_OP_MAX - 1)
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index 3db31bb9df50..fc0e694aa97c 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -272,6 +272,7 @@ int __ip_options_compile(struct net *net,
for (l = opt->optlen; l > 0; ) {
switch (*optptr) {
case IPOPT_END:
+   opt->end = optptr - iph;
for (optptr++, l--; l > 0; optptr++, l--) {
if (*optptr != IPOPT_END) {
*optptr = IPOPT_END;
@@ -473,6 +474,7 @@ int __ip_options_compile(struct net *net,
*info = htonl((pp_ptr-iph)<<24);
return -EINVAL;
 }
+EXPORT_SYMBOL(__ip_options_compile);
 
 int ip_options_compile(struct net *net,
   struct ip_options *opt, struct sk_buff *skb)
diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c
index a940c9fd9045..c4d47d274bbe 100644
--- a/net/netfilter/nft_exthdr.c
+++ b/net/netfilter/nft_exthdr.c
@@ -62,6 +62,128 @@ static void nft_exthdr_ipv6_eval(const struct nft_expr 
*expr,
regs->verdict.code = NFT_BREAK;
 }
 
+/* find the offset to specified option or the header beyond the options
+ * if target < 0.
+ *
+ * Note that *offset is used as input/output parameter, and if it is not zero,
+ * then it must be a valid offset to an inner IPv4 header. This can be used
+ * to explore inner IPv4 header, eg. ICMP error messages.
+ *
+ * If target header is found, its offset is set in *offset and return option
+ * number. Otherwise, return negative error.
+ *
+ * If the first fragment doesn't contain the End of Options it is considered
+ * invalid.
+ */
+static int ipv4_find_option(struct net *net, struct sk_buff *skb, unsigned int 
*offset,
+int target, unsigned short *fragoff, int *flags)
+{
+   unsigned char optbuf[sizeof(struct ip_options) + 41];
+   struct ip_options *opt = (struct ip_options *)optbuf;
+   struct iphdr *iph, _iph;
+   unsigned int start;
+   __be32 info;
+   int optlen;
+   bool found = false;
+
+   if (fragoff)
+   *fragoff = 0;
+
+   if (!offset)
+   return -EINVAL;
+   if (!*offset)
+   *offset = skb_network_offset(skb);
+
+   iph = skb_header_pointer(skb, *offset, sizeof(_iph), &_iph);
+   if (!iph || iph->version != 4)
+   return -EBADMSG;
+   start = *offset + sizeof(struct iphdr);
+
+   optlen = iph->ihl * 4 - (int)sizeof(struct iphdr);
+   if (optlen <= 0)
+   return -ENOENT;
+
+   memset(opt, 0, sizeof(struct ip_options));
+   /* Copy the options since __ip_options_compile() modifies
+* the options. Get one byte beyond the option for target < 0
+*/
+   if (skb_copy_bits(skb, start, opt->__data, optlen + 1))
+   return -EBADMSG;
+   opt->optlen = optlen;
+
+   if (__ip_options_compile(net, opt, NULL, &info))
+   return -EBADMSG;
+
+   switch (target) {
+   case IPOPT_SSRR:
+   case IPOPT_LSRR:
+   if (opt->srr) {
+   found = target == IPOPT_SSRR ? opt->is_strictroute :
+  !opt->is_strictroute;
+

Re: [PATCH bpf-next 3/3] veth: Support bulk XDP_TX

2019-05-23 Thread Jesper Dangaard Brouer

On Thu, 23 May 2019 20:35:50 +0900
Toshiaki Makita  wrote:

> On 2019/05/23 20:25, Toke Høiland-Jørgensen wrote:
> > Toshiaki Makita  writes:
> >   
> >> This improves XDP_TX performance by about 8%.
> >>
> >> Here are single core XDP_TX test results. CPU consumptions are taken
> >> from "perf report --no-child".
> >>
> >> - Before:
> >>
> >>   7.26 Mpps
> >>
> >>   _raw_spin_lock  7.83%
> >>   veth_xdp_xmit  12.23%
> >>
> >> - After:
> >>
> >>   7.84 Mpps
> >>
> >>   _raw_spin_lock  1.17%
> >>   veth_xdp_xmit   6.45%
> >>
> >> Signed-off-by: Toshiaki Makita 
> >> ---
> >>  drivers/net/veth.c | 26 +-
> >>  1 file changed, 25 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/drivers/net/veth.c b/drivers/net/veth.c
> >> index 52110e5..4edc75f 100644
> >> --- a/drivers/net/veth.c
> >> +++ b/drivers/net/veth.c
> >> @@ -442,6 +442,23 @@ static int veth_xdp_xmit(struct net_device *dev, int 
> >> n,
> >>return ret;
> >>  }
> >>  
> >> +static void veth_xdp_flush_bq(struct net_device *dev)
> >> +{
> >> +  struct xdp_tx_bulk_queue *bq = this_cpu_ptr(&xdp_tx_bq);
> >> +  int sent, i, err = 0;
> >> +
> >> +  sent = veth_xdp_xmit(dev, bq->count, bq->q, 0);  
> > 
> > Wait, veth_xdp_xmit() is just putting frames on a pointer ring. So
> > you're introducing an additional per-cpu bulk queue, only to avoid lock
> > contention around the existing pointer ring. But the pointer ring is
> > per-rq, so if you have lock contention, this means you must have
> > multiple CPUs servicing the same rq, no?  
> 
> Yes, it's possible. Not recommended though.
> 

I think the general per-cpu TX bulk queue is overkill.  There is a loop
over packets in veth_xdp_rcv(struct veth_rq *rq, budget, *status), and
the caller veth_poll() will call veth_xdp_flush(rq->dev).

Why can't you store this "temp" bulk array in struct veth_rq ?

You could even alloc/create it on the stack of veth_poll() and send it
along via a pointer to veth_xdp_rcv).

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

Re: [PATCH bpf-next 3/3] veth: Support bulk XDP_TX

2019-05-23 Thread Toshiaki Makita


On 19/05/23 (木) 21:18:25, Toke Høiland-Jørgensen wrote:

Toshiaki Makita  writes:


On 2019/05/23 20:25, Toke Høiland-Jørgensen wrote:

Toshiaki Makita  writes:


This improves XDP_TX performance by about 8%.

Here are single core XDP_TX test results. CPU consumptions are taken
from "perf report --no-child".

- Before:

   7.26 Mpps

   _raw_spin_lock  7.83%
   veth_xdp_xmit  12.23%

- After:

   7.84 Mpps

   _raw_spin_lock  1.17%
   veth_xdp_xmit   6.45%

Signed-off-by: Toshiaki Makita 
---
  drivers/net/veth.c | 26 +-
  1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index 52110e5..4edc75f 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -442,6 +442,23 @@ static int veth_xdp_xmit(struct net_device *dev, int n,
return ret;
  }
  
+static void veth_xdp_flush_bq(struct net_device *dev)

+{
+   struct xdp_tx_bulk_queue *bq = this_cpu_ptr(&xdp_tx_bq);
+   int sent, i, err = 0;
+
+   sent = veth_xdp_xmit(dev, bq->count, bq->q, 0);


Wait, veth_xdp_xmit() is just putting frames on a pointer ring. So
you're introducing an additional per-cpu bulk queue, only to avoid lock
contention around the existing pointer ring. But the pointer ring is
per-rq, so if you have lock contention, this means you must have
multiple CPUs servicing the same rq, no?


Yes, it's possible. Not recommended though.


So why not just fix that instead?


The queues are shared with packets from stack sent from peer. That's
because I needed the lock. I have tried to separate the queues, one for
redirect and one for stack, but receiver side got too complicated and it
ended up with worse performance.


I meant fix it with configuration. Now many receive queues are you
running on the veth device in your benchmarks, and how have you
configured the RPS?


As I wrote this test is a single queue test and does not have any 
contention.

Per packet lock has some overhead even in that configuration.

Toshiaki Makita

Re: [PATCH bpf-next 3/3] veth: Support bulk XDP_TX

2019-05-23 Thread Toshiaki Makita


On 19/05/23 (木) 22:29:27, Jesper Dangaard Brouer wrote:

On Thu, 23 May 2019 20:35:50 +0900
Toshiaki Makita  wrote:


On 2019/05/23 20:25, Toke Høiland-Jørgensen wrote:

Toshiaki Makita  writes:
   

This improves XDP_TX performance by about 8%.

Here are single core XDP_TX test results. CPU consumptions are taken
from "perf report --no-child".

- Before:

   7.26 Mpps

   _raw_spin_lock  7.83%
   veth_xdp_xmit  12.23%

- After:

   7.84 Mpps

   _raw_spin_lock  1.17%
   veth_xdp_xmit   6.45%

Signed-off-by: Toshiaki Makita 
---
  drivers/net/veth.c | 26 +-
  1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index 52110e5..4edc75f 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -442,6 +442,23 @@ static int veth_xdp_xmit(struct net_device *dev, int n,
return ret;
  }
  
+static void veth_xdp_flush_bq(struct net_device *dev)

+{
+   struct xdp_tx_bulk_queue *bq = this_cpu_ptr(&xdp_tx_bq);
+   int sent, i, err = 0;
+
+   sent = veth_xdp_xmit(dev, bq->count, bq->q, 0);


Wait, veth_xdp_xmit() is just putting frames on a pointer ring. So
you're introducing an additional per-cpu bulk queue, only to avoid lock
contention around the existing pointer ring. But the pointer ring is
per-rq, so if you have lock contention, this means you must have
multiple CPUs servicing the same rq, no?


Yes, it's possible. Not recommended though.



I think the general per-cpu TX bulk queue is overkill.  There is a loop
over packets in veth_xdp_rcv(struct veth_rq *rq, budget, *status), and
the caller veth_poll() will call veth_xdp_flush(rq->dev).

Why can't you store this "temp" bulk array in struct veth_rq ?


Of course I can. But I thought tun has the same problem and we can 
decrease memory footprint by sharing the same storage between devices.
Or if other devices want to reduce queues so that we can use XDP on 
many-cpu servers and introduce locks, we can use this storage for that 
case as well.


Still do you prefer veth-specific solution?



You could even alloc/create it on the stack of veth_poll() and send it
along via a pointer to veth_xdp_rcv).



Toshiaki Makita

[PATCH net-next] cxgb4: use firmware API for validating filter spec

2019-05-23 Thread Raju Rangoju

Adds support for validating hardware filter spec configured in firmware
before offloading exact match flows.

Use the new fw api FW_PARAM_DEV_FILTER_MODE_MASK to read the filter mode
and mask from firmware. If the api isn't supported, then fall-back to
older way of reading just the mode from indirect register.

Signed-off-by: Raju Rangoju 
---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4.h|  1 +
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c |  3 +-
 drivers/net/ethernet/chelsio/cxgb4/t4_hw.c| 47 ---
 drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h | 23 +++
 4 files changed, 68 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h 
b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
index a707a65f491b..7c06e2aebc9e 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
@@ -280,6 +280,7 @@ struct tp_params {
unsigned short tx_modq[NCHAN];  /* channel to modulation queue map */
 
u32 vlan_pri_map;   /* cached TP_VLAN_PRI_MAP */
+   u32 filter_mask;
u32 ingress_config; /* cached TP_INGRESS_CONFIG */
 
/* cached TP_OUT_CONFIG compressed error vector
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c 
b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c
index 0f0b3f4a039d..6232236d7abc 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c
@@ -248,8 +248,9 @@ static int validate_filter(struct net_device *dev,
u32 fconf, iconf;
 
/* Check for unconfigured fields being used. */
-   fconf = adapter->params.tp.vlan_pri_map;
iconf = adapter->params.tp.ingress_config;
+   fconf = fs->hash ? adapter->params.tp.filter_mask :
+  adapter->params.tp.vlan_pri_map;
 
if (unsupported(fconf, FCOE_F, fs->val.fcoe, fs->mask.fcoe) ||
unsupported(fconf, PORT_F, fs->val.iport, fs->mask.iport) ||
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c 
b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
index 866ee31a1da5..8c17d1c75e84 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
@@ -9388,8 +9388,9 @@ int t4_init_sge_params(struct adapter *adapter)
  */
 int t4_init_tp_params(struct adapter *adap, bool sleep_ok)
 {
-   int chan;
-   u32 v;
+   u32 param, val, v;
+   int chan, ret;
+
 
v = t4_read_reg(adap, TP_TIMER_RESOLUTION_A);
adap->params.tp.tre = TIMERRESOLUTION_G(v);
@@ -9399,11 +9400,47 @@ int t4_init_tp_params(struct adapter *adap, bool 
sleep_ok)
for (chan = 0; chan < NCHAN; chan++)
adap->params.tp.tx_modq[chan] = chan;
 
-   /* Cache the adapter's Compressed Filter Mode and global Incress
+   /* Cache the adapter's Compressed Filter Mode/Mask and global Ingress
 * Configuration.
 */
-   t4_tp_pio_read(adap, &adap->params.tp.vlan_pri_map, 1,
-  TP_VLAN_PRI_MAP_A, sleep_ok);
+   param = (FW_PARAMS_MNEM_V(FW_PARAMS_MNEM_DEV) |
+FW_PARAMS_PARAM_X_V(FW_PARAMS_PARAM_DEV_FILTER) |
+FW_PARAMS_PARAM_Y_V(FW_PARAM_DEV_FILTER_MODE_MASK));
+
+   /* Read current value */
+   ret = t4_query_params(adap, adap->mbox, adap->pf, 0, 1,
+ ¶m, &val);
+   if (ret == 0) {
+   dev_info(adap->pdev_dev,
+"Current filter mode/mask 0x%x:0x%x\n",
+FW_PARAMS_PARAM_FILTER_MODE_G(val),
+FW_PARAMS_PARAM_FILTER_MASK_G(val));
+   adap->params.tp.vlan_pri_map =
+   FW_PARAMS_PARAM_FILTER_MODE_G(val);
+   adap->params.tp.filter_mask =
+   FW_PARAMS_PARAM_FILTER_MASK_G(val);
+   } else {
+   dev_info(adap->pdev_dev,
+"Failed to read filter mode/mask via fw api, using 
indirect-reg-read\n");
+
+   /* Incase of older-fw (which doesn't expose the api
+* FW_PARAM_DEV_FILTER_MODE_MASK) and newer-driver (which uses
+* the fw api) combination, fall-back to older method of reading
+* the filter mode from indirect-register
+*/
+   t4_tp_pio_read(adap, &adap->params.tp.vlan_pri_map, 1,
+  TP_VLAN_PRI_MAP_A, sleep_ok);
+
+   /* With the older-fw and newer-driver combination we might run
+* into an issue when user wants to use hash filter region but
+* the filter_mask is zero, in this case filter_mask validation
+* is tough. To avoid that we set the filter_mask same as filter
+* mode, which will behave exactly as the older way of ignoring
+* the filter mask validation.
+*/
+   adap->params.tp.filter_

Re: Question about IRQs during the .remove() of virtio-vsock driver

2019-05-23 Thread Stefano Garzarella

On Wed, May 22, 2019 at 11:44:26AM +0800, Jason Wang wrote:
> 
> On 2019/5/21 下午9:56, Michael S. Tsirkin wrote:
> > On Tue, May 21, 2019 at 03:49:20PM +0200, Stefano Garzarella wrote:
> > > On Tue, May 21, 2019 at 06:05:31AM -0400, Michael S. Tsirkin wrote:
> > > > On Tue, May 21, 2019 at 11:44:07AM +0200, Stefano Garzarella wrote:
> > > > > Hi Micheal, Jason,
> > > > > as suggested by Stefan, I'm checking if we have some races in the
> > > > > virtio-vsock driver. We found some races in the .probe() and .remove()
> > > > > with the upper layer (socket) and I'll fix it.
> > > > > 
> > > > > Now my attention is on the bottom layer (virtio device) and my 
> > > > > question is:
> > > > > during the .remove() of virtio-vsock driver (virtio_vsock_remove), 
> > > > > could happen
> > > > > that an IRQ comes and one of our callback (e.g. 
> > > > > virtio_vsock_rx_done()) is
> > > > > executed, queueing new works?
> > > > > 
> > > > > I tried to follow the code in both cases (device unplugged or module 
> > > > > removed)
> > > > > and maybe it couldn't happen because we remove it from bus's 
> > > > > knowledge,
> > > > > but I'm not sure and your advice would be very helpful.
> > > > > 
> > > > > Thanks in advance,
> > > > > Stefano
> > > > 
> > > > Great question! This should be better documented: patches welcome!
> > > When I'm clear, I'll be happy to document this.
> > > 
> > > > Here's my understanding:
> > > > 
> > > > 
> > > > A typical removal flow works like this:
> > > > 
> > > > - prevent linux from sending new kick requests to device
> > > >and flush such outstanding requests if any
> > > >(device can still send notifications to linux)
> > > > 
> > > > - call
> > > >vi->vdev->config->reset(vi->vdev);
> > > >this will flush all device writes and interrupts.
> > > >device will not use any more buffers.
> > > >previously outstanding callbacks might still be active.
> > > > 
> > > > - Then call
> > > >vdev->config->del_vqs(vdev);
> > > >to flush outstanding callbacks if any.
> > > Thanks for sharing these useful information.
> > > 
> > > So, IIUC between step 1 (e.g. in virtio-vsock we flush all work-queues) 
> > > and
> > > step 2, new IRQs could happen, and in the virtio-vsock driver new work
> > > will be queued.
> > > 
> > > In order to handle this case, I'm thinking to add a new variable
> > > 'work_enabled' in the struct virtio_vsock, put it to false at the start
> > > of the .remove(), then call synchronize_rcu() before to flush all work
> > > queues
> > > and use an helper function virtio_transport_queue_work() to queue
> > > a new work, where the check of work_enabled and the queue_work are in the
> > > RCU read critical section.
> > > 
> > > Here a pseudo code to explain better the idea:
> > > 
> > > virtio_vsock_remove() {
> > >  vsock->work_enabled = false;
> > > 
> > >  /* Wait for other CPUs to finish to queue works */
> > >  synchronize_rcu();
> > > 
> > >  flush_works();
> > > 
> > >  vdev->config->reset(vdev);
> > > 
> > >  ...
> > > 
> > >  vdev->config->del_vqs(vdev);
> > > }
> > > 
> > > virtio_vsock_queue_work(vsock, work) {
> > >  rcu_read_lock();
> > > 
> > >  if (!vsock->work_enabled) {
> > >  goto out;
> > >  }
> > > 
> > >  queue_work(virtio_vsock_workqueue, work);
> > > 
> > > out:
> > >  rcu_read_unlock();
> > > }
> > > 
> > > 
> > > Do you think can work?
> > > Please tell me if there is a better way to handle this case.
> > > 
> > > Thanks,
> > > Stefano
> > 
> > instead of rcu tricks I would just have rx_run and tx_run and check it
> > within the queued work - presumably under tx or rx lock.
> > 
> > then queueing an extra work becomes harmless,
> > and you flush it after del vqs which flushes everything for you.
> > 
> > 
> 
> It looks to me that we need guarantee no work queued or scheduled before
> del_vqs. Otherwise it may lead use after free? (E.g net disable NAPI before
> del_vqs).
> 

I'm disabling works before del_vqs (setting tx_run or rx_run to false under the
tx or rx lock), this means that work can still be queued, but the worker
function will exit just after tx or rx lock is acquired, where I'm
checking the tx_run or rx_run, so no workers should use the virtqueues.

Then I'll call del_vqs and I'll flush all works before to free the
'vsock' object (struct virtio_vsock) to be sure that no workers will run.

I hope this will match your advice.

Thanks,
Stefano

Re: [PATCH bpf] bpf, riscv: clear target register high 32-bits for and/or/xor on ALU32

2019-05-23 Thread Daniel Borkmann

On 05/21/2019 04:12 PM, Björn Töpel wrote:
> On Tue, 21 May 2019 at 16:02, Daniel Borkmann  wrote:
>> On 05/21/2019 03:46 PM, Björn Töpel wrote:
>>> When using 32-bit subregisters (ALU32), the RISC-V JIT would not clear
>>> the high 32-bits of the target register and therefore generate
>>> incorrect code.
>>>
>>> E.g., in the following code:
>>>
>>>   $ cat test.c
>>>   unsigned int f(unsigned long long a,
>>>  unsigned int b)
>>>   {
>>>   return (unsigned int)a & b;
>>>   }
>>>
>>>   $ clang-9 -target bpf -O2 -emit-llvm -S test.c -o - | \
>>>   llc-9 -mattr=+alu32 -mcpu=v3
>>>   .text
>>>   .file   "test.c"
>>>   .globl  f
>>>   .p2align3
>>>   .type   f,@function
>>>   f:
>>>   r0 = r1
>>>   w0 &= w2
>>>   exit
>>>   .Lfunc_end0:
>>>   .size   f, .Lfunc_end0-f
>>>
>>> The JIT would not clear the high 32-bits of r0 after the
>>> and-operation, which in this case might give an incorrect return
>>> value.
>>>
>>> After this patch, that is not the case, and the upper 32-bits are
>>> cleared.
>>>
>>> Reported-by: Jiong Wang 
>>> Fixes: 2353ecc6f91f ("bpf, riscv: add BPF JIT for RV64G")
>>> Signed-off-by: Björn Töpel 
>>
>> Was this missed because test_verifier did not have test coverage?
> 
> Yup, and Jiong noted it.

Applied, thanks!

Re: [PATCH bpf] selftests: bpf: add zero extend checks for ALU32 and/or/xor

2019-05-23 Thread Daniel Borkmann

On 05/23/2019 08:38 AM, Y Song wrote:
> On Wed, May 22, 2019 at 1:46 PM Björn Töpel  wrote:
>> On Wed, 22 May 2019 at 20:13, Y Song  wrote:
>>> On Wed, May 22, 2019 at 2:25 AM Björn Töpel  wrote:

 Add three tests to test_verifier/basic_instr that make sure that the
 high 32-bits of the destination register is cleared after an ALU32
 and/or/xor.

 Signed-off-by: Björn Töpel 
>>>
>>> I think the patch intends for bpf-next, right? The patch itself looks
>>> good to me.
>>> Acked-by: Yonghong Song 
>>
>> Thank you. Actually, it was intended for the bpf tree, as a test
>> follow up for this [1] fix.
> Then maybe you want to add a Fixes tag and resubmit?

Why would the test case need a fixes tag? It's common practice that we have
BPF fixes that we queue to bpf tree along with kselftest test cases related
to them. Therefore, applied as well, thanks for following up!

Björn, in my email from the fix, I mentioned we should have test snippets
ideally for all of the alu32 insns to not miss something falling through the
cracks when JITs get added or changed. If you have some cycles to add the
remaining missing ones, that would be much appreciated.

Thanks,
Daniel

Re: [bpf PATCH] bpf: sockmap, restore sk_write_space when psock gets dropped

2019-05-23 Thread Daniel Borkmann

On 05/22/2019 12:01 PM, Jakub Sitnicki wrote:
> Once psock gets unlinked from its sock (sk_psock_drop), user-space can
> still trigger a call to sk->sk_write_space by setting TCP_NOTSENT_LOWAT
> socket option. This causes a null-ptr-deref because we try to read
> psock->saved_write_space from sk_psock_write_space:
> 
> ==
> BUG: KASAN: null-ptr-deref in sk_psock_write_space+0x69/0x80
> Read of size 8 at addr 01a0 by task sockmap-echo/131
> 
> CPU: 0 PID: 131 Comm: sockmap-echo Not tainted 5.2.0-rc1-00094-gf49aa1de9836 
> #81
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
> ?-20180724_192412-buildhw-07.phx2.fedoraproject.org-1.fc29 04/01/2014
> Call Trace:
>  ? sk_psock_write_space+0x69/0x80
>  __kasan_report.cold.2+0x5/0x3f
>  ? sk_psock_write_space+0x69/0x80
>  kasan_report+0xe/0x20
>  sk_psock_write_space+0x69/0x80
>  tcp_setsockopt+0x69a/0xfc0
>  ? tcp_shutdown+0x70/0x70
>  ? fsnotify+0x5b0/0x5f0
>  ? remove_wait_queue+0x90/0x90
>  ? __fget_light+0xa5/0xf0
>  __sys_setsockopt+0xe6/0x180
>  ? sockfd_lookup_light+0xb0/0xb0
>  ? vfs_write+0x195/0x210
>  ? ksys_write+0xc9/0x150
>  ? __x64_sys_read+0x50/0x50
>  ? __bpf_trace_x86_fpu+0x10/0x10
>  __x64_sys_setsockopt+0x61/0x70
>  do_syscall_64+0xc5/0x520
>  ? vmacache_find+0xc0/0x110
>  ? syscall_return_slowpath+0x110/0x110
>  ? handle_mm_fault+0xb4/0x110
>  ? entry_SYSCALL_64_after_hwframe+0x3e/0xbe
>  ? trace_hardirqs_off_caller+0x4b/0x120
>  ? trace_hardirqs_off_thunk+0x1a/0x3a
>  entry_SYSCALL_64_after_hwframe+0x49/0xbe
> RIP: 0033:0x7f2e5e7cdcce
> Code: d8 64 89 02 48 c7 c0 ff ff ff ff eb b1 66 2e 0f 1f 84 00 00 00 00 00
> 0f 1f 44 00 00 f3 0f 1e fa 49 89 ca b8 36 00 00 00 0f 05 <48> 3d 01 f0 ff
> ff 73 01 c3 48 8b 0d 8a 11 0c 00 f7 d8 64 89 01 48
> RSP: 002b:7ffed011b778 EFLAGS: 0206 ORIG_RAX: 0036
> RAX: ffda RBX: 0003 RCX: 7f2e5e7cdcce
> RDX: 0019 RSI: 0006 RDI: 0007
> RBP: 7ffed011b790 R08: 0004 R09: 7f2e5e84ee80
> R10: 7ffed011b788 R11: 0206 R12: 7ffed011b78c
> R13: 7ffed011b788 R14: 0007 R15: 0068
> ==
> 
> Restore the saved sk_write_space callback when psock is being dropped to
> fix the crash.
> 
> Signed-off-by: Jakub Sitnicki 

LGTM, applied thanks!

Re: [PATCH v7 bpf-next 01/16] bpf: verifier: mark verified-insn with sub-register zext flag

2019-05-23 Thread Jiong Wang



> On 23 May 2019, at 03:07, Alexei Starovoitov  
> wrote:
> 
> On Wed, May 22, 2019 at 07:54:57PM +0100, Jiong Wang wrote:
>> eBPF ISA specification requires high 32-bit cleared when low 32-bit
>> sub-register is written. This applies to destination register of ALU32 etc.
>> JIT back-ends must guarantee this semantic when doing code-gen. x86_64 and
>> AArch64 ISA has the same semantics, so the corresponding JIT back-end
>> doesn't need to do extra work.
>> 
>> However, 32-bit arches (arm, x86, nfp etc.) and some other 64-bit arches
>> (PowerPC, SPARC etc) need to do explicit zero extension to meet this
>> requirement, otherwise code like the following will fail.
>> 
>>  u64_value = (u64) u32_value
>>  ... other uses of u64_value
>> 
>> This is because compiler could exploit the semantic described above and
>> save those zero extensions for extending u32_value to u64_value, these JIT
>> back-ends are expected to guarantee this through inserting extra zero
>> extensions which however could be a significant increase on the code size.
>> Some benchmarks show there could be ~40% sub-register writes out of total
>> insns, meaning at least ~40% extra code-gen.
>> 
>> One observation is these extra zero extensions are not always necessary.
>> Take above code snippet for example, it is possible u32_value will never be
>> casted into a u64, the value of high 32-bit of u32_value then could be
>> ignored and extra zero extension could be eliminated.
>> 
>> This patch implements this idea, insns defining sub-registers will be
>> marked when the high 32-bit of the defined sub-register matters. For
>> those unmarked insns, it is safe to eliminate high 32-bit clearnace for
>> them.
>> 
>> Algo:
>> - Split read flags into READ32 and READ64.
>> 
>> - Record index of insn that does sub-register write. Keep the index inside
>>   reg state and update it during verifier insn walking.
>> 
>> - A full register read on a sub-register marks its definition insn as
>>   needing zero extension on dst register.
>> 
>>   A new sub-register write overrides the old one.
>> 
>> - When propagating read64 during path pruning, also mark any insn defining
>>   a sub-register that is read in the pruned path as full-register.
>> 
>> Reviewed-by: Jakub Kicinski 
>> Signed-off-by: Jiong Wang 
>> ---
>> include/linux/bpf_verifier.h |  14 +++-
>> kernel/bpf/verifier.c| 175 
>> +++
>> 2 files changed, 173 insertions(+), 16 deletions(-)
>> 
>> diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
>> index 1305ccb..60fb54e 100644
>> --- a/include/linux/bpf_verifier.h
>> +++ b/include/linux/bpf_verifier.h
>> @@ -36,9 +36,11 @@
>>  */
>> enum bpf_reg_liveness {
>>  REG_LIVE_NONE = 0, /* reg hasn't been read or written this branch */
>> -REG_LIVE_READ, /* reg was read, so we're sensitive to initial value */
>> -REG_LIVE_WRITTEN, /* reg was written first, screening off later reads */
>> -REG_LIVE_DONE = 4, /* liveness won't be updating this register anymore 
>> */
>> +REG_LIVE_READ32 = 0x1, /* reg was read, so we're sensitive to initial 
>> value */
>> +REG_LIVE_READ64 = 0x2, /* likewise, but full 64-bit content matters */
>> +REG_LIVE_READ = REG_LIVE_READ32 | REG_LIVE_READ64,
>> +REG_LIVE_WRITTEN = 0x4, /* reg was written first, screening off later 
>> reads */
>> +REG_LIVE_DONE = 0x8, /* liveness won't be updating this register 
>> anymore */
>> };
>> 
>> struct bpf_reg_state {
>> @@ -131,6 +133,11 @@ struct bpf_reg_state {
>>   * pointing to bpf_func_state.
>>   */
>>  u32 frameno;
>> +/* Tracks subreg definition. The stored value is the insn_idx of the
>> + * writing insn. This is safe because subreg_def is used before any insn
>> + * patching which only happens after main verification finished.
>> + */
>> +s32 subreg_def;
>>  enum bpf_reg_liveness live;
>> };
>> 
>> @@ -232,6 +239,7 @@ struct bpf_insn_aux_data {
>>  int ctx_field_size; /* the ctx field size for load insn, maybe 0 */
>>  int sanitize_stack_off; /* stack slot to be cleared */
>>  bool seen; /* this insn was processed by the verifier */
>> +bool zext_dst; /* this insn zero extends dst reg */
>>  u8 alu_state; /* used in combination with alu_limit */
>>  unsigned int orig_idx; /* original instruction index */
>> };
>> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
>> index 95f93544..0efccf8 100644
>> --- a/kernel/bpf/verifier.c
>> +++ b/kernel/bpf/verifier.c
>> @@ -981,6 +981,7 @@ static void mark_reg_not_init(struct bpf_verifier_env 
>> *env,
>>  __mark_reg_not_init(regs + regno);
>> }
>> 
>> +#define DEF_NOT_SUBREG  (-1)
>> static void init_reg_state(struct bpf_verifier_env *env,
>> struct bpf_func_state *state)
>> {
>> @@ -991,6 +992,7 @@ static void init_reg_state(struct bpf_verifier_env *env,
>>  mark_reg_not_init(env, regs, i);
>>  regs[i].live = REG_LIV

Re: [PATCH bpf] selftests: bpf: add zero extend checks for ALU32 and/or/xor

2019-05-23 Thread Jiong Wang



> On 23 May 2019, at 15:02, Daniel Borkmann  wrote:
> 
> On 05/23/2019 08:38 AM, Y Song wrote:
>> On Wed, May 22, 2019 at 1:46 PM Björn Töpel  wrote:
>>> On Wed, 22 May 2019 at 20:13, Y Song  wrote:
 On Wed, May 22, 2019 at 2:25 AM Björn Töpel  wrote:
> 
> Add three tests to test_verifier/basic_instr that make sure that the
> high 32-bits of the destination register is cleared after an ALU32
> and/or/xor.
> 
> Signed-off-by: Björn Töpel 
 
 I think the patch intends for bpf-next, right? The patch itself looks
 good to me.
 Acked-by: Yonghong Song 
>>> 
>>> Thank you. Actually, it was intended for the bpf tree, as a test
>>> follow up for this [1] fix.
>> Then maybe you want to add a Fixes tag and resubmit?
> 
> Why would the test case need a fixes tag? It's common practice that we have
> BPF fixes that we queue to bpf tree along with kselftest test cases related
> to them. Therefore, applied as well, thanks for following up!
> 
> Björn, in my email from the fix, I mentioned we should have test snippets
> ideally for all of the alu32 insns to not miss something falling through the
> cracks when JITs get added or changed. If you have some cycles to add the
> remaining missing ones, that would be much appreciated.

Björn,

  If you don’t have time, I can take this alu32 test case follow up as well.

Regards,
Jiong

> 
> Thanks,
> Daniel

Re: [RFC PATCH net-next 5/9] net: phylink: Add phylink_create_raw

2019-05-23 Thread Florian Fainelli




On 5/23/2019 5:10 AM, Ioana Ciornei wrote:
> 
>> Subject: Re: [RFC PATCH net-next 5/9] net: phylink: Add phylink_create_raw
>>
>>
>>
>> On 5/22/2019 7:25 PM, Florian Fainelli wrote:
>>>
>>>
>>> On 5/22/2019 6:20 PM, Ioana Ciornei wrote:
 This adds a new entry point to PHYLINK that does not require a
 net_device structure.

 The main intended use are DSA ports that do not have net devices
 registered for them (mainly because doing so would be redundant - see
 Documentation/networking/dsa/dsa.rst for details). So far DSA has
 been using PHYLIB fixed PHYs for these ports, driven manually with
 genphy instead of starting a full PHY state machine, but this does
 not scale well when there are actual PHYs that need a driver on those
 ports, or when a fixed-link is requested in DT that has a speed
 unsupported by the fixed PHY C22 emulation (such as SGMII-2500).

 The proposed solution comes in the form of a notifier chain owned by
 the PHYLINK instance, and the passing of phylink_notifier_info
 structures back to the driver through a blocking notifier call.

 The event API exposed by the new notifier mechanism is a 1:1 mapping
 to the existing PHYLINK mac_ops, plus the PHYLINK fixed-link callback.

 Both the standard phylink_create() function, as well as its raw
 variant, call the same underlying function which initializes either
 the netdev field or the notifier block of the PHYLINK instance.

 All PHYLINK driver callbacks have been extended to call the notifier
 chain in case the instance is a raw one.

 Signed-off-by: Ioana Ciornei 
 Signed-off-by: Vladimir Oltean 
 ---
>>>
>>> [snip]
>>>
 +  struct phylink_notifier_info info = {
 +  .link_an_mode = pl->link_an_mode,
 +  /* Discard const pointer */
 +  .state = (struct phylink_link_state *)state,
 +  };
 +
netdev_dbg(pl->netdev,
   "%s: mode=%s/%s/%s/%s adv=%*pb pause=%02x link=%u
>> an=%u\n",
   __func__, phylink_an_mode_str(pl->link_an_mode),
 @@ -299,7 +317,12 @@ static void phylink_mac_config(struct phylink *pl,
   __ETHTOOL_LINK_MODE_MASK_NBITS, state->advertising,
   state->pause, state->link, state->an_enabled);
>>>
>>> Don't you need to guard that netdev_dbg() with an if (pl->ops) to
>>> avoid de-referencing a NULL net_device?
>>>
> 
> 
> The netdev_* print will not dereference a NULL net_device since it has 
> explicit checks agains this.
> Instead it will just print (net/core/dev.c, __netdev_printk):
> 
>   printk("%s(NULL net_device): %pV", level, vaf);
> 
> 
>>> Another possibility could be to change the signature of the
>>> phylink_mac_ops to take an opaque pointer and in the case where we
>>> called phylink_create() and passed down a net_device pointer, we
>>> somehow remember that for doing any operation that requires a
>>> net_device (printing, setting carrier). We lose strict typing in doing
>>> that, but we'd have fewer places to patch for a blocking notifier call.
>>>
>>
>> Or even make those functions part of phylink_mac_ops such that the caller
>> could pass an .carrier_ok callback which is netif_carrier_ok() for a 
>> net_device,
>> else it's NULL, same with printing functions if desired...
>> --
>> Florian
> 
> 
> Let me see if I understood this correctly. I presume that any API that we add 
> should not break any current PHYLINK users.
> 
> You suggest to change the prototype of the phylink_mac_ops from
> 
>   void (*validate)(struct net_device *ndev, unsigned long *supported,
>struct phylink_link_state *state);
> 
> to something that takes a void pointer:
> 
>   void (*validate)(void *dev, unsigned long *supported,
>struct phylink_link_state *state);

That is what I am suggesting, but I am also suggesting passing all
netdev specific calls that must be made as callbacks as well, so
something like:

bool (*carrier_ok)(const void *dev)
void (*carrier_set)(const void *dev, bool on)
void (*print)(const void *dev, const char *fmt)

as new members of phylink_mac_ops.

> 
> This would imply that the any function in PHYLINK would have to somehow 
> differentiate if the dev provided is indeed a net_device or another structure 
> in order to make the decision if netif_carrier_off should be called or not 
> (this is so we do not break any drivers using PHYLINK). I cannot see how this 
> judgement can be made.

You don't have to make the judgement you can just do:

if (pl->ops->carrier_set)
pl->ops->carrier_set(dev,

where dev was this opaque pointer passed to phylink_create() the first
time it was created. Like I wrote, we lose strong typing doing that, but
we don't have to update all code paths for if (pl->ops) else notifier.
-- 
Florian

Re: [PATCH 2/8] dt-bindings: net: Add a YAML schemas for the generic PHY options

2019-05-23 Thread Andrew Lunn

On Thu, May 23, 2019 at 11:56:45AM +0200, Maxime Ripard wrote:
> The networking PHYs have a number of available device tree properties that
> can be used in their device tree node. Add a YAML schemas for those.
> 
> Signed-off-by: Maxime Ripard 
> ---
>  Documentation/devicetree/bindings/net/ethernet-phy.yaml | 148 +-
>  Documentation/devicetree/bindings/net/phy.txt   |  80 +-
>  2 files changed, 149 insertions(+), 79 deletions(-)
>  create mode 100644 Documentation/devicetree/bindings/net/ethernet-phy.yaml
> 
> diff --git a/Documentation/devicetree/bindings/net/ethernet-phy.yaml 
> b/Documentation/devicetree/bindings/net/ethernet-phy.yaml
> new file mode 100644
> index ..eb79ee6db977
> --- /dev/null
> +++ b/Documentation/devicetree/bindings/net/ethernet-phy.yaml
> @@ -0,0 +1,148 @@
> +# SPDX-License-Identifier: GPL-2.0
> +%YAML 1.2
> +---
> +$id: http://devicetree.org/schemas/net/ethernet-phy.yaml#
> +$schema: http://devicetree.org/meta-schemas/core.yaml#
> +
> +title: Ethernet PHY Generic Binding
> +
> +maintainers:
> +  - David S. Miller 
> +
> +properties:
> +  $nodename:
> +pattern: "^ethernet-phy(@[a-f0-9])?$"
> +
> +  compatible:
> +oneOf:

I don't know the language. It is valid to have both
ethernet-phy-ieee802.3-c45 and
ethernet-phy-id[a-f0-9]{4}\\.[a-f0-9]{4}$".  Does this oneOf prevent
multiple compatible strings?

Also, the general case is no compatible at all.

> +  - const: ethernet-phy-ieee802.3-c22
> +description: PHYs that implement IEEE802.3 clause 22
> +  - const: ethernet-phy-ieee802.3-c45
> +description: PHYs that implement IEEE802.3 clause 45
> +  - pattern: "^ethernet-phy-id[a-f0-9]{4}\\.[a-f0-9]{4}$"
> +description:
> +  The first group of digits is the 16 bit Phy Identifier 1
> +  register, this is the chip vendor OUI bits 3:18. The
> +  second group of digits is the Phy Identifier 2 register,
> +  this is the chip vendor OUI bits 19:24, followed by 10
> +  bits of a vendor specific ID.

Could we try to retain:

> -  If the PHY reports an incorrect ID (or none at all) then the
> -  "compatible" list may contain an entry with the correct PHY ID in the
 ...

Using it is generally wrong, and that is not clear in the new text.

> +
> +  reg:
> +maxItems: 1
> +minimum: 0
> +maximum: 31
> +description:
> +  The ID number for the PHY.
> +
> +  interrupts:
> +maxItems: 1
> +
> +  max-speed:
> +enum:
> +  - 10
> +  - 100
> +  - 1000

This is outdated in the text description. Any valid speed is
supported, currently 10 - 20, as listed in phy_setting settings().

   Andrew

[PATCH net-next] Revert "dpaa2-eth: configure the cache stashing amount on a queue"

2019-05-23 Thread Ioana Radulescu

This reverts commit f8b995853444aba9c16c1ccdccdd397527fde96d.

The reverted change instructed the QMan hardware block to fetch
RX frame annotation and beginning of frame data to cache before
the core would read them.

It turns out that in rare cases, it's possible that a QMan
stashing transaction is delayed long enough such that, by the time
it gets executed, the frame in question had already been dequeued
by the core and software processing began on it. If the core
manages to unmap the frame buffer _before_ the stashing transaction
is executed, an SMMU exception will be raised.

Unfortunately there is no easy way to work around this while keeping
the performance advantages brought by QMan stashing, so disable
it altogether.

Signed-off-by: Ioana Radulescu 
---
 drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c | 7 +--
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c 
b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
index 63b1ecc1..28a6faa 100644
--- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
+++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
@@ -2479,14 +2479,9 @@ static int setup_rx_flow(struct dpaa2_eth_priv *priv,
queue.destination.type = DPNI_DEST_DPCON;
queue.destination.priority = 1;
queue.user_context = (u64)(uintptr_t)fq;
-   queue.flc.stash_control = 1;
-   queue.flc.value &= 0xFFC0;
-   /* 01 01 00 - data, annotation, flow context */
-   queue.flc.value |= 0x14;
err = dpni_set_queue(priv->mc_io, 0, priv->mc_token,
 DPNI_QUEUE_RX, 0, fq->flowid,
-DPNI_QUEUE_OPT_USER_CTX | DPNI_QUEUE_OPT_DEST |
-DPNI_QUEUE_OPT_FLC,
+DPNI_QUEUE_OPT_USER_CTX | DPNI_QUEUE_OPT_DEST,
 &queue);
if (err) {
dev_err(dev, "dpni_set_queue(RX) failed\n");
-- 
2.7.4

Re: [PATCH 1/8] dt-bindings: net: Add YAML schemas for the generic Ethernet options

2019-05-23 Thread Andrew Lunn

> > +link-gpios:
> > +  description:
> > +GPIO to determine if the link is up
> 
> Only 1?

Hi Rob

Yes, only one.

Andrew

Re: [PATCH v2 bpf-next 0/3] bpf: increase jmp sequence limit

2019-05-23 Thread Daniel Borkmann

On 05/22/2019 05:14 AM, Alexei Starovoitov wrote:
> Patch 1 - jmp sequence limit
> Patch 2 - improve existing tests
> Patch 3 - add pyperf-based realistic bpf program that takes advantage
> of higher limit and use it as a stress test
> 
> v1->v2: fixed nit in patch 3. added Andrii's acks
> 
> Alexei Starovoitov (3):
>   bpf: bump jmp sequence limit
>   selftests/bpf: adjust verifier scale test
>   selftests/bpf: add pyperf scale test
> 
>  kernel/bpf/verifier.c |   7 +-
>  .../bpf/prog_tests/bpf_verif_scale.c  |  31 +-
>  tools/testing/selftests/bpf/progs/pyperf.h| 268 ++
>  tools/testing/selftests/bpf/progs/pyperf100.c |   4 +
>  tools/testing/selftests/bpf/progs/pyperf180.c |   4 +
>  tools/testing/selftests/bpf/progs/pyperf50.c  |   4 +
>  tools/testing/selftests/bpf/test_verifier.c   |  31 +-
>  7 files changed, 319 insertions(+), 30 deletions(-)
>  create mode 100644 tools/testing/selftests/bpf/progs/pyperf.h
>  create mode 100644 tools/testing/selftests/bpf/progs/pyperf100.c
>  create mode 100644 tools/testing/selftests/bpf/progs/pyperf180.c
>  create mode 100644 tools/testing/selftests/bpf/progs/pyperf50.c
> 

Looks good, applied, thanks!

Re: [PATCH 2/8] dt-bindings: net: Add a YAML schemas for the generic PHY options

2019-05-23 Thread Rob Herring

On Thu, May 23, 2019 at 4:57 AM Maxime Ripard  wrote:
>
> The networking PHYs have a number of available device tree properties that
> can be used in their device tree node. Add a YAML schemas for those.
>
> Signed-off-by: Maxime Ripard 
> ---
>  Documentation/devicetree/bindings/net/ethernet-phy.yaml | 148 +-
>  Documentation/devicetree/bindings/net/phy.txt   |  80 +-
>  2 files changed, 149 insertions(+), 79 deletions(-)
>  create mode 100644 Documentation/devicetree/bindings/net/ethernet-phy.yaml
>
> diff --git a/Documentation/devicetree/bindings/net/ethernet-phy.yaml 
> b/Documentation/devicetree/bindings/net/ethernet-phy.yaml
> new file mode 100644
> index ..eb79ee6db977
> --- /dev/null
> +++ b/Documentation/devicetree/bindings/net/ethernet-phy.yaml
> @@ -0,0 +1,148 @@
> +# SPDX-License-Identifier: GPL-2.0
> +%YAML 1.2
> +---
> +$id: http://devicetree.org/schemas/net/ethernet-phy.yaml#
> +$schema: http://devicetree.org/meta-schemas/core.yaml#
> +
> +title: Ethernet PHY Generic Binding
> +
> +maintainers:
> +  - David S. Miller 
> +
> +properties:
> +  $nodename:
> +pattern: "^ethernet-phy(@[a-f0-9])?$"
> +
> +  compatible:
> +oneOf:
> +  - const: ethernet-phy-ieee802.3-c22
> +description: PHYs that implement IEEE802.3 clause 22
> +  - const: ethernet-phy-ieee802.3-c45
> +description: PHYs that implement IEEE802.3 clause 45
> +  - pattern: "^ethernet-phy-id[a-f0-9]{4}\\.[a-f0-9]{4}$"
> +description:
> +  The first group of digits is the 16 bit Phy Identifier 1
> +  register, this is the chip vendor OUI bits 3:18. The
> +  second group of digits is the Phy Identifier 2 register,
> +  this is the chip vendor OUI bits 19:24, followed by 10
> +  bits of a vendor specific ID.
> +
> +  reg:
> +maxItems: 1
> +minimum: 0
> +maximum: 31

min/max need to be under 'items'. I don't think these would ever be
valid if the type is an array.

I've modified the meta-schema to catch this.

> +description:
> +  The ID number for the PHY.
> +
> +  interrupts:
> +maxItems: 1
> +
> +  max-speed:
> +enum:
> +  - 10
> +  - 100
> +  - 1000
> +description:
> +  Maximum PHY supported speed in Mbits / seconds.
> +
> +  broken-turn-around:
> +$ref: /schemas/types.yaml#definitions/flag
> +description:
> +  If set, indicates the PHY device does not correctly release
> +  the turn around line low at the end of a MDIO transaction.
> +
> +  enet-phy-lane-swap:
> +$ref: /schemas/types.yaml#definitions/flag
> +description:
> +  If set, indicates the PHY will swap the TX/RX lanes to
> +  compensate for the board being designed with the lanes
> +  swapped.
> +
> +  eee-broken-100tx:
> +$ref: /schemas/types.yaml#definitions/flag
> +description:
> +  Mark the corresponding energy efficient ethernet mode as
> +  broken and request the ethernet to stop advertising it.
> +
> +  eee-broken-1000t:
> +$ref: /schemas/types.yaml#definitions/flag
> +description:
> +  Mark the corresponding energy efficient ethernet mode as
> +  broken and request the ethernet to stop advertising it.
> +
> +  eee-broken-10gt:
> +$ref: /schemas/types.yaml#definitions/flag
> +description:
> +  Mark the corresponding energy efficient ethernet mode as
> +  broken and request the ethernet to stop advertising it.
> +
> +  eee-broken-1000kx:
> +$ref: /schemas/types.yaml#definitions/flag
> +description:
> +  Mark the corresponding energy efficient ethernet mode as
> +  broken and request the ethernet to stop advertising it.
> +
> +  eee-broken-10gkx4:
> +$ref: /schemas/types.yaml#definitions/flag
> +description:
> +  Mark the corresponding energy efficient ethernet mode as
> +  broken and request the ethernet to stop advertising it.
> +
> +  eee-broken-10gkr:
> +$ref: /schemas/types.yaml#definitions/flag
> +description:
> +  Mark the corresponding energy efficient ethernet mode as
> +  broken and request the ethernet to stop advertising it.
> +
> +  phy-is-integrated:
> +$ref: /schemas/types.yaml#definitions/flag
> +description:
> +  If set, indicates that the PHY is integrated into the same
> +  physical package as the Ethernet MAC. If needed, muxers
> +  should be configured to ensure the integrated PHY is
> +  used. The absence of this property indicates the muxers
> +  should be configured so that the external PHY is used.
> +
> +  resets:
> +maxItems: 1
> +
> +  reset-names:
> +const: phy
> +
> +  reset-gpios:
> +description:
> +  The GPIO phandle and specifier for the PHY reset signal.

maxItems: 1

I have a meta-schema change to catch this, but It will require updates
to some existing cases.


> +
> +  reset-assert-us:
> +description:
> +  Delay after the reset was asserted in microseconds. If this
> +  property is missing

Re: [PATCH 6/8] dt-bindings: net: stmmac: Convert the binding to a schemas

2019-05-23 Thread Maxime Ripard

On Thu, May 23, 2019 at 11:25:09AM +, Jose Abreu wrote:
> From: Maxime Ripard 
> Date: Thu, May 23, 2019 at 12:07:15
>
> > You can then run make dtbs_check, and those YAML files will be used to
> > validate that any devicetree using those properties are doing it
> > properly. That implies having the right node names, properties, types,
> > ranges of values when relevant, and so on.
>
> Thanks but how can one that's developing know which bindings it shall use?

I'm not quite sure what you mean here. Are you talking about which
file to use, or which property are required, or something else?

> Is this not parsed/prettified and displayed in some kind of webpage ?

Not at the moment, but it's one of the things that are made much
easier by using a formal data format.

> Just that now that the TXT is gone its kind of "strange" to look at YAML
> instead of plain text and develop/use the bindings.

Well, it's kind of the point though. Free-form text was impossible to
parse in a generic way, and you couldn't build any generic tools upon
it. YAML provides that.

Maxime

--
Maxime Ripard, Bootlin
Embedded Linux and Kernel engineering
https://bootlin.com

signature.asc
Description: PGP signature

Re: [PATCH 6/8] dt-bindings: net: stmmac: Convert the binding to a schemas

2019-05-23 Thread Alexandre Torgue


Hi Maxime

On 5/23/19 11:56 AM, Maxime Ripard wrote:

Switch the STMMAC / Synopsys DesignWare MAC controller binding to a YAML
schema to enable the DT validation.

Signed-off-by: Maxime Ripard 
---


First, thanks a lot for this patch. Just one question:
We could add ranges for some properties in order to avoid "bad value" 
for a property. If I understand correctly you do it only for 
snps,dwxgmac, snps,dwxgmac-2.10 and st,spear600-gmac. Why not do it for 
all supported IPs ? (Maybe it is something that we could add later)





  Documentation/devicetree/bindings/net/snps,dwmac.yaml | 344 +++-
  Documentation/devicetree/bindings/net/stmmac.txt  | 179 +--
  2 files changed, 345 insertions(+), 178 deletions(-)
  create mode 100644 Documentation/devicetree/bindings/net/snps,dwmac.yaml

diff --git a/Documentation/devicetree/bindings/net/snps,dwmac.yaml 
b/Documentation/devicetree/bindings/net/snps,dwmac.yaml
new file mode 100644
index ..be3ada5121e1
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/snps,dwmac.yaml
@@ -0,0 +1,344 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/net/snps,dwmac.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Synopsys DesignWare MAC Device Tree Bindings
+
+maintainers:
+  - Alexandre Torgue 
+  - Giuseppe Cavallaro 
+  - Jose Abreu 
+
+properties:
+  compatible:
+oneOf:
+  - const: snps,dwmac
+  - const: snps,dwmac-3.50a
+  - const: snps,dwmac-3.610
+  - const: snps,dwmac-3.70a
+  - const: snps,dwmac-3.710
+  - const: snps,dwmac-4.00
+  - const: snps,dwmac-4.10a
+  - const: snps,dwxgmac
+  - const: snps,dwxgmac-2.10
+  - const: st,spear600-gmac
+description: Deprecated
+
+  reg:
+maxItems: 1
+
+  interrupts:
+minItems: 1
+maxItems: 3
+items:
+  - description: Combined signal for various interrupt events
+  - description: The interrupt to manage the remote wake-up packet 
detection
+  - description: The interrupt that occurs when Rx exits the LPI state
+
+  interrupt-names:
+minItems: 1
+maxItems: 3
+items:
+  - const: macirq
+  - const: eth_wake_irq
+  - const: eth_lpi
+
+  clocks:
+minItems: 1
+maxItems: 3
+items:
+  - description: GMAC main clock
+  - description: Peripheral registers interface clock
+  - description:
+  PTP reference clock. This clock is used for programming the
+  Timestamp Addend Register. If not passed then the system
+  clock will be used and this is fine on some platforms.
+
+  clock-names:
+additionalItems: true
+contains:
+  enum:
+- stmmaceth
+- pclk
+- ptp_ref
+
+  resets:
+maxItems: 1
+description:
+  MAC Reset signal.
+
+  reset-names:
+const: stmmaceth
+
+  snps,axi-config:
+$ref: /schemas/types.yaml#definitions/phandle
+description:
+  AXI BUS Mode parameters. Phandle to a node that can contain the
+  following properties
+* snps,lpi_en, enable Low Power Interface
+* snps,xit_frm, unlock on WoL
+* snps,wr_osr_lmt, max write outstanding req. limit
+* snps,rd_osr_lmt, max read outstanding req. limit
+* snps,kbbe, do not cross 1KiB boundary.
+* snps,blen, this is a vector of supported burst length.
+* snps,fb, fixed-burst
+* snps,mb, mixed-burst
+* snps,rb, rebuild INCRx Burst
+
+  snps,mtl-rx-config:
+$ref: /schemas/types.yaml#definitions/phandle
+description:
+  Multiple RX Queues parameters. Phandle to a node that can
+  contain the following properties
+* snps,rx-queues-to-use, number of RX queues to be used in the
+  driver
+* Choose one of these RX scheduling algorithms
+  * snps,rx-sched-sp, Strict priority
+  * snps,rx-sched-wsp, Weighted Strict priority
+* For each RX queue
+  * Choose one of these modes
+* snps,dcb-algorithm, Queue to be enabled as DCB
+* snps,avb-algorithm, Queue to be enabled as AVB
+  * snps,map-to-dma-channel, Channel to map
+  * Specifiy specific packet routing
+* snps,route-avcp, AV Untagged Control packets
+* snps,route-ptp, PTP Packets
+* snps,route-dcbcp, DCB Control Packets
+* snps,route-up, Untagged Packets
+* snps,route-multi-broad, Multicast & Broadcast Packets
+  * snps,priority, RX queue priority (Range 0x0 to 0xF)
+
+  snps,mtl-tx-config:
+$ref: /schemas/types.yaml#definitions/phandle
+description:
+  Multiple TX Queues parameters. Phandle to a node that can
+  contain the following properties
+* snps,tx-queues-to-use, number of TX queues to be used in the
+  driver
+* Choose one of these TX scheduling algorithms
+  * snps,tx-sched-wrr, Weighted Round Robin
+  * snps,tx-sched-wfq, Weighted

[PATCH net] cxgb4: offload VLAN flows regardless of VLAN ethtype

2019-05-23 Thread Raju Rangoju

VLAN flows never get offloaded unless ivlan_vld is set in filter spec.
It's not compulsory for vlan_ethtype to be set.

So, always enable ivlan_vld bit for offloading VLAN flows regardless of
vlan_ethtype is set or not.

Fixes: ad9af3e09c (cxgb4: add tc flower match support for vlan)
Signed-off-by: Raju Rangoju 
---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c 
b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c
index 6e2d80008a79..cfaf8f618d1f 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c
@@ -197,6 +197,9 @@ static void cxgb4_process_flow_match(struct net_device *dev,
fs->val.ivlan = vlan_tci;
fs->mask.ivlan = vlan_tci_mask;
 
+   fs->val.ivlan_vld = 1;
+   fs->mask.ivlan_vld = 1;
+
/* Chelsio adapters use ivlan_vld bit to match vlan packets
 * as 802.1Q. Also, when vlan tag is present in packets,
 * ethtype match is used then to match on ethtype of inner
@@ -207,8 +210,6 @@ static void cxgb4_process_flow_match(struct net_device *dev,
 * ethtype value with ethtype of inner header.
 */
if (fs->val.ethtype == ETH_P_8021Q) {
-   fs->val.ivlan_vld = 1;
-   fs->mask.ivlan_vld = 1;
fs->val.ethtype = 0;
fs->mask.ethtype = 0;
}
-- 
2.9.5

Re: [RFC PATCH net-next 0/9] Decoupling PHYLINK from struct net_device

2019-05-23 Thread Maxime Chevallier

Hello Ioana,

On Thu, 23 May 2019 01:20:36 +
Ioana Ciornei  wrote:

>Following two separate discussion threads in:
>  https://www.spinics.net/lists/netdev/msg569087.html
>and:
>  https://www.spinics.net/lists/netdev/msg570450.html
>
>PHYLINK was reworked in order to add a new "raw" interface, alongside
>the one based on struct net_device. The raw interface works by passing
>structures to the owner of the phylink instance through a blocking
>notifier call.
>
>The event API exposed by the new notifier mechanism is a 1:1 mapping to
>the existing PHYLINK mac_ops, plus the PHYLINK fixed-link callback.
>
>PHYLIB (which PHYLINK uses) was reworked to the extent that it does not
>crash when connecting to a PHY and the net_device pointer is NULL.
>
>Lastly, DSA has been reworked in its way that it handles PHYs for ports
>that lack a net_device (CPU and DSA ports).  For these, it was
>previously using PHYLIB and is now using the PHYLINK raw API.

Thanks for this series, I've tested it and it does address the
particular issue I having, with the DSA fixed-link CPU port. I've been
able to use 2500 and 1 Mbps links with your series, on a board
using a 88e6390X (so, the mv88e6xxx driver).

I've also tested it on boards using PPv2, which uses phylink, and
didn't spot any issue, so this looks fine.

I'll be happy to test future versions,

Thanks,

Maxime

Re: Bug or mis configuration for mlx5e lag and multipath

2019-05-23 Thread Roi Dayan



On 20/05/2019 04:53, wenxu wrote:
> Hi Roi & Saeed,
> 
> I just test the mlx5e lag and mutipath feature. There are some suituation the 
> outgoing can't be offloaded.
> 
> ovs configureation as following.
> 
> # ovs-vsctl show
> dfd71dfb-6e22-423e-b088-d2022103af6b
>     Bridge "br0"
>     Port "mlx_pf0vf0"
>     Interface "mlx_pf0vf0"
>     Port gre
>     Interface gre
>     type: gre
>     options: {key="1000", local_ip="172.168.152.75", 
> remote_ip="172.168.152.241"}
>     Port "br0"
>     Interface "br0"
>     type: internal
> 
> set the mlx5e driver:
> 
> 
> modprobe mlx5_core
> echo 0 > /sys/class/net/eth2/device/sriov_numvfs
> echo 0 > /sys/class/net/eth3/device/sriov_numvfs
> echo 2 > /sys/class/net/eth2/device/sriov_numvfs
> echo 2 > /sys/class/net/eth3/device/sriov_numvfs
> lspci -nn | grep Mellanox
> echo :81:00.2 > /sys/bus/pci/drivers/mlx5_core/unbind
> echo :81:00.3 > /sys/bus/pci/drivers/mlx5_core/unbind
> echo :81:03.6 > /sys/bus/pci/drivers/mlx5_core/unbind
> echo :81:03.7 > /sys/bus/pci/drivers/mlx5_core/unbind
> 
> devlink dev eswitch set pci/:81:00.0  mode switchdev encap enable
> devlink dev eswitch set pci/:81:00.1  mode switchdev encap enable
> 
> modprobe bonding mode=802.3ad miimon=100 lacp_rate=1
> ip l del dev bond0
> ifconfig mlx_p0 down
> ifconfig mlx_p1 down
> ip l add dev bond0 type bond mode 802.3ad
> ifconfig bond0 172.168.152.75/24 up
> echo 1 > /sys/class/net/bond0/bonding/xmit_hash_policy
> ip l set dev mlx_p0 master bond0
> ip l set dev mlx_p1 master bond0
> ifconfig mlx_p0 up
> ifconfig mlx_p1 up
> 
> systemctl start openvswitch
> ovs-vsctl set Open_vSwitch . other_config:hw-offload=true
> systemctl restart openvswitch
> 
> 
> mlx_pf0vf0 is assigned to vm. The tc rule show in_hw
> 
> # tc filter ls dev mlx_pf0vf0 ingress
> filter protocol ip pref 2 flower
> filter protocol ip pref 2 flower handle 0x1
>   dst_mac 8e:c0:bd:bf:72:c3
>   src_mac 52:54:00:00:12:75
>   eth_type ipv4
>   ip_tos 0/3
>   ip_flags nofrag
>   in_hw
>     action order 1: tunnel_key set
>     src_ip 172.168.152.75
>     dst_ip 172.168.152.241
>     key_id 1000 pipe
>     index 2 ref 1 bind 1
>  
>     action order 2: mirred (Egress Redirect to device gre_sys) stolen
>  index 2 ref 1 bind 1
> 
> In the vm:  the mlx5e driver enable xps default (by the way I think it is 
> better not enable xps in default kernel can select queue by each flow),  in 
> the lag mode different vf queue associate with hw PF.
> 
> with command taskset -c 2 ping 10.0.0.241
> 
> the packet can be offloaded , the outgoing pf is mlx_p0
> 
> but with command taskset -c 1 ping 10.0.0.241
> 
> the packet can't be offloaded, I can capture the packet on the mlx_pf0vf0, 
> the outgoing pf is mlx_p1. Althrough the tc flower rule show in_hw
> 
> 
> I check with the driver  both mlx_pf0vf0 and peer(mlx_p1) install the tc rule 
> success
> 
> I think it's a problem of lag mode. Or I miss some configureation?
> 
> 
> BR
> 
> wenxu
> 
> 
> 
> 
> 

Hi,

we need to verify the driver detected to be in lag mode and
duplicated the offload rule to both eswitches.
do you see lag map messages in dmesg?
something like "lag map port 1:1 port 2:2"
this is to make sure the driver actually in lag mode.
in this mode a rule added to mlx_pf0vf0 will be added to esw of pf0 and esw of 
pf1.
then when u send a packet it could be handled in esw0 or esw1
if the rule is not in esw1 then it wont be offloaded when using pf1.

thanks,
Roi

Re: [patch net-next 3/7] mlxfw: Propagate error messages through extack

2019-05-23 Thread David Ahern

On 5/23/19 3:45 AM, Jiri Pirko wrote:
> @@ -57,11 +58,13 @@ static int mlxfw_fsm_state_wait(struct mlxfw_dev 
> *mlxfw_dev, u32 fwhandle,
>   if (fsm_state_err != MLXFW_FSM_STATE_ERR_OK) {
>   pr_err("Firmware flash failed: %s\n",
>  mlxfw_fsm_state_err_str[fsm_state_err]);
> + NL_SET_ERR_MSG_MOD(extack, "Firmware flash failed");
>   return -EINVAL;
>   }
>   if (curr_fsm_state != fsm_state) {
>   if (--times == 0) {
>   pr_err("Timeout reached on FSM state change");
> + NL_SET_ERR_MSG_MOD(extack, "Timeout reached on FSM 
> state change");

FSM? Is the meaning obvious to users?

>   return -ETIMEDOUT;
>   }
>   msleep(MLXFW_FSM_STATE_WAIT_CYCLE_MS);
> @@ -76,7 +79,8 @@ static int mlxfw_fsm_state_wait(struct mlxfw_dev 
> *mlxfw_dev, u32 fwhandle,
>  
>  static int mlxfw_flash_component(struct mlxfw_dev *mlxfw_dev,
>u32 fwhandle,
> -  struct mlxfw_mfa2_component *comp)
> +  struct mlxfw_mfa2_component *comp,
> +  struct netlink_ext_ack *extack)
>  {
>   u16 comp_max_write_size;
>   u8 comp_align_bits;
> @@ -96,6 +100,7 @@ static int mlxfw_flash_component(struct mlxfw_dev 
> *mlxfw_dev,
>   if (comp->data_size > comp_max_size) {
>   pr_err("Component %d is of size %d which is bigger than limit 
> %d\n",
>  comp->index, comp->data_size, comp_max_size);
> + NL_SET_ERR_MSG_MOD(extack, "Component is which is bigger than 
> limit");

Need to drop 'is which'.


...

> @@ -156,6 +163,7 @@ static int mlxfw_flash_components(struct mlxfw_dev 
> *mlxfw_dev, u32 fwhandle,
> &component_count);
>   if (err) {
>   pr_err("Could not find device PSID in MFA2 file\n");
> + NL_SET_ERR_MSG_MOD(extack, "Could not find device PSID in MFA2 
> file");

same here, is PSID understood by user?

bonding-devel mail list?

2019-05-23 Thread Bill Carlson


Hello,

Noted the old bonding-devel mail list at sourceforge is no more, is 
there an alternate?


Chasing whether a bond of bonds has an issue my testing hasn't revealed.

Thanks,

--

Bill Carlson

Anything is possible, given Time and Money.

Re: [PATCH 6/8] dt-bindings: net: stmmac: Convert the binding to a schemas

2019-05-23 Thread Rob Herring

On Thu, May 23, 2019 at 4:57 AM Maxime Ripard  wrote:
>
> Switch the STMMAC / Synopsys DesignWare MAC controller binding to a YAML
> schema to enable the DT validation.

You picked an easy one. ;)

>
> Signed-off-by: Maxime Ripard 
> ---
>  Documentation/devicetree/bindings/net/snps,dwmac.yaml | 344 +++-
>  Documentation/devicetree/bindings/net/stmmac.txt  | 179 +--
>  2 files changed, 345 insertions(+), 178 deletions(-)
>  create mode 100644 Documentation/devicetree/bindings/net/snps,dwmac.yaml
>
> diff --git a/Documentation/devicetree/bindings/net/snps,dwmac.yaml 
> b/Documentation/devicetree/bindings/net/snps,dwmac.yaml
> new file mode 100644
> index ..be3ada5121e1
> --- /dev/null
> +++ b/Documentation/devicetree/bindings/net/snps,dwmac.yaml
> @@ -0,0 +1,344 @@
> +# SPDX-License-Identifier: GPL-2.0
> +%YAML 1.2
> +---
> +$id: http://devicetree.org/schemas/net/snps,dwmac.yaml#
> +$schema: http://devicetree.org/meta-schemas/core.yaml#
> +
> +title: Synopsys DesignWare MAC Device Tree Bindings
> +
> +maintainers:
> +  - Alexandre Torgue 
> +  - Giuseppe Cavallaro 
> +  - Jose Abreu 
> +
> +properties:
> +  compatible:
> +oneOf:
> +  - const: snps,dwmac
> +  - const: snps,dwmac-3.50a
> +  - const: snps,dwmac-3.610
> +  - const: snps,dwmac-3.70a
> +  - const: snps,dwmac-3.710
> +  - const: snps,dwmac-4.00
> +  - const: snps,dwmac-4.10a
> +  - const: snps,dwxgmac
> +  - const: snps,dwxgmac-2.10
> +  - const: st,spear600-gmac
> +description: Deprecated

Like the other, just make this an enum.

Though, what to do on deprecated things? If we expect dts files to be
updated, then we should remove or disallow in the schema (e.g. 'prop:
false' for properties). The issue with updating dts files, is it may
break old kernels with new dtbs.

> +
> +  reg:
> +maxItems: 1
> +
> +  interrupts:
> +minItems: 1
> +maxItems: 3
> +items:
> +  - description: Combined signal for various interrupt events
> +  - description: The interrupt to manage the remote wake-up packet 
> detection
> +  - description: The interrupt that occurs when Rx exits the LPI state
> +
> +  interrupt-names:
> +minItems: 1
> +maxItems: 3
> +items:
> +  - const: macirq
> +  - const: eth_wake_irq
> +  - const: eth_lpi
> +
> +  clocks:
> +minItems: 1
> +maxItems: 3
> +items:
> +  - description: GMAC main clock
> +  - description: Peripheral registers interface clock
> +  - description:
> +  PTP reference clock. This clock is used for programming the
> +  Timestamp Addend Register. If not passed then the system
> +  clock will be used and this is fine on some platforms.
> +
> +  clock-names:
> +additionalItems: true

That's not ideal, but what I'd expect on this one.

> +contains:
> +  enum:
> +- stmmaceth
> +- pclk
> +- ptp_ref
> +
> +  resets:
> +maxItems: 1
> +description:
> +  MAC Reset signal.
> +
> +  reset-names:
> +const: stmmaceth
> +
> +  snps,axi-config:
> +$ref: /schemas/types.yaml#definitions/phandle
> +description:
> +  AXI BUS Mode parameters. Phandle to a node that can contain the
> +  following properties
> +* snps,lpi_en, enable Low Power Interface
> +* snps,xit_frm, unlock on WoL
> +* snps,wr_osr_lmt, max write outstanding req. limit
> +* snps,rd_osr_lmt, max read outstanding req. limit
> +* snps,kbbe, do not cross 1KiB boundary.
> +* snps,blen, this is a vector of supported burst length.
> +* snps,fb, fixed-burst
> +* snps,mb, mixed-burst
> +* snps,rb, rebuild INCRx Burst

This obviously needs its own schema, but that can come latter.

> +
> +  snps,mtl-rx-config:
> +$ref: /schemas/types.yaml#definitions/phandle
> +description:
> +  Multiple RX Queues parameters. Phandle to a node that can
> +  contain the following properties
> +* snps,rx-queues-to-use, number of RX queues to be used in the
> +  driver
> +* Choose one of these RX scheduling algorithms
> +  * snps,rx-sched-sp, Strict priority
> +  * snps,rx-sched-wsp, Weighted Strict priority
> +* For each RX queue
> +  * Choose one of these modes
> +* snps,dcb-algorithm, Queue to be enabled as DCB
> +* snps,avb-algorithm, Queue to be enabled as AVB
> +  * snps,map-to-dma-channel, Channel to map
> +  * Specifiy specific packet routing
> +* snps,route-avcp, AV Untagged Control packets
> +* snps,route-ptp, PTP Packets
> +* snps,route-dcbcp, DCB Control Packets
> +* snps,route-up, Untagged Packets
> +* snps,route-multi-broad, Multicast & Broadcast Packets
> +  * snps,priority, RX queue priority (Range 0x0 to 0xF)

This too.

> +
> +  snps,mtl-tx-config:
> +$ref: /schemas/types.yaml#definitions/phan

Re: [RFC] vsock: proposal to support multiple transports at runtime

2019-05-23 Thread Stefan Hajnoczi

On Tue, May 14, 2019 at 10:15:43AM +0200, Stefano Garzarella wrote:
> Hi guys,
> I'm currently interested on implement a multi-transport support for VSOCK in
> order to handle nested VMs.
> 
> As Stefan suggested me, I started to look at this discussion:
> https://lkml.org/lkml/2017/8/17/551
> Below I tried to summarize a proposal for a discussion, following the ideas
> from Dexuan, Jorgen, and Stefan.
> 
> 
> We can define two types of transport that we have to handle at the same time
> (e.g. in a nested VM we would have both types of transport running together):
> 
> - 'host side transport', it runs in the host and it is used to communicate 
> with
>   the guests of a specific hypervisor (KVM, VMWare or HyperV)
> 
>   Should we support multiple 'host side transport' running at the same time?
> 
> - 'guest side transport'. it runs in the guest and it is used to communicate
>   with the host transport

I find this terminology confusing.  Perhaps "host->guest" (your 'host
side transport') and "guest->host" (your 'guest side transport') is
clearer?

Or maybe the nested virtualization terminology of L2 transport (your
'host side transport') and L0 transport (your 'guest side transport')?
Here we are the L1 guest and L0 is the host and L2 is our nested guest.

> 
> 
> The main goal is to find a way to decide what transport use in these cases:
> 1. connect() / sendto()
> 
>   a. use the 'host side transport', if the destination is the guest
>  (dest_cid > VMADDR_CID_HOST).
>  If we want to support multiple 'host side transport' running at the
>  same time, we should assign CIDs uniquely across all transports.
>  In this way, a packet generated by the host side will get directed
>  to the appropriate transport based on the CID

The multiple host side transport case is unlikely to be necessary on x86
where only one hypervisor uses VMX at any given time.  But eventually it
may happen so it's wise to at least allow it in the design.

> 
>   b. use the 'guest side transport', if the destination is the host
>  (dest_cid == VMADDR_CID_HOST)

Makes sense to me.

> 
> 
> 2. listen() / recvfrom()
> 
>   a. use the 'host side transport', if the socket is bound to
>  VMADDR_CID_HOST, or it is bound to VMADDR_CID_ANY and there is no
>  guest transport.
>  We could also define a new VMADDR_CID_LISTEN_FROM_GUEST in order to
>  address this case.
>  If we want to support multiple 'host side transport' running at the
>  same time, we should find a way to allow an application to bound a
>  specific host transport (e.g. adding new VMADDR_CID_LISTEN_FROM_KVM,
>  VMADDR_CID_LISTEN_FROM_VMWARE, VMADDR_CID_LISTEN_FROM_HYPERV)

Hmm...VMADDR_CID_LISTEN_FROM_KVM, VMADDR_CID_LISTEN_FROM_VMWARE,
VMADDR_CID_LISTEN_FROM_HYPERV isn't very flexible.  What if my service
should only be available to a subset of VMware VMs?

Instead it might be more appropriate to use network namespaces to create
independent AF_VSOCK addressing domains.  Then you could have two
separate groups of VMware VMs and selectively listen to just one group.

> 
>   b. use the 'guest side transport', if the socket is bound to local CID
>  different from the VMADDR_CID_HOST (guest CID get with
>  IOCTL_VM_SOCKETS_GET_LOCAL_CID), or it is bound to VMADDR_CID_ANY
>  (to be backward compatible).
>  Also in this case, we could define a new VMADDR_CID_LISTEN_FROM_HOST.

Two additional topics:

1. How will loading af_vsock.ko change?  In particular, can an
   application create a socket in af_vsock.ko without any loaded
   transport?  Can it enter listen state without any loaded transport
   (this seems useful with VMADDR_CID_ANY)?

2. Does your proposed behavior match VMware's existing nested vsock
   semantics?

signature.asc
Description: PGP signature

Re: [PATCH bpf-next v2 1/3] bpf: implement bpf_send_signal() helper

2019-05-23 Thread Daniel Borkmann

On 05/22/2019 07:39 AM, Yonghong Song wrote:
> This patch tries to solve the following specific use case.
> 
> Currently, bpf program can already collect stack traces
> through kernel function get_perf_callchain()
> when certain events happens (e.g., cache miss counter or
> cpu clock counter overflows). But such stack traces are
> not enough for jitted programs, e.g., hhvm (jited php).
> To get real stack trace, jit engine internal data structures
> need to be traversed in order to get the real user functions.
> 
> bpf program itself may not be the best place to traverse
> the jit engine as the traversing logic could be complex and
> it is not a stable interface either.
> 
> Instead, hhvm implements a signal handler,
> e.g. for SIGALARM, and a set of program locations which
> it can dump stack traces. When it receives a signal, it will
> dump the stack in next such program location.
> 
> Such a mechanism can be implemented in the following way:
>   . a perf ring buffer is created between bpf program
> and tracing app.
>   . once a particular event happens, bpf program writes
> to the ring buffer and the tracing app gets notified.
>   . the tracing app sends a signal SIGALARM to the hhvm.
> 
> But this method could have large delays and causing profiling
> results skewed.
> 
> This patch implements bpf_send_signal() helper to send
> a signal to hhvm in real time, resulting in intended stack traces.
> 
> Signed-off-by: Yonghong Song 
> ---
>  include/uapi/linux/bpf.h | 17 +-
>  kernel/trace/bpf_trace.c | 67 
>  2 files changed, 83 insertions(+), 1 deletion(-)
> 
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 63e0cf66f01a..68d4470523a0 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -2672,6 +2672,20 @@ union bpf_attr {
>   *   0 on success.
>   *
>   *   **-ENOENT** if the bpf-local-storage cannot be found.
> + *
> + * int bpf_send_signal(u32 sig)
> + *   Description
> + *   Send signal *sig* to the current task.
> + *   Return
> + *   0 on success or successfully queued.
> + *
> + *   **-EBUSY** if work queue under nmi is full.
> + *
> + *   **-EINVAL** if *sig* is invalid.
> + *
> + *   **-EPERM** if no permission to send the *sig*.
> + *
> + *   **-EAGAIN** if bpf program can try again.
>   */
>  #define __BPF_FUNC_MAPPER(FN)\
>   FN(unspec), \
> @@ -2782,7 +2796,8 @@ union bpf_attr {
>   FN(strtol), \
>   FN(strtoul),\
>   FN(sk_storage_get), \
> - FN(sk_storage_delete),
> + FN(sk_storage_delete),  \
> + FN(send_signal),
>  
>  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
>   * function eBPF program intends to call
> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> index f92d6ad5e080..f8cd0db7289f 100644
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -567,6 +567,58 @@ static const struct bpf_func_proto 
> bpf_probe_read_str_proto = {
>   .arg3_type  = ARG_ANYTHING,
>  };
>  
> +struct send_signal_irq_work {
> + struct irq_work irq_work;
> + u32 sig;
> +};
> +
> +static DEFINE_PER_CPU(struct send_signal_irq_work, send_signal_work);
> +
> +static void do_bpf_send_signal(struct irq_work *entry)
> +{
> + struct send_signal_irq_work *work;
> +
> + work = container_of(entry, struct send_signal_irq_work, irq_work);
> + group_send_sig_info(work->sig, SEND_SIG_PRIV, current, PIDTYPE_TGID);
> +}
> +
> +BPF_CALL_1(bpf_send_signal, u32, sig)
> +{
> + struct send_signal_irq_work *work = NULL;
> +
> + /* Similar to bpf_probe_write_user, task needs to be
> +  * in a sound condition and kernel memory access be
> +  * permitted in order to send signal to the current
> +  * task.
> +  */
> + if (unlikely(current->flags & (PF_KTHREAD | PF_EXITING)))
> + return -EPERM;
> + if (unlikely(uaccess_kernel()))
> + return -EPERM;
> + if (unlikely(!nmi_uaccess_okay()))
> + return -EPERM;
> +
> + if (in_nmi()) {

Hm, bit confused, can't this only be done out of process context in
general since only there current points to e.g. hhvm? I'm probably
missing something. Could you elaborate?

> + work = this_cpu_ptr(&send_signal_work);
> + if (work->irq_work.flags & IRQ_WORK_BUSY)
> + return -EBUSY;
> +
> + work->sig = sig;
> + irq_work_queue(&work->irq_work);
> + return 0;
> + }
> +
> + return group_send_sig_info(sig, SEND_SIG_PRIV, current, PIDTYPE_TGID);
> +

Nit: extra newline slipped in

> +}
> +
> +static const struct bpf_func_proto bpf_send_signal_proto = {
> + .func   = bpf_send_signal,
> + .gpl_only   = false,
> + .ret_type   = RET_INTEGER,
> +

Re: [PATCH net-next] selftests: pmtu: Simplify cleanup and namespace names

2019-05-23 Thread David Ahern

On 5/23/19 1:58 AM, Stefano Brivio wrote:
> Hi David,
> 
> On Wed, 22 May 2019 12:11:06 -0700
> David Ahern  wrote:
> 
>> From: David Ahern 
>>
>> The point of the pause-on-fail argument is to leave the setup as is after
>> a test fails to allow a user to debug why it failed. Move the cleanup
>> after posting the result to the user to make it so.
>>
>> Random names for the namespaces are not user friendly when trying to
>> debug a failure. Make them simpler and more direct for the tests. Run
>> cleanup at the beginning to ensure they are cleaned up if they already
>> exist.
> 
> The reasons for picking per-instance unique names were:
> 
> - one can run multiple instances of the script in parallel. I
>   couldn't trigger any bug this way *so far*, though
> 
> - cleanup might fail because of e.g. device reference count leaks (this
>   happened quite frequently in the past), which are anyway visible in
>   kernel logs. Unique names avoid the need to reboot
> 
> Sure, it's a trade-off with usability, and I also see the value of
> having fixed names, so I'm fine with this too. I just wanted to make
> sure you considered these points.
> 
> By the way, the comment to nsname() (that I would keep, it's still
> somewhat convenient) is now inconsistent.
> 

I have been using the namespace override for a while now. I did consider
impacts to the above, but my thinking is this: exceptions are per FIB
entry (per fib6_nh with my latest patch set, but point still holds), FIB
entries are per FIB table, FIB tables are per network namespace. Running
multiple pmtu.sh sessions in parallel can not trigger an interdependent
bug because of that separation. The cleanup within a namespace teardown
(reference count leaks) should not be affected.

Now that we have good set of functional tests, we do need more complex
tests but those will still be contained within the namespace separation.
If you look at my current patch set on the list I add an icmp_redirect
test script. It actually does redirect, verify, mtu on top of redirect,
verify and then resets and inverts the order - going after an exception
entry with an update for both use cases.

For the pmtu script, perhaps the next step is something as simple as
configuring the setup and routing once and then run all of the
individual tests (or multiple of them) to generate multiple exceptions
within a single FIB table and then tests to generate multiple exceptions
with different addresses per entry.

[PATCH] bpf: sockmap, fix use after free from sleep in psock backlog workqueue

2019-05-23 Thread John Fastabend

Backlog work for psock (sk_psock_backlog) might sleep while waiting
for memory to free up when sending packets. However, while sleeping
the socket may be closed and removed from the map by the user space
side.

This breaks an assumption in sk_stream_wait_memory, which expects the
wait queue to be still there when it wakes up resulting in a
use-after-free shown below. To fix his mark sendmsg as MSG_DONTWAIT
to avoid the sleep altogether. We already set the flag for the
sendpage case but we missed the case were sendmsg is used.
Sockmap is currently the only user of skb_send_sock_locked() so only
the sockmap paths should be impacted.

==
BUG: KASAN: use-after-free in remove_wait_queue+0x31/0x70
Write of size 8 at addr 888069a0c4e8 by task kworker/0:2/110

CPU: 0 PID: 110 Comm: kworker/0:2 Not tainted 
5.0.0-rc2-00335-g28f9d1a3d4fe-dirty #14
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-2.fc27 
04/01/2014
Workqueue: events sk_psock_backlog
Call Trace:
 print_address_description+0x6e/0x2b0
 ? remove_wait_queue+0x31/0x70
 kasan_report+0xfd/0x177
 ? remove_wait_queue+0x31/0x70
 ? remove_wait_queue+0x31/0x70
 remove_wait_queue+0x31/0x70
 sk_stream_wait_memory+0x4dd/0x5f0
 ? sk_stream_wait_close+0x1b0/0x1b0
 ? wait_woken+0xc0/0xc0
 ? tcp_current_mss+0xc5/0x110
 tcp_sendmsg_locked+0x634/0x15d0
 ? tcp_set_state+0x2e0/0x2e0
 ? __kasan_slab_free+0x1d1/0x230
 ? kmem_cache_free+0x70/0x140
 ? sk_psock_backlog+0x40c/0x4b0
 ? process_one_work+0x40b/0x660
 ? worker_thread+0x82/0x680
 ? kthread+0x1b9/0x1e0
 ? ret_from_fork+0x1f/0x30
 ? check_preempt_curr+0xaf/0x130
 ? iov_iter_kvec+0x5f/0x70
 ? kernel_sendmsg_locked+0xa0/0xe0
 skb_send_sock_locked+0x273/0x3c0
 ? skb_splice_bits+0x180/0x180
 ? start_thread+0xe0/0xe0
 ? update_min_vruntime.constprop.27+0x88/0xc0
 sk_psock_backlog+0xb3/0x4b0
 ? strscpy+0xbf/0x1e0
 process_one_work+0x40b/0x660
 worker_thread+0x82/0x680
 ? process_one_work+0x660/0x660
 kthread+0x1b9/0x1e0
 ? __kthread_create_on_node+0x250/0x250
 ret_from_fork+0x1f/0x30

Fixes: 20bf50de3028c ("skbuff: Function to send an skbuf on a socket")
Reported-by: Jakub Sitnicki 
Tested-by: Jakub Sitnicki 
Signed-off-by: John Fastabend 
---
 net/core/skbuff.c |1 +
 1 file changed, 1 insertion(+)

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index e89be62..c3b03c5 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2337,6 +2337,7 @@ int skb_send_sock_locked(struct sock *sk, struct sk_buff 
*skb, int offset,
kv.iov_base = skb->data + offset;
kv.iov_len = slen;
memset(&msg, 0, sizeof(msg));
+   msg.flags = MSG_DONTWAIT;
 
ret = kernel_sendmsg_locked(sk, &msg, &kv, 1, slen);
if (ret <= 0)

Re: [PATCH 7/8] dt-bindings: net: sun7i-gmac: Convert the binding to a schemas

2019-05-23 Thread Rob Herring

On Thu, May 23, 2019 at 4:57 AM Maxime Ripard  wrote:
>
> Switch our Allwinner A20 GMAC controller binding to a YAML schema to enable
> the DT validation. Since that controller is based on a Synopsys IP, let's
> add the validation to that schemas with a bunch of conditionals.
>
> Signed-off-by: Maxime Ripard 
> ---
>  Documentation/devicetree/bindings/net/allwinner,sun7i-a20-gmac.txt | 27 
> ---
>  Documentation/devicetree/bindings/net/snps,dwmac.yaml  | 45 
> +
>  2 files changed, 45 insertions(+), 27 deletions(-)
>  delete mode 100644 
> Documentation/devicetree/bindings/net/allwinner,sun7i-a20-gmac.txt

I think it would be better to keep these as separate files and either
include snps,dwmac.yaml from it or only add the compatible to
snps,dwmac.yaml.

Rob

Re: [PATCH net] sk_msg: Keep reference on socket file while psock lives

2019-05-23 Thread John Fastabend

[...]

> 
> Thanks for taking a look at it. Setting MSG_DONTWAIT works great for
> me. No more crashes in sk_stream_wait_memory. I've tested it on top of
> current bpf-next (f49aa1de9836). Here's my:
> 
>   Tested-by: Jakub Sitnicki 
> 
> The actual I've tested is below, for completeness.
> 
> BTW. I've ran into another crash which I haven't seen before while
> testing sockmap-echo, but it looks unrelated:
> 
>   https://lore.kernel.org/netdev/20190522100142.28925-1-ja...@cloudflare.com/
> 
> -Jakub
> 
> --- 8< ---
> 
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index e89be6282693..4a7c656b195b 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -2337,6 +2337,7 @@ int skb_send_sock_locked(struct sock *sk, struct 
> sk_buff *skb, int offset,
> kv.iov_base = skb->data + offset;
> kv.iov_len = slen;
> memset(&msg, 0, sizeof(msg));
> +   msg.msg_flags = MSG_DONTWAIT;
> 
> ret = kernel_sendmsg_locked(sk, &msg, &kv, 1, slen);
> if (ret <= 0)

I went ahead and submitted this feel free to add your signed-off-by.

Thanks,
John

Re: [PATCH bpf-next v2 1/3] bpf: implement bpf_send_signal() helper

2019-05-23 Thread Yonghong Song



On 5/23/19 8:41 AM, Daniel Borkmann wrote:
> On 05/22/2019 07:39 AM, Yonghong Song wrote:
>> This patch tries to solve the following specific use case.
>>
>> Currently, bpf program can already collect stack traces
>> through kernel function get_perf_callchain()
>> when certain events happens (e.g., cache miss counter or
>> cpu clock counter overflows). But such stack traces are
>> not enough for jitted programs, e.g., hhvm (jited php).
>> To get real stack trace, jit engine internal data structures
>> need to be traversed in order to get the real user functions.
>>
>> bpf program itself may not be the best place to traverse
>> the jit engine as the traversing logic could be complex and
>> it is not a stable interface either.
>>
>> Instead, hhvm implements a signal handler,
>> e.g. for SIGALARM, and a set of program locations which
>> it can dump stack traces. When it receives a signal, it will
>> dump the stack in next such program location.
>>
>> Such a mechanism can be implemented in the following way:
>>. a perf ring buffer is created between bpf program
>>  and tracing app.
>>. once a particular event happens, bpf program writes
>>  to the ring buffer and the tracing app gets notified.
>>. the tracing app sends a signal SIGALARM to the hhvm.
>>
>> But this method could have large delays and causing profiling
>> results skewed.
>>
>> This patch implements bpf_send_signal() helper to send
>> a signal to hhvm in real time, resulting in intended stack traces.
>>
>> Signed-off-by: Yonghong Song 
>> ---
>>   include/uapi/linux/bpf.h | 17 +-
>>   kernel/trace/bpf_trace.c | 67 
>>   2 files changed, 83 insertions(+), 1 deletion(-)
>>
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index 63e0cf66f01a..68d4470523a0 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -2672,6 +2672,20 @@ union bpf_attr {
>>* 0 on success.
>>*
>>* **-ENOENT** if the bpf-local-storage cannot be found.
>> + *
>> + * int bpf_send_signal(u32 sig)
>> + *  Description
>> + *  Send signal *sig* to the current task.
>> + *  Return
>> + *  0 on success or successfully queued.
>> + *
>> + *  **-EBUSY** if work queue under nmi is full.
>> + *
>> + *  **-EINVAL** if *sig* is invalid.
>> + *
>> + *  **-EPERM** if no permission to send the *sig*.
>> + *
>> + *  **-EAGAIN** if bpf program can try again.
>>*/
>>   #define __BPF_FUNC_MAPPER(FN)  \
>>  FN(unspec), \
>> @@ -2782,7 +2796,8 @@ union bpf_attr {
>>  FN(strtol), \
>>  FN(strtoul),\
>>  FN(sk_storage_get), \
>> -FN(sk_storage_delete),
>> +FN(sk_storage_delete),  \
>> +FN(send_signal),
>>   
>>   /* integer value in 'imm' field of BPF_CALL instruction selects which 
>> helper
>>* function eBPF program intends to call
>> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
>> index f92d6ad5e080..f8cd0db7289f 100644
>> --- a/kernel/trace/bpf_trace.c
>> +++ b/kernel/trace/bpf_trace.c
>> @@ -567,6 +567,58 @@ static const struct bpf_func_proto 
>> bpf_probe_read_str_proto = {
>>  .arg3_type  = ARG_ANYTHING,
>>   };
>>   
>> +struct send_signal_irq_work {
>> +struct irq_work irq_work;
>> +u32 sig;
>> +};
>> +
>> +static DEFINE_PER_CPU(struct send_signal_irq_work, send_signal_work);
>> +
>> +static void do_bpf_send_signal(struct irq_work *entry)
>> +{
>> +struct send_signal_irq_work *work;
>> +
>> +work = container_of(entry, struct send_signal_irq_work, irq_work);
>> +group_send_sig_info(work->sig, SEND_SIG_PRIV, current, PIDTYPE_TGID);
>> +}
>> +
>> +BPF_CALL_1(bpf_send_signal, u32, sig)
>> +{
>> +struct send_signal_irq_work *work = NULL;
>> +
>> +/* Similar to bpf_probe_write_user, task needs to be
>> + * in a sound condition and kernel memory access be
>> + * permitted in order to send signal to the current
>> + * task.
>> + */
>> +if (unlikely(current->flags & (PF_KTHREAD | PF_EXITING)))
>> +return -EPERM;
>> +if (unlikely(uaccess_kernel()))
>> +return -EPERM;
>> +if (unlikely(!nmi_uaccess_okay()))
>> +return -EPERM;
>> +
>> +if (in_nmi()) {
> 
> Hm, bit confused, can't this only be done out of process context in
> general since only there current points to e.g. hhvm? I'm probably
> missing something. Could you elaborate?

That is true. If in nmi, it is out of process context and in nmi 
context, we use an irq_work here since group_send_sig_info() has
spinlock inside. The bpf program (e.g., a perf_event program) needs to 
check it is with right current (e.g., by pid) before calling
this helper.

Does this address your question?

> 
>> +work = this_cpu_ptr(&send_signal_work);
>> +if (work->irq_work.flags & IRQ_WORK_BUSY)
>> +

Re: [PATCH bpf-next 06/12] selftests/bpf: add tests for libbpf's hashmap

2019-05-23 Thread Stanislav Fomichev

On 05/22, Andrii Nakryiko wrote:
> On Wed, May 22, 2019 at 3:15 PM Andrii Nakryiko
>  wrote:
> >
> > On Wed, May 22, 2019 at 1:31 PM Stanislav Fomichev  wrote:
> > >
> > > On 05/22, Andrii Nakryiko wrote:
> > > > Test all APIs for internal hashmap implementation.
> > > >
> > > > Signed-off-by: Andrii Nakryiko 
> > > > ---
> > > >  tools/testing/selftests/bpf/.gitignore |   1 +
> > > >  tools/testing/selftests/bpf/Makefile   |   2 +-
> > > >  tools/testing/selftests/bpf/test_hashmap.c | 382 +
> > > >  3 files changed, 384 insertions(+), 1 deletion(-)
> > > >  create mode 100644 tools/testing/selftests/bpf/test_hashmap.c
> > > >
> > > > diff --git a/tools/testing/selftests/bpf/.gitignore 
> > > > b/tools/testing/selftests/bpf/.gitignore
> > > > index dd5d69529382..138b6c063916 100644
> > > > --- a/tools/testing/selftests/bpf/.gitignore
> > > > +++ b/tools/testing/selftests/bpf/.gitignore
> > > > @@ -35,3 +35,4 @@ test_sysctl
> > > >  alu32
> > > >  libbpf.pc
> > > >  libbpf.so.*
> > > > +test_hashmap
> > > > diff --git a/tools/testing/selftests/bpf/Makefile 
> > > > b/tools/testing/selftests/bpf/Makefile
> > > > index 66f2dca1dee1..ddae06498a00 100644
> > > > --- a/tools/testing/selftests/bpf/Makefile
> > > > +++ b/tools/testing/selftests/bpf/Makefile
> > > > @@ -23,7 +23,7 @@ TEST_GEN_PROGS = test_verifier test_tag test_maps 
> > > > test_lru_map test_lpm_map test
> > > >   test_align test_verifier_log test_dev_cgroup test_tcpbpf_user \
> > > >   test_sock test_btf test_sockmap test_lirc_mode2_user 
> > > > get_cgroup_id_user \
> > > >   test_socket_cookie test_cgroup_storage test_select_reuseport 
> > > > test_section_names \
> > > > - test_netcnt test_tcpnotify_user test_sock_fields test_sysctl
> > > > + test_netcnt test_tcpnotify_user test_sock_fields test_sysctl 
> > > > test_hashmap
> > > >
> > > >  BPF_OBJ_FILES = $(patsubst %.c,%.o, $(notdir $(wildcard progs/*.c)))
> > > >  TEST_GEN_FILES = $(BPF_OBJ_FILES)
> > > > diff --git a/tools/testing/selftests/bpf/test_hashmap.c 
> > > > b/tools/testing/selftests/bpf/test_hashmap.c
> > > > new file mode 100644
> > > > index ..b64094c981e3
> > > > --- /dev/null
> > > [..]
> > > > +++ b/tools/testing/selftests/bpf/test_hashmap.c
> > > Any reason against putting it in bpf/prog_tests? No need for your own
> > > CHECK macro and no Makefile changes.
> >
> > I didn't know they are special and I assumed they are mostly for
> > loading BPF programs and testing them. I'll check that set up and will
> > move there, if it makes sense.
> 
> Seems like my assumption was right. I converted these hashmap tests to
> progs_test, and each CHECK usage emits either SUCCESS or FAIL message,
> spamming output like this:
> 
> test_hashmap_generic:PASS:test_hashmap_generic 0 nsec
> test_hashmap_generic:PASS:test_hashmap_generic 0 nsec
> test_hashmap_generic:PASS:test_hashmap_generic 0 nsec
> test_hashmap_generic:PASS:test_hashmap_generic 0 nsec
> test_hashmap_generic:PASS:test_hashmap_generic 0 nsec
> test_hashmap_generic:PASS:test_hashmap_generic 0 nsec
> test_hashmap_generic:PASS:test_hashmap_generic 0 nsec
> test_hashmap_generic:PASS:test_hashmap_generic 0 nsec
> test_hashmap_generic:PASS:test_hashmap_generic 0 nsec
> test_hashmap_generic:PASS:test_hashmap_generic 0 nsec
> 
> 
> CHECK also expects __u32 duration to be in the scope, so I need to add
> it just to make macro happy.
> 
> It feels like at the moment this "framework" is not yet well-suited
> for generic tests, but more towards loading BPF program and checking
> (once per BPF program) that everything went fine. I'll keep it as is
> for now. It won't be hard to convert it into prog_tests, once we
> develop it a bit more.
Oh, yeah, it's mostly for the tests which do bpf_prog_test_run.
Might not be a good fit if you're not loading any bpf progs, agreed.

> > > > @@ -0,0 +1,382 @@
> > > > +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
> > > > +
> > > > +/*
> > > > + * Tests for libbpf's hashmap.
> > > > + *
> > > > + * Copyright (c) 2019 Facebook
> > > > + */
> > > > +#include 
> > > > +#include 
> > > > +#include 
> > > > +#include "hashmap.h"
> > > > +
> > > > +#define CHECK(condition, format...) ({ 
> > > >   \
> > > > + int __ret = !!(condition);  \
> > > > + if (__ret) {\
> > > > + fprintf(stderr, "%s:%d:FAIL ", __func__, __LINE__); \
> > > > + fprintf(stderr, format);\
> > > > + }   \
> > > > + __ret;  \
> > > > +})
> > > > +
> > > > +size_t hash_fn(const void *k, void *ctx)
> > > > +{
> > > > + return (long)k;
> > > > +}
> > > > +
> > > > +bool equal_fn(const void *a, const void *b, void *ctx)
> > > > +{
> > > > + return (long)a ==

Re: [PATCH bpf] selftests: bpf: add zero extend checks for ALU32 and/or/xor

2019-05-23 Thread Björn Töpel

On Thu, 23 May 2019 at 16:31, Jiong Wang  wrote:
>
>
> > On 23 May 2019, at 15:02, Daniel Borkmann  wrote:
> >
> > On 05/23/2019 08:38 AM, Y Song wrote:
> >> On Wed, May 22, 2019 at 1:46 PM Björn Töpel  wrote:
> >>> On Wed, 22 May 2019 at 20:13, Y Song  wrote:
>  On Wed, May 22, 2019 at 2:25 AM Björn Töpel  
>  wrote:
> >
> > Add three tests to test_verifier/basic_instr that make sure that the
> > high 32-bits of the destination register is cleared after an ALU32
> > and/or/xor.
> >
> > Signed-off-by: Björn Töpel 
> 
>  I think the patch intends for bpf-next, right? The patch itself looks
>  good to me.
>  Acked-by: Yonghong Song 
> >>>
> >>> Thank you. Actually, it was intended for the bpf tree, as a test
> >>> follow up for this [1] fix.
> >> Then maybe you want to add a Fixes tag and resubmit?
> >
> > Why would the test case need a fixes tag? It's common practice that we have
> > BPF fixes that we queue to bpf tree along with kselftest test cases related
> > to them. Therefore, applied as well, thanks for following up!
> >
> > Björn, in my email from the fix, I mentioned we should have test snippets
> > ideally for all of the alu32 insns to not miss something falling through the
> > cracks when JITs get added or changed. If you have some cycles to add the
> > remaining missing ones, that would be much appreciated.
>
> Björn,
>
>   If you don’t have time, I can take this alu32 test case follow up as well.
>

Jiong, that would be great. Thank you. I'd guess it would take much
longer for me to do it (gmail.com time vs intel.com time ;-)).


Björn

> Regards,
> Jiong
>
> >
> > Thanks,
> > Daniel
>

[PATCH net-next v4] net: sched: Introduce act_ctinfo action

2019-05-23 Thread Kevin 'ldir' Darbyshire-Bryant

ctinfo is a new tc filter action module.  It is designed to restore
information contained in conntrack marks to other places.  At present it
can restore DSCP values to IPv4/6 diffserv fields and also copy
conntrack marks to skb marks.  As such the 2nd function effectively
replaces the existing act_connmark module

The DSCP restoration is intended for use and has been found useful for
restoring ingress classifications based on egress classifications across
links that bleach or otherwise change DSCP, typically home ISP Internet
links.  Restoring DSCP on ingress on the WAN link allows qdiscs such as
CAKE to shape inbound packets according to policies that are easier to
indicate on egress.

Ingress classification is traditionally a challenging task since
iptables rules haven't yet run and tc filter/eBPF programs are pre-NAT
lookups, hence are unable to see internal IPv4 addresses as used on the
typical home masquerading gateway.

ctinfo understands the following parameters:

dscp dscpmask[/statemask]

dscpmask - a 32 bit mask of at least 6 contiguous bits and indicates
where ctinfo will find the DSCP bits stored in the conntrack mark.

statemask - a 32 bit mask of (usually) 1 bit length, outside the area
specified by dscpmask.  This represents a conditional operation flag
whereby the DSCP is only restored if the flag is set.  This is useful to
implement a 'one shot' iptables based classification where the
'complicated' iptables rules are only run once to classify the
connection on initial (egress) packet and subsequent packets are all
marked/restored with the same DSCP.  A mask of zero disables the
conditional behaviour ie. the conntrack mark DSCP bits are always
restored to the ip diffserv field (assuming the conntrack entry is found
& the skb is an ipv4/ipv6 type)

mark [mask]

mark - enables copying the conntrack connmark value to the skb mark

mask - a 32 bit mask applied to the mark to mask out bit unwanted for
restoration.  The CAKE qdisc for example understands both DSCP and 'tin'
classification stored the mark, thus act_ctinfo may be used to restore
both aspects of classification for CAKE in one action.  A default mask
of 0x is applied if not specified.

zone - conntrack zone

control - action related control (reclassify | pipe | drop | continue |
ok | goto chain )

e.g. dscp 0xfc00/0x0100

|0xFCconntrack mark00---|
| Bits 31-26 | bit 25 | bit24 |~~~ Bit 0|
| DSCP   | unused | flag  |unused   |
|---0x01---00---|
  |   |
  |   |
  ---| Conditional flag
 v only restore if set
|-ip diffserv-|
| 6 bits  |
|-|

e.g. mark 0x00ff

|0x00conntrack markff---|
| Bits 31-24 |  |
| DSCP & flag|  |
|---|
|
|
v
|skb mark---|
|   |
|   |
|---|

Signed-off-by: Kevin Darbyshire-Bryant 
---
v2 - add equivalent connmark functionality with an enhancement
 to accept a mask
 pass statistics for each sub-function as individual netlink
 attributes and stop (ab)using overlimits, drops
 update the testing config correctly
v3 - fix a licensing silly & tidy up GPL boilerplate
v4 - drop stray copy paste inline
 reverse christmas tree local vars

 include/net/tc_act/tc_ctinfo.h|  28 ++
 include/uapi/linux/pkt_cls.h  |   1 +
 include/uapi/linux/tc_act/tc_ctinfo.h |  43 +++
 net/sched/Kconfig |  17 +
 net/sched/Makefile|   1 +
 net/sched/act_ctinfo.c| 402 ++
 tools/testing/selftests/tc-testing/config |   1 +
 7 files changed, 493 insertions(+)
 create mode 100644 include/net/tc_act/tc_ctinfo.h
 create mode 100644 include/uapi/linux/tc_act/tc_ctinfo.h
 create mode 100644 net/sched/act_ctinfo.c

diff --git a/include/net/tc_act/tc_ctinfo.h b/include/net/tc_act/tc_ctinfo.h
new file mode 100644
index ..87334120dcb6
--- /dev/null
+++ b/include/net/tc_act/tc_ctinfo.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __NET_TC_CTINFO_H
+#define __NET_TC_CTINFO_H
+
+#include 
+
+struct tcf_ctinfo_params {
+   struct net *net;
+   u32 dscpmask;
+   u32 dscpstatemask;
+   u32 markmask;
+   u16 zone;
+   u8 mode;
+   u8 dscpmaskshift;
+   struct rcu_head rcu;
+};
+
+struct tcf_ctinfo {
+   struct tc_action common;
+   struct tcf_ctinfo_params __rcu *params;
+   u64 stats_dscp_set;
+   u64 stats_dscp_error;
+   u64 stats_mark_set;
+};
+
+#define to_ctinfo(a) ((struct tcf_ctinfo *)a)
+
+#endif /* __NET_TC_CTINFO_H */
diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt

Re: [RESEND][PATCH] Fix MACsec kernel panics, oopses and bugs

2019-05-23 Thread David Miller

From: Andreas Steinmetz 
Date: Thu, 23 May 2019 09:46:15 +0200

> MACsec causes oopses followed by a kernel panic when attached directly or 
> indirectly to a bridge. It causes erroneous
> checksum messages when attached to vxlan. When I did investigate I did find 
> skb leaks, apparent skb mis-handling and
> superfluous code. The attached patch fixes all MACsec misbehaviour I could 
> find. As I am no kernel developer somebody
> with sufficient kernel network knowledge should verify and correct the patch 
> where necessary.
> 
> Signed-off-by: Andreas Steinmetz 

Subject lines should be of the form:

[PATCH $DST_TREE] $subsystem_prefix: Description.

Where $DST_TREE here would be "net" and $subsystem_prefix would be "macsec".

> + /* FIXME: any better way to prevent calls to netdev_rx_csum_fault? */
> + skb->csum_complete_sw = 1;

Create a helper for this in linux/skbuff.h with very clear and clean comments
explaining what is going on.

Re: [PATCH v3 net-next 0/3] flow_offload: Re-add per-action statistics

2019-05-23 Thread Jakub Kicinski

On Thu, 23 May 2019 09:19:49 -0400, Jamal Hadi Salim wrote:
> On 2019-05-22 6:20 p.m., Jakub Kicinski wrote:
> > On Wed, 22 May 2019 22:37:16 +0100, Edward Cree wrote:  
> >> * removed RFC tags  
> > 
> > Why?  There is still no upstream user for this (my previous
> > objections of this being only partially correct aside).  
> 
> 
> IIRC your point was to get the dumping to work with RTM_GETACTION.

Yup.

> That would still work here, no? There will be some latency
> based on the frequency of hardware->kernel stats updates.

I don't think so, I think the stats are only updated on classifier
dumps in Ed's code.  But we can't be 100% sure without seeing driver
code.

Re: [patch net-next v2] devlink: add warning in case driver does not set port type

2019-05-23 Thread David Miller

From: Jiri Pirko 
Date: Thu, 23 May 2019 10:43:35 +0200

> From: Jiri Pirko 
> 
> Prevent misbehavior of drivers who would not set port type for longer
> period of time. Drivers should always set port type. Do WARN if that
> happens.
> 
> Note that it is perfectly fine to temporarily not have the type set,
> during initialization and port type change.
> 
> Signed-off-by: Jiri Pirko 
> ---
> v1->v2:
> - Don't warn on DSA and CPU ports.

Applied.

Re: [PATCH v7 bpf-next 01/16] bpf: verifier: mark verified-insn with sub-register zext flag

2019-05-23 Thread Alexei Starovoitov

On Thu, May 23, 2019 at 03:28:15PM +0100, Jiong Wang wrote:
> 
> > On 23 May 2019, at 03:07, Alexei Starovoitov  
> > wrote:
> > 
> > On Wed, May 22, 2019 at 07:54:57PM +0100, Jiong Wang wrote:
> >> eBPF ISA specification requires high 32-bit cleared when low 32-bit
> >> sub-register is written. This applies to destination register of ALU32 etc.
> >> JIT back-ends must guarantee this semantic when doing code-gen. x86_64 and
> >> AArch64 ISA has the same semantics, so the corresponding JIT back-end
> >> doesn't need to do extra work.
> >> 
> >> However, 32-bit arches (arm, x86, nfp etc.) and some other 64-bit arches
> >> (PowerPC, SPARC etc) need to do explicit zero extension to meet this
> >> requirement, otherwise code like the following will fail.
> >> 
> >>  u64_value = (u64) u32_value
> >>  ... other uses of u64_value
> >> 
> >> This is because compiler could exploit the semantic described above and
> >> save those zero extensions for extending u32_value to u64_value, these JIT
> >> back-ends are expected to guarantee this through inserting extra zero
> >> extensions which however could be a significant increase on the code size.
> >> Some benchmarks show there could be ~40% sub-register writes out of total
> >> insns, meaning at least ~40% extra code-gen.
> >> 
> >> One observation is these extra zero extensions are not always necessary.
> >> Take above code snippet for example, it is possible u32_value will never be
> >> casted into a u64, the value of high 32-bit of u32_value then could be
> >> ignored and extra zero extension could be eliminated.
> >> 
> >> This patch implements this idea, insns defining sub-registers will be
> >> marked when the high 32-bit of the defined sub-register matters. For
> >> those unmarked insns, it is safe to eliminate high 32-bit clearnace for
> >> them.
> >> 
> >> Algo:
> >> - Split read flags into READ32 and READ64.
> >> 
> >> - Record index of insn that does sub-register write. Keep the index inside
> >>   reg state and update it during verifier insn walking.
> >> 
> >> - A full register read on a sub-register marks its definition insn as
> >>   needing zero extension on dst register.
> >> 
> >>   A new sub-register write overrides the old one.
> >> 
> >> - When propagating read64 during path pruning, also mark any insn defining
> >>   a sub-register that is read in the pruned path as full-register.
> >> 
> >> Reviewed-by: Jakub Kicinski 
> >> Signed-off-by: Jiong Wang 
> >> ---
> >> include/linux/bpf_verifier.h |  14 +++-
> >> kernel/bpf/verifier.c| 175 
> >> +++
> >> 2 files changed, 173 insertions(+), 16 deletions(-)
> >> 
> >> diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
> >> index 1305ccb..60fb54e 100644
> >> --- a/include/linux/bpf_verifier.h
> >> +++ b/include/linux/bpf_verifier.h
> >> @@ -36,9 +36,11 @@
> >>  */
> >> enum bpf_reg_liveness {
> >>REG_LIVE_NONE = 0, /* reg hasn't been read or written this branch */
> >> -  REG_LIVE_READ, /* reg was read, so we're sensitive to initial value */
> >> -  REG_LIVE_WRITTEN, /* reg was written first, screening off later reads */
> >> -  REG_LIVE_DONE = 4, /* liveness won't be updating this register anymore 
> >> */
> >> +  REG_LIVE_READ32 = 0x1, /* reg was read, so we're sensitive to initial 
> >> value */
> >> +  REG_LIVE_READ64 = 0x2, /* likewise, but full 64-bit content matters */
> >> +  REG_LIVE_READ = REG_LIVE_READ32 | REG_LIVE_READ64,
> >> +  REG_LIVE_WRITTEN = 0x4, /* reg was written first, screening off later 
> >> reads */
> >> +  REG_LIVE_DONE = 0x8, /* liveness won't be updating this register 
> >> anymore */
> >> };
> >> 
> >> struct bpf_reg_state {
> >> @@ -131,6 +133,11 @@ struct bpf_reg_state {
> >> * pointing to bpf_func_state.
> >> */
> >>u32 frameno;
> >> +  /* Tracks subreg definition. The stored value is the insn_idx of the
> >> +   * writing insn. This is safe because subreg_def is used before any insn
> >> +   * patching which only happens after main verification finished.
> >> +   */
> >> +  s32 subreg_def;
> >>enum bpf_reg_liveness live;
> >> };
> >> 
> >> @@ -232,6 +239,7 @@ struct bpf_insn_aux_data {
> >>int ctx_field_size; /* the ctx field size for load insn, maybe 0 */
> >>int sanitize_stack_off; /* stack slot to be cleared */
> >>bool seen; /* this insn was processed by the verifier */
> >> +  bool zext_dst; /* this insn zero extends dst reg */
> >>u8 alu_state; /* used in combination with alu_limit */
> >>unsigned int orig_idx; /* original instruction index */
> >> };
> >> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> >> index 95f93544..0efccf8 100644
> >> --- a/kernel/bpf/verifier.c
> >> +++ b/kernel/bpf/verifier.c
> >> @@ -981,6 +981,7 @@ static void mark_reg_not_init(struct bpf_verifier_env 
> >> *env,
> >>__mark_reg_not_init(regs + regno);
> >> }
> >> 
> >> +#define DEF_NOT_SUBREG(-1)
> >> static void init_reg_state(struct bpf_verifier_env *env,

Re: [PATCH net-next] selftests: pmtu: Simplify cleanup and namespace names

2019-05-23 Thread Stefano Brivio

On Thu, 23 May 2019 09:41:59 -0600
David Ahern  wrote:

> I have been using the namespace override for a while now. I did consider
> impacts to the above, but my thinking is this: exceptions are per FIB
> entry (per fib6_nh with my latest patch set, but point still holds), FIB
> entries are per FIB table, FIB tables are per network namespace. Running
> multiple pmtu.sh sessions in parallel can not trigger an interdependent
> bug because of that separation. The cleanup within a namespace teardown
> (reference count leaks) should not be affected.

I see, I guess it makes sense.

> Now that we have good set of functional tests, we do need more complex
> tests but those will still be contained within the namespace separation.
> If you look at my current patch set on the list I add an icmp_redirect
> test script. It actually does redirect, verify, mtu on top of redirect,
> verify and then resets and inverts the order - going after an exception
> entry with an update for both use cases.
> 
> For the pmtu script, perhaps the next step is something as simple as
> configuring the setup and routing once and then run all of the
> individual tests (or multiple of them) to generate multiple exceptions
> within a single FIB table and then tests to generate multiple exceptions
> with different addresses per entry.

I think, especially given your new icmp_redirect test script, that
another sensible next step would be turning the setup part in pmtu.sh
into some kind of library (also including the VRF setup) that could be
sourced from both scripts. Right now, that looks like a lot of
duplication.

-- 
Stefano

Re: [PATCH bpf-next v2 2/3] libbpf: add bpf_object__load_xattr() API function to pass log_level

2019-05-23 Thread Yonghong Song



On 5/23/19 3:54 AM, Quentin Monnet wrote:
> libbpf was recently made aware of the log_level attribute for programs,
> used to specify the level of information expected to be dumped by the
> verifier.
> 
> Create an API function to pass additional attributes when loading a
> bpf_object, so we can set this log_level value in programs when loading
> them, and so that so that applications relying on libbpf but not calling
"so that so that" => "so that"
> bpf_prog_load_xattr() can also use that feature.

Do not fully understand the above statement. From the code below,
I did not see how the non-zero log_level can be set for bpf_program
without bpf_prog_load_xattr(). Maybe I miss something?

> 
> v2:
> - We are in a new cycle, bump libbpf extraversion number.
> 
> Signed-off-by: Quentin Monnet 
> Reviewed-by: Jakub Kicinski 
> ---
>   tools/lib/bpf/Makefile   |  2 +-
>   tools/lib/bpf/libbpf.c   | 20 +---
>   tools/lib/bpf/libbpf.h   |  6 ++
>   tools/lib/bpf/libbpf.map |  5 +
>   4 files changed, 29 insertions(+), 4 deletions(-)
> 
> diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile
> index a2aceadf68db..9312066a1ae3 100644
> --- a/tools/lib/bpf/Makefile
> +++ b/tools/lib/bpf/Makefile
> @@ -3,7 +3,7 @@
>   
>   BPF_VERSION = 0
>   BPF_PATCHLEVEL = 0
> -BPF_EXTRAVERSION = 3
> +BPF_EXTRAVERSION = 4
>   
>   MAKEFLAGS += --no-print-directory
>   
> diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
> index 197b574406b3..1c6fb7a3201e 100644
> --- a/tools/lib/bpf/libbpf.c
> +++ b/tools/lib/bpf/libbpf.c
> @@ -,7 +,7 @@ static bool bpf_program__is_function_storage(struct 
> bpf_program *prog,
>   }
>   
>   static int
> -bpf_object__load_progs(struct bpf_object *obj)
> +bpf_object__load_progs(struct bpf_object *obj, int log_level)
>   {
>   size_t i;
>   int err;
> @@ -2230,6 +2230,7 @@ bpf_object__load_progs(struct bpf_object *obj)
>   for (i = 0; i < obj->nr_programs; i++) {
>   if (bpf_program__is_function_storage(&obj->programs[i], obj))
>   continue;
> + obj->programs[i].log_level = log_level;
>   err = bpf_program__load(&obj->programs[i],
>   obj->license,
>   obj->kern_version);
> @@ -2381,10 +2382,14 @@ int bpf_object__unload(struct bpf_object *obj)
>   return 0;
>   }
>   
> -int bpf_object__load(struct bpf_object *obj)
> +int bpf_object__load_xattr(struct bpf_object_load_attr *attr)
>   {
> + struct bpf_object *obj;
>   int err;
>   
> + if (!attr)
> + return -EINVAL;
> + obj = attr->obj;
>   if (!obj)
>   return -EINVAL;
>   
> @@ -2397,7 +2402,7 @@ int bpf_object__load(struct bpf_object *obj)
>   
>   CHECK_ERR(bpf_object__create_maps(obj), err, out);
>   CHECK_ERR(bpf_object__relocate(obj), err, out);
> - CHECK_ERR(bpf_object__load_progs(obj), err, out);
> + CHECK_ERR(bpf_object__load_progs(obj, attr->log_level), err, out);
>   
>   return 0;
>   out:
> @@ -2406,6 +2411,15 @@ int bpf_object__load(struct bpf_object *obj)
>   return err;
>   }
>   
> +int bpf_object__load(struct bpf_object *obj)
> +{
> + struct bpf_object_load_attr attr = {
> + .obj = obj,
> + };
> +
> + return bpf_object__load_xattr(&attr);
> +}
> +
>   static int check_path(const char *path)
>   {
>   char *cp, errmsg[STRERR_BUFSIZE];
> diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
> index c5ff00515ce7..e1c748db44f6 100644
> --- a/tools/lib/bpf/libbpf.h
> +++ b/tools/lib/bpf/libbpf.h
> @@ -89,8 +89,14 @@ LIBBPF_API int bpf_object__unpin_programs(struct 
> bpf_object *obj,
>   LIBBPF_API int bpf_object__pin(struct bpf_object *object, const char *path);
>   LIBBPF_API void bpf_object__close(struct bpf_object *object);
>   
> +struct bpf_object_load_attr {
> + struct bpf_object *obj;
> + int log_level;
> +};
> +
>   /* Load/unload object into/from kernel */
>   LIBBPF_API int bpf_object__load(struct bpf_object *obj);
> +LIBBPF_API int bpf_object__load_xattr(struct bpf_object_load_attr *attr);
>   LIBBPF_API int bpf_object__unload(struct bpf_object *obj);
>   LIBBPF_API const char *bpf_object__name(struct bpf_object *obj);
>   LIBBPF_API unsigned int bpf_object__kversion(struct bpf_object *obj);
> diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
> index 673001787cba..6ce61fa0baf3 100644
> --- a/tools/lib/bpf/libbpf.map
> +++ b/tools/lib/bpf/libbpf.map
> @@ -164,3 +164,8 @@ LIBBPF_0.0.3 {
>   bpf_map_freeze;
>   btf__finalize_data;
>   } LIBBPF_0.0.2;
> +
> +LIBBPF_0.0.4 {
> + global:
> + bpf_object__load_xattr;
> +} LIBBPF_0.0.3;
>

Re: [PATCH bpf-next v2 1/3] tools: bpftool: add -d option to get debug output from libbpf

2019-05-23 Thread Andrii Nakryiko

On Thu, May 23, 2019 at 3:54 AM Quentin Monnet
 wrote:
>
> libbpf has three levels of priority for output messages: warn, info,
> debug. By default, debug output is not printed to the console.
>
> Add a new "--debug" (short name: "-d") option to bpftool to print libbpf
> logs for all three levels.
>
> Internally, we simply use the function provided by libbpf to replace the
> default printing function by one that prints logs regardless of their
> level.
>
> v2:
> - Remove the possibility to select the log-levels to use (v1 offered a
>   combination of "warn", "info" and "debug").
> - Rename option and offer a short name: -d|--debug.

Such and option in CLI tools is usually called -v|--verbose, I'm
wondering if it might be a better name choice?

Btw, some tools also use -v, -vv and -vvv to define different levels
of verbosity, which is something we can consider in the future, as
it's backwards compatible.

> - Add option description to all bpftool manual pages (instead of
>   bpftool-prog.rst only), as all commands use libbpf.
>
> Signed-off-by: Quentin Monnet 
> Reviewed-by: Jakub Kicinski 
> ---
>  tools/bpf/bpftool/Documentation/bpftool-btf.rst|  4 
>  tools/bpf/bpftool/Documentation/bpftool-cgroup.rst |  4 
>  .../bpf/bpftool/Documentation/bpftool-feature.rst  |  4 
>  tools/bpf/bpftool/Documentation/bpftool-map.rst|  4 
>  tools/bpf/bpftool/Documentation/bpftool-net.rst|  4 
>  tools/bpf/bpftool/Documentation/bpftool-perf.rst   |  4 
>  tools/bpf/bpftool/Documentation/bpftool-prog.rst   |  4 
>  tools/bpf/bpftool/Documentation/bpftool.rst|  3 +++
>  tools/bpf/bpftool/bash-completion/bpftool  |  2 +-
>  tools/bpf/bpftool/main.c   | 14 +-
>  10 files changed, 45 insertions(+), 2 deletions(-)
>
> diff --git a/tools/bpf/bpftool/Documentation/bpftool-btf.rst 
> b/tools/bpf/bpftool/Documentation/bpftool-btf.rst
> index 2dbc1413fabd..00668df1bf7a 100644
> --- a/tools/bpf/bpftool/Documentation/bpftool-btf.rst
> +++ b/tools/bpf/bpftool/Documentation/bpftool-btf.rst
> @@ -67,6 +67,10 @@ OPTIONS
> -p, --pretty
>   Generate human-readable JSON output. Implies **-j**.
>
> +   -d, --debug
> + Print all logs available from libbpf, including debug-level
> + information.
> +
>  EXAMPLES
>  
>  **# bpftool btf dump id 1226**
> diff --git a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst 
> b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
> index ac26876389c2..36807735e2a5 100644
> --- a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
> +++ b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
> @@ -113,6 +113,10 @@ OPTIONS
> -f, --bpffs
>   Show file names of pinned programs.
>
> +   -d, --debug
> + Print all logs available from libbpf, including debug-level
> + information.
> +
>  EXAMPLES
>  
>  |
> diff --git a/tools/bpf/bpftool/Documentation/bpftool-feature.rst 
> b/tools/bpf/bpftool/Documentation/bpftool-feature.rst
> index 14180e887082..4d08f35034a2 100644
> --- a/tools/bpf/bpftool/Documentation/bpftool-feature.rst
> +++ b/tools/bpf/bpftool/Documentation/bpftool-feature.rst
> @@ -73,6 +73,10 @@ OPTIONS
> -p, --pretty
>   Generate human-readable JSON output. Implies **-j**.
>
> +   -d, --debug
> + Print all logs available from libbpf, including debug-level
> + information.
> +
>  SEE ALSO
>  
> **bpf**\ (2),
> diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst 
> b/tools/bpf/bpftool/Documentation/bpftool-map.rst
> index 13ef27b39f20..490b4501cb6e 100644
> --- a/tools/bpf/bpftool/Documentation/bpftool-map.rst
> +++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst
> @@ -152,6 +152,10 @@ OPTIONS
>   Do not automatically attempt to mount any virtual file 
> system
>   (such as tracefs or BPF virtual file system) when necessary.
>
> +   -d, --debug
> + Print all logs available from libbpf, including debug-level
> + information.
> +
>  EXAMPLES
>  
>  **# bpftool map show**
> diff --git a/tools/bpf/bpftool/Documentation/bpftool-net.rst 
> b/tools/bpf/bpftool/Documentation/bpftool-net.rst
> index 934580850f42..d8e5237a2085 100644
> --- a/tools/bpf/bpftool/Documentation/bpftool-net.rst
> +++ b/tools/bpf/bpftool/Documentation/bpftool-net.rst
> @@ -65,6 +65,10 @@ OPTIONS
> -p, --pretty
>   Generate human-readable JSON output. Implies **-j**.
>
> +   -d, --debug
> + Print all logs available from libbpf, including debug-level
> + information.
> +
>  EXAMPLES
>  
>
> diff --git a/tools/bpf/bpftool/Documentation/bpftool-perf.rst 
> b/tools/bpf/bpftool/Documentation/bpftool-perf.rst
> index 0c7576523a21..e252bd0bc434 100644
> --- a/tools/bpf/bpftool/Documentation/bpftool-

Re: [PATCH v3 net-next 0/3] flow_offload: Re-add per-action statistics

2019-05-23 Thread Edward Cree

On 22/05/2019 23:20, Jakub Kicinski wrote:
> On Wed, 22 May 2019 22:37:16 +0100, Edward Cree wrote:
>> * removed RFC tags
> Why?  There is still no upstream user for this
Well, patch #2 updates drivers to the changed API, which is kinda an
 "upstream user" if you squint... admittedly patch #1 is a bit dubious
 in that regard; I was kinda hoping someone on one of the existing
 drivers would either drop in a patch or at least point me at the info
 needed to write one for their HW, but no luck :-(
My defence re patch #1 is that I'm not really adding a 'new feature'
 to the kernel, just restoring something that used to be there.  It's
 a grey area and I'm waiting for the maintainer(s) to say yea or nay
 (Jiri noped v1 but that had a much bigger unused interface (the
 TC_CLSFLOWER_STATS_BYINDEX callback) so I'm still hoping).
> (my previous objections of this being only partially correct aside).
I didn't realise you still had outstanding objections, sorry.
Regarding RTM_GETACTION dumping, it should be possible to add an offload
 callback based on this, which would just pass the action cookie to the
 driver.  Then the driver just has to look the cookie up in its stats
 tables to find the counter.  That would deal with the 'stale stats'
 issue.
But right now, that really _would_ be an API without a user; still, I
 might be able to do that as an RFC patch to prove it's possible, would
 that help this series to go in as-is?

-Ed

Re: [PATCH bpf-next 10/12] bpftool: add C output format option to btf dump subcommand

2019-05-23 Thread Jakub Kicinski

On Wed, 22 May 2019 21:43:43 -0700, Andrii Nakryiko wrote:
> On Wed, May 22, 2019 at 6:23 PM Jakub Kicinski wrote:
> > On Wed, 22 May 2019 17:58:23 -0700, Andrii Nakryiko wrote:  
> > > On Wed, May 22, 2019 at 5:25 PM Jakub Kicinski wrote:  
> > > > On Wed, 22 May 2019 12:50:51 -0700, Andrii Nakryiko wrote:  
> > > > > + * Copyright (C) 2019 Facebook
> > > > > + */
> > > > >
> > > > >  #include 
> > > > >  #include 
> > > > > @@ -340,11 +347,48 @@ static int dump_btf_raw(const struct btf *btf,
> > > > >   return 0;
> > > > >  }
> > > > >
> > > > > +static void btf_dump_printf(void *ctx, const char *fmt, va_list args)
> > > > > +{
> > > > > + vfprintf(stdout, fmt, args);
> > > > > +}
> > > > > +
> > > > > +static int dump_btf_c(const struct btf *btf,
> > > > > +   __u32 *root_type_ids, int root_type_cnt)  
> > > >
> > > > Please break the line after static int.  
> > >
> > > I don't mind, but it seems that prevalent formatting for such cases
> > > (at least in bpftool code base) is aligning arguments and not break
> > > static  into separate line:
> > >
> > > // multi-line function definitions with static on the same line
> > > $ rg '^static \w+.*\([^\)]*$' | wc -l
> > > 45
> > > // multi-line function definitions with static on separate line
> > > $ rg '^static \w+[^\(\{;]*$' | wc -l
> > > 12
> > >
> > > So I don't mind changing, but which one is canonical way of formatting?  
> >
> > Not really, just my preference :)  
> 
> I'll stick to majority :) I feel like it's also a preferred style in
> libbpf, so I'd rather converge to that.

Majority is often wrong or at least lazy.  But yeah, this is a waste of
time.  Do whatever.  You also use inline keyword in C files in your
libbpf patches..  I think kernel style rules should apply.

> > In my experience having the return type on a separate line if its
> > longer than a few chars is the simplest rule for consistent and good
> > looking code.
> >  
> > > > > + d = btf_dump__new(btf, NULL, NULL, btf_dump_printf);
> > > > > + if (IS_ERR(d))
> > > > > + return PTR_ERR(d);
> > > > > +
> > > > > + if (root_type_cnt) {
> > > > > + for (i = 0; i < root_type_cnt; i++) {
> > > > > + err = btf_dump__dump_type(d, root_type_ids[i]);
> > > > > + if (err)
> > > > > + goto done;
> > > > > + }
> > > > > + } else {
> > > > > + int cnt = btf__get_nr_types(btf);
> > > > > +
> > > > > + for (id = 1; id <= cnt; id++) {
> > > > > + err = btf_dump__dump_type(d, id);
> > > > > + if (err)
> > > > > + goto done;
> > > > > + }
> > > > > + }
> > > > > +
> > > > > +done:
> > > > > + btf_dump__free(d);
> > > > > + return err;  
> > > >
> > > > What do we do for JSON output?  
> > >
> > > Still dump C syntax. What do you propose? Error out if json enabled?  
> >
> > I wonder.  Letting it just print C is going to confuse anything that
> > just feeds the output into a JSON parser.  I'd err on the side of
> > returning an error, we can always relax that later if we find a use
> > case of returning C syntax via JSON.  
> 
> Ok, I'll emit error (seems like pr_err automatically handles JSON
> output, which is very nice).

Thanks

> > > > >   if (!btf) {
> > > > >   err = btf__get_from_id(btf_id, &btf);
> > > > >   if (err) {
> > > > > @@ -444,7 +498,10 @@ static int do_dump(int argc, char **argv)
> > > > >   }
> > > > >   }
> > > > >
> > > > > - dump_btf_raw(btf, root_type_ids, root_type_cnt);
> > > > > + if (dump_c)
> > > > > + dump_btf_c(btf, root_type_ids, root_type_cnt);
> > > > > + else
> > > > > + dump_btf_raw(btf, root_type_ids, root_type_cnt);
> > > > >
> > > > >  done:
> > > > >   close(fd);
> > > > > @@ -460,7 +517,7 @@ static int do_help(int argc, char **argv)
> > > > >   }
> > > > >
> > > > >   fprintf(stderr,
> > > > > - "Usage: %s btf dump BTF_SRC\n"
> > > > > + "Usage: %s btf dump BTF_SRC [c]\n"  
> > > >
> > > > bpftool generally uses  formats.  So perhaps we could do
> > > > something like "[format raw|c]" here for consistency, defaulting to 
> > > > raw?  
> > >
> > > That's not true for options, though. I see that at cgroup, prog, and
> > > some map subcommands (haven't checked all other) just accept a list of
> > > options without extra identifying key.  
> >
> > Yeah, we weren't 100% enforcing this rule and it's a bit messy now :/  
> 
> Unless you feel very strongly about this, it seems ok to me to allow
> "boolean options" (similarly to boolean --flag args) as a stand-alone
> set of tags. bpftool invocations are already very verbose, no need to
> add to that. Plus it also makes bash-completion simpler, it's always
> good not to complicate bash script unnecessarily :)

It's more of a question if we're goin

Re: [PATCH bpf-next v2 2/3] libbpf: add bpf_object__load_xattr() API function to pass log_level

2019-05-23 Thread Yonghong Song



On 5/23/19 9:19 AM, Yonghong Song wrote:
> 
> 
> On 5/23/19 3:54 AM, Quentin Monnet wrote:
>> libbpf was recently made aware of the log_level attribute for programs,
>> used to specify the level of information expected to be dumped by the
>> verifier.
>>
>> Create an API function to pass additional attributes when loading a
>> bpf_object, so we can set this log_level value in programs when loading
>> them, and so that so that applications relying on libbpf but not calling
> "so that so that" => "so that"
>> bpf_prog_load_xattr() can also use that feature.
> 
> Do not fully understand the above statement. From the code below,
> I did not see how the non-zero log_level can be set for bpf_program
> without bpf_prog_load_xattr(). Maybe I miss something?

Looks like next patch uses it when -d is specified.
Probably commit message can be made more clear.

> 
>>
>> v2:
>> - We are in a new cycle, bump libbpf extraversion number.
>>
>> Signed-off-by: Quentin Monnet 
>> Reviewed-by: Jakub Kicinski 
>> ---
>>   tools/lib/bpf/Makefile   |  2 +-
>>   tools/lib/bpf/libbpf.c   | 20 +---
>>   tools/lib/bpf/libbpf.h   |  6 ++
>>   tools/lib/bpf/libbpf.map |  5 +
>>   4 files changed, 29 insertions(+), 4 deletions(-)
>>
>> diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile
>> index a2aceadf68db..9312066a1ae3 100644
>> --- a/tools/lib/bpf/Makefile
>> +++ b/tools/lib/bpf/Makefile
>> @@ -3,7 +3,7 @@
>>   BPF_VERSION = 0
>>   BPF_PATCHLEVEL = 0
>> -BPF_EXTRAVERSION = 3
>> +BPF_EXTRAVERSION = 4
>>   MAKEFLAGS += --no-print-directory
>> diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
>> index 197b574406b3..1c6fb7a3201e 100644
>> --- a/tools/lib/bpf/libbpf.c
>> +++ b/tools/lib/bpf/libbpf.c
>> @@ -,7 +,7 @@ static bool 
>> bpf_program__is_function_storage(struct bpf_program *prog,
>>   }
>>   static int
>> -bpf_object__load_progs(struct bpf_object *obj)
>> +bpf_object__load_progs(struct bpf_object *obj, int log_level)
>>   {
>>   size_t i;
>>   int err;
>> @@ -2230,6 +2230,7 @@ bpf_object__load_progs(struct bpf_object *obj)
>>   for (i = 0; i < obj->nr_programs; i++) {
>>   if (bpf_program__is_function_storage(&obj->programs[i], obj))
>>   continue;
>> +    obj->programs[i].log_level = log_level;
>>   err = bpf_program__load(&obj->programs[i],
>>   obj->license,
>>   obj->kern_version);
>> @@ -2381,10 +2382,14 @@ int bpf_object__unload(struct bpf_object *obj)
>>   return 0;
>>   }
>> -int bpf_object__load(struct bpf_object *obj)
>> +int bpf_object__load_xattr(struct bpf_object_load_attr *attr)
>>   {
>> +    struct bpf_object *obj;
>>   int err;
>> +    if (!attr)
>> +    return -EINVAL;
>> +    obj = attr->obj;
>>   if (!obj)
>>   return -EINVAL;
>> @@ -2397,7 +2402,7 @@ int bpf_object__load(struct bpf_object *obj)
>>   CHECK_ERR(bpf_object__create_maps(obj), err, out);
>>   CHECK_ERR(bpf_object__relocate(obj), err, out);
>> -    CHECK_ERR(bpf_object__load_progs(obj), err, out);
>> +    CHECK_ERR(bpf_object__load_progs(obj, attr->log_level), err, out);
>>   return 0;
>>   out:
>> @@ -2406,6 +2411,15 @@ int bpf_object__load(struct bpf_object *obj)
>>   return err;
>>   }
>> +int bpf_object__load(struct bpf_object *obj)
>> +{
>> +    struct bpf_object_load_attr attr = {
>> +    .obj = obj,
>> +    };
>> +
>> +    return bpf_object__load_xattr(&attr);
>> +}
>> +
>>   static int check_path(const char *path)
>>   {
>>   char *cp, errmsg[STRERR_BUFSIZE];
>> diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
>> index c5ff00515ce7..e1c748db44f6 100644
>> --- a/tools/lib/bpf/libbpf.h
>> +++ b/tools/lib/bpf/libbpf.h
>> @@ -89,8 +89,14 @@ LIBBPF_API int bpf_object__unpin_programs(struct 
>> bpf_object *obj,
>>   LIBBPF_API int bpf_object__pin(struct bpf_object *object, const char 
>> *path);
>>   LIBBPF_API void bpf_object__close(struct bpf_object *object);
>> +struct bpf_object_load_attr {
>> +    struct bpf_object *obj;
>> +    int log_level;
>> +};
>> +
>>   /* Load/unload object into/from kernel */
>>   LIBBPF_API int bpf_object__load(struct bpf_object *obj);
>> +LIBBPF_API int bpf_object__load_xattr(struct bpf_object_load_attr 
>> *attr);
>>   LIBBPF_API int bpf_object__unload(struct bpf_object *obj);
>>   LIBBPF_API const char *bpf_object__name(struct bpf_object *obj);
>>   LIBBPF_API unsigned int bpf_object__kversion(struct bpf_object *obj);
>> diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
>> index 673001787cba..6ce61fa0baf3 100644
>> --- a/tools/lib/bpf/libbpf.map
>> +++ b/tools/lib/bpf/libbpf.map
>> @@ -164,3 +164,8 @@ LIBBPF_0.0.3 {
>>   bpf_map_freeze;
>>   btf__finalize_data;
>>   } LIBBPF_0.0.2;
>> +
>> +LIBBPF_0.0.4 {
>> +    global:
>> +    bpf_object__load_xattr;
>> +} LIBBPF_0.0.3;
>>

Re: [PATCH v3 net-next 0/3] flow_offload: Re-add per-action statistics

2019-05-23 Thread Jakub Kicinski

On Thu, 23 May 2019 17:21:49 +0100, Edward Cree wrote:
> On 22/05/2019 23:20, Jakub Kicinski wrote:
> > On Wed, 22 May 2019 22:37:16 +0100, Edward Cree wrote:  
> >> * removed RFC tags  
> > Why?  There is still no upstream user for this  
> Well, patch #2 updates drivers to the changed API, which is kinda an
>  "upstream user" if you squint... admittedly patch #1 is a bit dubious
>  in that regard;

Both 1 and 2 are dubious if you ask me.  Complexity added for no
upstream gain.  3 is a good patch, perhaps worth posting it separately
rather than keeping it hostage to the rest of the series? :)

Side note - it's not clear why open code the loop in every driver rather
than having flow_stats_update() handle the looping?

Re: [PATCH net-next] cxgb4: use firmware API for validating filter spec

2019-05-23 Thread David Miller

From: Raju Rangoju 
Date: Thu, 23 May 2019 19:21:21 +0530

> Adds support for validating hardware filter spec configured in firmware
> before offloading exact match flows.
> 
> Use the new fw api FW_PARAM_DEV_FILTER_MODE_MASK to read the filter mode
> and mask from firmware. If the api isn't supported, then fall-back to
> older way of reading just the mode from indirect register.
> 
> Signed-off-by: Raju Rangoju 

Applied.

Re: [PATCH net-next] Revert "dpaa2-eth: configure the cache stashing amount on a queue"

2019-05-23 Thread David Miller

From: Ioana Radulescu 
Date: Thu, 23 May 2019 17:38:22 +0300

> This reverts commit f8b995853444aba9c16c1ccdccdd397527fde96d.
> 
> The reverted change instructed the QMan hardware block to fetch
> RX frame annotation and beginning of frame data to cache before
> the core would read them.
> 
> It turns out that in rare cases, it's possible that a QMan
> stashing transaction is delayed long enough such that, by the time
> it gets executed, the frame in question had already been dequeued
> by the core and software processing began on it. If the core
> manages to unmap the frame buffer _before_ the stashing transaction
> is executed, an SMMU exception will be raised.
> 
> Unfortunately there is no easy way to work around this while keeping
> the performance advantages brought by QMan stashing, so disable
> it altogether.
> 
> Signed-off-by: Ioana Radulescu 

Applied.

Re: [PATCH bpf-next v2 3/3] tools: bpftool: make -d option print debug output from verifier

2019-05-23 Thread Yonghong Song



On 5/23/19 3:54 AM, Quentin Monnet wrote:
> The "-d" option is used to require all logs available for bpftool. So
> far it meant telling libbpf to print even debug-level information. But
> there is another source of info that can be made more verbose: when we
> attemt to load programs with bpftool, we can pass a log_level parameter
> to the verifier in order to control the amount of information that is
> printed to the console.
> 
> Reuse the "-d" option to print all information the verifier can tell. At
> this time, this means logs related to BPF_LOG_LEVEL1, BPF_LOG_LEVEL2 and
> BPF_LOG_STATS. As mentioned in the discussion on the first version of
> this set, these macros are internal to the kernel
> (include/linux/bpf_verifier.h) and are not meant to be part of the
> stable user API, therefore we simply use the related constants to print
> whatever we can at this time, without trying to tell users what is
> log_level1 or what is statistics.
> 
> Verifier logs are only used when loading programs for now (in the
> future: for loading BTF objects with bpftool?), so bpftool.rst and
> bpftool-prog.rst are the only man pages to get the update.

The current BTF error log print out at warning level. So by default,
it should print out error logs unless it is suppressed explicitly.

int btf__load(struct btf *btf)
{
 __u32 log_buf_size = BPF_LOG_BUF_SIZE;
 char *log_buf = NULL;
 int err = 0;

 if (btf->fd >= 0)
 return -EEXIST;

 log_buf = malloc(log_buf_size);
 if (!log_buf)
 return -ENOMEM;

 *log_buf = 0;

 btf->fd = bpf_load_btf(btf->data, btf->data_size,
log_buf, log_buf_size, false);
 if (btf->fd < 0) {
 err = -errno;
 pr_warning("Error loading BTF: %s(%d)\n", 
strerror(errno), errno);
 if (*log_buf)
 pr_warning("%s\n", log_buf);
 goto done;
 }

done:
 free(log_buf);
 return err;
}

> 
> v2:
> - Remove the possibility to select the log levels to use (v1 offered a
>combination of "log_level1", "log_level2" and "stats").
> - The macros from kernel header bpf_verifier.h are not used (and
>therefore not moved to UAPI header).
> - In v1 this was a distinct option, but is now merged in the only "-d"
>switch to activate libbpf and verifier debug-level logs all at the
>same time.
> 
> Signed-off-by: Quentin Monnet 
> Reviewed-by: Jakub Kicinski 
> ---
>   .../bpftool/Documentation/bpftool-prog.rst|  5 ++--
>   tools/bpf/bpftool/Documentation/bpftool.rst   |  5 ++--
>   tools/bpf/bpftool/main.c  |  2 ++
>   tools/bpf/bpftool/main.h  |  1 +
>   tools/bpf/bpftool/prog.c  | 27 ---
>   5 files changed, 26 insertions(+), 14 deletions(-)
> 
> diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst 
> b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
> index 9a92614569e6..228a5c863cc7 100644
> --- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst
> +++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
> @@ -175,8 +175,9 @@ OPTIONS
> (such as tracefs or BPF virtual file system) when necessary.
>   
>   -d, --debug
> -   Print all logs available from libbpf, including debug-level
> -   information.
> +   Print all logs available, even debug-level information. This
> +   includes logs from libbpf as well as from the verifier, when
> +   attempting to load programs.
>   
>   EXAMPLES
>   
> diff --git a/tools/bpf/bpftool/Documentation/bpftool.rst 
> b/tools/bpf/bpftool/Documentation/bpftool.rst
> index 43dba0717953..6a9c52ef84a9 100644
> --- a/tools/bpf/bpftool/Documentation/bpftool.rst
> +++ b/tools/bpf/bpftool/Documentation/bpftool.rst
> @@ -67,8 +67,9 @@ OPTIONS
> (such as tracefs or BPF virtual file system) when necessary.
>   
>   -d, --debug
> -   Print all logs available from libbpf, including debug-level
> -   information.
> +   Print all logs available, even debug-level information. This
> +   includes logs from libbpf as well as from the verifier, when
> +   attempting to load programs.
>   
>   SEE ALSO
>   
> diff --git a/tools/bpf/bpftool/main.c b/tools/bpf/bpftool/main.c
> index d74293938a05..4879f6395c7e 100644
> --- a/tools/bpf/bpftool/main.c
> +++ b/tools/bpf/bpftool/main.c
> @@ -26,6 +26,7 @@ bool pretty_output;
>   bool json_output;
>   bool show_pinned;
>   bool block_mount;
> +bool verifier_logs;
>   int bpf_flags;
>   struct pinned_obj_table prog_table;
>   struct pinned_obj_table map_table;
> @@ -373,6 +374,7 @@ int main(int argc, char **argv)
>   break;
>   case 'd':
>   libbpf_set_print(print_all_levels);
> + ver

Re: [PATCH net] cxgb4: offload VLAN flows regardless of VLAN ethtype

2019-05-23 Thread David Miller

From: Raju Rangoju 
Date: Thu, 23 May 2019 20:41:44 +0530

> VLAN flows never get offloaded unless ivlan_vld is set in filter spec.
> It's not compulsory for vlan_ethtype to be set.
> 
> So, always enable ivlan_vld bit for offloading VLAN flows regardless of
> vlan_ethtype is set or not.
> 
> Fixes: ad9af3e09c (cxgb4: add tc flower match support for vlan)
> Signed-off-by: Raju Rangoju 

Applied and queued up for -stable.

Re: [PATCH v3 net-next 0/3] flow_offload: Re-add per-action statistics

2019-05-23 Thread Edward Cree

On 23/05/2019 17:11, Jakub Kicinski wrote:
> On Thu, 23 May 2019 09:19:49 -0400, Jamal Hadi Salim wrote:
>> That would still work here, no? There will be some latency
>> based on the frequency of hardware->kernel stats updates.
> I don't think so, I think the stats are only updated on classifier
> dumps in Ed's code.
Yep currently that's the case, but not as an inherent restriction (see
 my other mail).

> But we can't be 100% sure without seeing driver code.
Would it help if I posted my driver code to the list?  It's gonna be
 upstream eventually anyway, it's just that the driver as a whole
 isn't really in a shape to be merged just yet (mainly 'cos the
 hardware folks are planning some breaking changes).  But I can post
 my TC handling code, or even the whole driver, if demonstrating how
 these interfaces can be used will help matters.

-Ed

[PATCH net-next] selftests/net: SO_TXTIME with ETF and FQ

2019-05-23 Thread Willem de Bruijn

The SO_TXTIME API enables packet tranmission with delayed delivery.
This is currently supported by the ETF and FQ packet schedulers.

Evaluate the interface with both schedulers. Install the scheduler
and send a variety of packets streams: without delay, with one
delayed packet, with multiple ordered delays and with reordering.
Verify that packets are released by the scheduler in expected order.

The ETF qdisc requires a timestamp in the future on every packet. It
needs a delay on the qdisc else the packet is dropped on dequeue for
having a delivery time in the past. The test value is experimentally
derived. ETF requires clock_id CLOCK_TAI. It checks this base and
drops for non-conformance.

The FQ qdisc expects clock_id CLOCK_MONOTONIC, the base used by TCP
as of commit fb420d5d91c1 ("tcp/fq: move back to CLOCK_MONOTONIC").
Within a flow there is an expecation of ordered delivery, as shown by
delivery times of test 4. The FQ qdisc does not require all packets to
have timestamps and does not drop for non-conformance.

The large (msec) delays are chosen to avoid flakiness.

Output:

SO_TXTIME ipv6 clock monolithic
payload:a delay:33 expected:0 (us)

SO_TXTIME ipv4 clock monolithic
payload:a delay:44 expected:0 (us)

SO_TXTIME ipv6 clock monolithic
payload:a delay:10049 expected:1 (us)

SO_TXTIME ipv4 clock monolithic
payload:a delay:10105 expected:1 (us)

[.. etc ..]

OK. All tests passed

Signed-off-by: Willem de Bruijn 
---
 tools/testing/selftests/net/.gitignore   |   1 +
 tools/testing/selftests/net/Makefile |   3 +-
 tools/testing/selftests/net/config   |   2 +
 tools/testing/selftests/net/so_txtime.c  | 297 +++
 tools/testing/selftests/net/so_txtime.sh |  31 +++
 5 files changed, 333 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/net/so_txtime.c
 create mode 100755 tools/testing/selftests/net/so_txtime.sh

diff --git a/tools/testing/selftests/net/.gitignore 
b/tools/testing/selftests/net/.gitignore
index 6f81130605d7d..27ef4d07ac915 100644
--- a/tools/testing/selftests/net/.gitignore
+++ b/tools/testing/selftests/net/.gitignore
@@ -17,3 +17,4 @@ tcp_inq
 tls
 txring_overwrite
 ip_defrag
+so_txtime
diff --git a/tools/testing/selftests/net/Makefile 
b/tools/testing/selftests/net/Makefile
index 1e6d14d2825cc..8af7869e0f1c8 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -9,12 +9,13 @@ TEST_PROGS := run_netsocktests run_afpackettests test_bpf.sh 
netdevice.sh \
 TEST_PROGS += fib_tests.sh fib-onlink-tests.sh pmtu.sh udpgso.sh ip_defrag.sh
 TEST_PROGS += udpgso_bench.sh fib_rule_tests.sh msg_zerocopy.sh psock_snd.sh
 TEST_PROGS += udpgro_bench.sh udpgro.sh test_vxlan_under_vrf.sh 
reuseport_addr_any.sh
-TEST_PROGS += test_vxlan_fdb_changelink.sh
+TEST_PROGS += test_vxlan_fdb_changelink.sh so_txtime.sh
 TEST_PROGS_EXTENDED := in_netns.sh
 TEST_GEN_FILES =  socket
 TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy reuseport_addr_any
 TEST_GEN_FILES += tcp_mmap tcp_inq psock_snd txring_overwrite
 TEST_GEN_FILES += udpgso udpgso_bench_tx udpgso_bench_rx ip_defrag
+TEST_GEN_FILES += so_txtime
 TEST_GEN_PROGS = reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa
 TEST_GEN_PROGS += reuseport_dualstack reuseaddr_conflict tls
 
diff --git a/tools/testing/selftests/net/config 
b/tools/testing/selftests/net/config
index 4740404486018..89f84b5118bfa 100644
--- a/tools/testing/selftests/net/config
+++ b/tools/testing/selftests/net/config
@@ -25,3 +25,5 @@ CONFIG_NF_TABLES_IPV6=y
 CONFIG_NF_TABLES_IPV4=y
 CONFIG_NFT_CHAIN_NAT_IPV6=m
 CONFIG_NFT_CHAIN_NAT_IPV4=m
+CONFIG_NET_SCH_FQ=m
+CONFIG_NET_SCH_ETF=m
diff --git a/tools/testing/selftests/net/so_txtime.c 
b/tools/testing/selftests/net/so_txtime.c
new file mode 100644
index 0..d5e9f6807be43
--- /dev/null
+++ b/tools/testing/selftests/net/so_txtime.c
@@ -0,0 +1,297 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test the SO_TXTIME API
+ *
+ * Takes two streams of { payload, delivery time }[], one input and one output.
+ * Sends the input stream and verifies arrival matches the output stream.
+ * The two streams can differ due to out-of-order delivery and drops.
+ */
+
+#define _GNU_SOURCE
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+static int cfg_clockid = CLOCK_TAI;
+static boolcfg_do_ipv4;
+static boolcfg_do_ipv6;
+static uint16_tcfg_port= 8000;
+static int cfg_variance_us = 2000;
+
+static uint64_t glob_tstart;
+
+/* encode one timed transmission (of a 1B payload) */
+struct timed_send {
+   chardata;
+   int64_t delay_us;
+};
+
+#define MAX_NUM_PKT8
+static struct timed_send cfg_in[MAX_NUM_PKT];
+static struct timed_send cfg_out[MAX_NUM_PKT];
+static int cfg_num_pkt;
+
+static uint64_t gett

1 2 3 >

1 - 100 of 213 matches

Mail list logo