date:20190705

[PATCH net-next v2 0/3] devlink: Introduce PCI PF, VF ports and attributes

2019-07-05 Thread Parav Pandit

This patchset carries forward the work initiated in [1] and the discussion
further concluded at [2].

To improve visibility of representor netdevice, its association with
PF or VF, physical port, two new devlink port flavours are added as
PCI PF and PCI VF ports.

A sample eswitch view can be seen below, which will be further extended to
mdev subdevices of a PCI function in future.

Patch-1,2 extends devlink port attributes and port flavour.
Patch-3 extends mlx5 driver to register devlink ports for PF, VF and
physical link.

+---+  +---+
  vf|   |  |   | pf
+-+-+  +-+-+
physical link <-+ |  |
| |  |
| |  |
  +-+-+ +-+-+  +-+-+
  | 1 | | 2 |  | 3 |
   +--+---+-+---+--+---+--+
   |  physical   vf pf|
   |  port   port   port  |
   |  |
   | eswitch  |
   |  |
   +--+

[1] https://www.spinics.net/lists/netdev/msg555797.html
[2] https://marc.info/?l=linux-netdev&m=155354609408485&w=2

---
Changelog:
v1->v2:
 - Updated new APIs and mlx5 driver to drop port_number for PF, VF
   attributes
 - Updated port_number comment for its usage
 - Limited putting port_number to physical ports

Parav Pandit (3):
  devlink: Introduce PCI PF port flavour and port attribute
  devlink: Introduce PCI VF port flavour and port attribute
  net/mlx5e: Register devlink ports for physical link, PCI PF, VFs

 .../net/ethernet/mellanox/mlx5/core/en_rep.c  | 108 -
 .../net/ethernet/mellanox/mlx5/core/en_rep.h  |   1 +
 include/net/devlink.h |  25 +++-
 include/uapi/linux/devlink.h  |  11 ++
 net/core/devlink.c| 114 +++---
 5 files changed, 212 insertions(+), 47 deletions(-)

-- 
2.19.2

[PATCH net-next v2 3/3] net/mlx5e: Register devlink ports for physical link, PCI PF, VFs

2019-07-05 Thread Parav Pandit

Register devlink port of physical port, PCI PF and PCI VF flavour
for each PF, VF when a given devlink instance is in switchdev mode.

Implement ndo_get_devlink_port callback API to make use of registered
devlink ports.
This eliminates ndo_get_phys_port_name() and ndo_get_port_parent_id()
callbacks. Hence, remove them.

An example output with 2 VFs, without a PF and single uplink port is
below.

$devlink port show
pci/:06:00.0/65535: type eth netdev ens2f0 flavour physical
pci/:05:00.0/1: type eth netdev eth1 flavour pcivf pfnum 0 vfnum 0
pci/:05:00.0/2: type eth netdev eth2 flavour pcivf pfnum 0 vfnum 1

Reviewed-by: Roi Dayan 
Signed-off-by: Parav Pandit 
---
Changelog:
v1->v2:
 - Updated to use simpler API without port_number
---
 .../net/ethernet/mellanox/mlx5/core/en_rep.c  | 108 +-
 .../net/ethernet/mellanox/mlx5/core/en_rep.h  |   1 +
 2 files changed, 78 insertions(+), 31 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index 6a013a8c1150..ce50d8c9df03 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -37,6 +37,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "eswitch.h"
 #include "en.h"
@@ -1119,32 +1120,6 @@ static int mlx5e_rep_close(struct net_device *dev)
return ret;
 }
 
-static int mlx5e_rep_get_phys_port_name(struct net_device *dev,
-   char *buf, size_t len)
-{
-   struct mlx5e_priv *priv = netdev_priv(dev);
-   struct mlx5e_rep_priv *rpriv = priv->ppriv;
-   struct mlx5_eswitch_rep *rep = rpriv->rep;
-   unsigned int fn;
-   int ret;
-
-   fn = PCI_FUNC(priv->mdev->pdev->devfn);
-   if (fn >= MLX5_MAX_PORTS)
-   return -EOPNOTSUPP;
-
-   if (rep->vport == MLX5_VPORT_UPLINK)
-   ret = snprintf(buf, len, "p%d", fn);
-   else if (rep->vport == MLX5_VPORT_PF)
-   ret = snprintf(buf, len, "pf%d", fn);
-   else
-   ret = snprintf(buf, len, "pf%dvf%d", fn, rep->vport - 1);
-
-   if (ret >= len)
-   return -EOPNOTSUPP;
-
-   return 0;
-}
-
 static int
 mlx5e_rep_setup_tc_cls_flower(struct mlx5e_priv *priv,
  struct tc_cls_flower_offload *cls_flower, int 
flags)
@@ -1298,17 +1273,24 @@ static int mlx5e_uplink_rep_set_vf_vlan(struct 
net_device *dev, int vf, u16 vlan
return 0;
 }
 
+static struct devlink_port *mlx5e_get_devlink_port(struct net_device *dev)
+{
+   struct mlx5e_priv *priv = netdev_priv(dev);
+   struct mlx5e_rep_priv *rpriv = priv->ppriv;
+
+   return &rpriv->dl_port;
+}
+
 static const struct net_device_ops mlx5e_netdev_ops_rep = {
.ndo_open= mlx5e_rep_open,
.ndo_stop= mlx5e_rep_close,
.ndo_start_xmit  = mlx5e_xmit,
-   .ndo_get_phys_port_name  = mlx5e_rep_get_phys_port_name,
.ndo_setup_tc= mlx5e_rep_setup_tc,
+   .ndo_get_devlink_port = mlx5e_get_devlink_port,
.ndo_get_stats64 = mlx5e_rep_get_stats,
.ndo_has_offload_stats   = mlx5e_rep_has_offload_stats,
.ndo_get_offload_stats   = mlx5e_rep_get_offload_stats,
.ndo_change_mtu  = mlx5e_rep_change_mtu,
-   .ndo_get_port_parent_id  = mlx5e_rep_get_port_parent_id,
 };
 
 static const struct net_device_ops mlx5e_netdev_ops_uplink_rep = {
@@ -1316,8 +1298,8 @@ static const struct net_device_ops 
mlx5e_netdev_ops_uplink_rep = {
.ndo_stop= mlx5e_close,
.ndo_start_xmit  = mlx5e_xmit,
.ndo_set_mac_address = mlx5e_uplink_rep_set_mac,
-   .ndo_get_phys_port_name  = mlx5e_rep_get_phys_port_name,
.ndo_setup_tc= mlx5e_rep_setup_tc,
+   .ndo_get_devlink_port = mlx5e_get_devlink_port,
.ndo_get_stats64 = mlx5e_get_stats,
.ndo_has_offload_stats   = mlx5e_rep_has_offload_stats,
.ndo_get_offload_stats   = mlx5e_rep_get_offload_stats,
@@ -1330,7 +1312,6 @@ static const struct net_device_ops 
mlx5e_netdev_ops_uplink_rep = {
.ndo_get_vf_config   = mlx5e_get_vf_config,
.ndo_get_vf_stats= mlx5e_get_vf_stats,
.ndo_set_vf_vlan = mlx5e_uplink_rep_set_vf_vlan,
-   .ndo_get_port_parent_id  = mlx5e_rep_get_port_parent_id,
.ndo_set_features= mlx5e_set_features,
 };
 
@@ -1731,6 +1712,55 @@ static const struct mlx5e_profile 
mlx5e_uplink_rep_profile = {
.max_tc = MLX5E_MAX_NUM_TC,
 };
 
+static bool
+is_devlink_port_supported(const struct mlx5_core_dev *dev,
+ const struct mlx5e_rep_priv *rpriv)
+{
+   return rpriv->rep->vport == MLX5_VPORT_UPLINK ||
+  rpriv->rep->vport == MLX5_VPORT_PF ||
+  mlx5_eswitch_is_vf_vport(dev->priv.eswitch, rpriv->rep->vport);
+}
+
+static int register_devlink_port(s

[PATCH net-next v2 2/3] devlink: Introduce PCI VF port flavour and port attribute

2019-07-05 Thread Parav Pandit

In an eswitch, PCI VF may have port which is normally represented using
a representor netdevice.
To have better visibility of eswitch port, its association with VF,
and its representor netdevice, introduce a PCI VF port flavour.

When devlink port flavour is PCI VF, fill up PCI VF attributes of
the port.

Extend port name creation using PCI PF and VF number scheme on best
effort basis, so that vendor drivers can skip defining their own scheme.

$ devlink port show
pci/:05:00.0/0: type eth netdev eth0 flavour pcipf pfnum 0
pci/:05:00.0/1: type eth netdev eth1 flavour pcivf pfnum 0 vfnum 0
pci/:05:00.0/2: type eth netdev eth2 flavour pcivf pfnum 0 vfnum 1

Signed-off-by: Parav Pandit 
---
Changelog:
v1->v2:
 - Updated PCI VF attribute set API to not have port_number
---
 include/net/devlink.h| 10 ++
 include/uapi/linux/devlink.h |  6 ++
 net/core/devlink.c   | 34 ++
 3 files changed, 50 insertions(+)

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 32badf7e0810..2b5cbc3f5a8b 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -42,6 +42,11 @@ struct devlink_port_pci_pf_attrs {
u16 pf; /* Associated PCI PF for this port. */
 };
 
+struct devlink_port_pci_vf_attrs {
+   u16 pf; /* Associated PCI PF for this port. */
+   u16 vf; /* Associated PCI VF for of the PCI PF for this port. */
+};
+
 struct devlink_port_attrs {
u8 set:1,
   split:1,
@@ -55,6 +60,7 @@ struct devlink_port_attrs {
struct netdev_phys_item_id switch_id;
union {
struct devlink_port_pci_pf_attrs pci_pf;
+   struct devlink_port_pci_vf_attrs pci_vf;
};
 };
 
@@ -603,6 +609,10 @@ void devlink_port_attrs_set(struct devlink_port 
*devlink_port,
 void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port,
   const unsigned char *switch_id,
   unsigned char switch_id_len, u16 pf);
+void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port,
+  const unsigned char *switch_id,
+  unsigned char switch_id_len,
+  u16 pf, u16 vf);
 int devlink_sb_register(struct devlink *devlink, unsigned int sb_index,
u32 size, u16 ingress_pools_count,
u16 egress_pools_count, u16 ingress_tc_count,
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index f7323884c3fe..ffc993256527 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -173,6 +173,10 @@ enum devlink_port_flavour {
  * the PCI PF. It is an internal
  * port that faces the PCI PF.
  */
+   DEVLINK_PORT_FLAVOUR_PCI_VF, /* Represents eswitch port
+ * for the PCI VF. It is an internal
+ * port that faces the PCI VF.
+ */
 };
 
 enum devlink_param_cmode {
@@ -342,6 +346,8 @@ enum devlink_attr {
DEVLINK_ATTR_FLASH_UPDATE_STATUS_TOTAL, /* u64 */
 
DEVLINK_ATTR_PORT_PCI_PF_NUMBER,/* u16 */
+   DEVLINK_ATTR_PORT_PCI_VF_NUMBER,/* u16 */
+
/* add new attributes above here, update the policy in devlink.c */
 
__DEVLINK_ATTR_MAX,
diff --git a/net/core/devlink.c b/net/core/devlink.c
index c9418f1ce025..033f13ecf89f 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -530,6 +530,12 @@ static int devlink_nl_port_attrs_put(struct sk_buff *msg,
if (nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER,
attrs->pci_pf.pf))
return -EMSGSIZE;
+   } else if (devlink_port->attrs.flavour == DEVLINK_PORT_FLAVOUR_PCI_VF) {
+   if (nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER,
+   attrs->pci_vf.pf) ||
+   nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_VF_NUMBER,
+   attrs->pci_vf.vf))
+   return -EMSGSIZE;
}
if (!attrs->split)
return 0;
@@ -5827,6 +5833,30 @@ void devlink_port_attrs_pci_pf_set(struct devlink_port 
*devlink_port,
 }
 EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_pf_set);
 
+/**
+ * devlink_port_attrs_pci_vf_set - Set PCI VF port attributes
+ *
+ * @devlink_port: devlink port
+ * @pf: associated PF for the devlink port instance
+ * @vf: associated VF of a PF for the devlink port instance
+ * @switch_id: if the port is part of switch, this is buffer with ID,
+ * otwerwise this is NULL
+ * @switch_id_len: length of the switch_id buffer
+ */
+void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port,
+  const unsigned ch

[PATCH net-next v2 1/3] devlink: Introduce PCI PF port flavour and port attribute

2019-07-05 Thread Parav Pandit

In an eswitch, PCI PF may have port which is normally represented
using a representor netdevice.
To have better visibility of eswitch port, its association with
PF and a representor netdevice, introduce a PCI PF port
flavour and port attribute.

When devlink port flavour is PCI PF, fill up PCI PF attributes of the
port.

Extend port name creation using PCI PF number on best effort basis,
so that vendor drivers can skip defining their own scheme.

$ devlink port show
pci/:05:00.0/0: type eth netdev eth0 flavour pcipf pfnum 0

Signed-off-by: Parav Pandit 

---
Changelog:
v1->v2:
 - Limited port_num attribute to physical ports
 - Updated PCI PF attribute set API to not have port_number
---
 include/net/devlink.h| 15 ++-
 include/uapi/linux/devlink.h |  5 +++
 net/core/devlink.c   | 80 +---
 3 files changed, 84 insertions(+), 16 deletions(-)

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 6625ea068d5e..32badf7e0810 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -38,14 +38,24 @@ struct devlink {
char priv[0] __aligned(NETDEV_ALIGN);
 };
 
+struct devlink_port_pci_pf_attrs {
+   u16 pf; /* Associated PCI PF for this port. */
+};
+
 struct devlink_port_attrs {
u8 set:1,
   split:1,
   switch_port:1;
enum devlink_port_flavour flavour;
-   u32 port_number; /* same value as "split group" */
+   u32 port_number; /* same value as "split group".
+ * Valid only when a port is physical and visible
+ * to the user for a given port flavour.
+ */
u32 split_subport_number;
struct netdev_phys_item_id switch_id;
+   union {
+   struct devlink_port_pci_pf_attrs pci_pf;
+   };
 };
 
 struct devlink_port {
@@ -590,6 +600,9 @@ void devlink_port_attrs_set(struct devlink_port 
*devlink_port,
u32 split_subport_number,
const unsigned char *switch_id,
unsigned char switch_id_len);
+void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port,
+  const unsigned char *switch_id,
+  unsigned char switch_id_len, u16 pf);
 int devlink_sb_register(struct devlink *devlink, unsigned int sb_index,
u32 size, u16 ingress_pools_count,
u16 egress_pools_count, u16 ingress_tc_count,
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index 5287b42c181f..f7323884c3fe 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -169,6 +169,10 @@ enum devlink_port_flavour {
DEVLINK_PORT_FLAVOUR_DSA, /* Distributed switch architecture
   * interconnect port.
   */
+   DEVLINK_PORT_FLAVOUR_PCI_PF, /* Represents eswitch port for
+ * the PCI PF. It is an internal
+ * port that faces the PCI PF.
+ */
 };
 
 enum devlink_param_cmode {
@@ -337,6 +341,7 @@ enum devlink_attr {
DEVLINK_ATTR_FLASH_UPDATE_STATUS_DONE,  /* u64 */
DEVLINK_ATTR_FLASH_UPDATE_STATUS_TOTAL, /* u64 */
 
+   DEVLINK_ATTR_PORT_PCI_PF_NUMBER,/* u16 */
/* add new attributes above here, update the policy in devlink.c */
 
__DEVLINK_ATTR_MAX,
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 89c533778135..c9418f1ce025 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -506,6 +506,14 @@ static void devlink_notify(struct devlink *devlink, enum 
devlink_command cmd)
msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
 }
 
+static bool
+is_devlink_phy_port_num_supported(const struct devlink_port *dl_port)
+{
+   return (dl_port->attrs.flavour == DEVLINK_PORT_FLAVOUR_PHYSICAL ||
+   dl_port->attrs.flavour == DEVLINK_PORT_FLAVOUR_CPU ||
+   dl_port->attrs.flavour == DEVLINK_PORT_FLAVOUR_DSA);
+}
+
 static int devlink_nl_port_attrs_put(struct sk_buff *msg,
 struct devlink_port *devlink_port)
 {
@@ -515,8 +523,14 @@ static int devlink_nl_port_attrs_put(struct sk_buff *msg,
return 0;
if (nla_put_u16(msg, DEVLINK_ATTR_PORT_FLAVOUR, attrs->flavour))
return -EMSGSIZE;
-   if (nla_put_u32(msg, DEVLINK_ATTR_PORT_NUMBER, attrs->port_number))
+   if (is_devlink_phy_port_num_supported(devlink_port) &&
+   nla_put_u32(msg, DEVLINK_ATTR_PORT_NUMBER, attrs->port_number))
return -EMSGSIZE;
+   if (devlink_port->attrs.flavour == DEVLINK_PORT_FLAVOUR_PCI_PF) {
+   if (nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER,
+   attrs->pci_pf.pf))
+   return -EMSGSIZE;
+   }

Re: [PATCH net-next 8/8] net: mscc: PTP Hardware Clock (PHC) support

2019-07-05 Thread Antoine Tenart

Hello Willem,

On Mon, Jul 01, 2019 at 11:12:06AM -0400, Willem de Bruijn wrote:
> On Mon, Jul 1, 2019 at 6:05 AM Antoine Tenart
>  wrote:
> 
> >  void ocelot_deinit(struct ocelot *ocelot)
> >  {
> > +   struct ocelot_port *port;
> > +   struct ocelot_skb *entry;
> > +   struct list_head *pos;
> > +   int i;
> > +
> > destroy_workqueue(ocelot->stats_queue);
> > mutex_destroy(&ocelot->stats_lock);
> > ocelot_ace_deinit();
> > +
> > +   for (i = 0; i < ocelot->num_phys_ports; i++) {
> > +   port = ocelot->ports[i];
> > +
> > +   list_for_each(pos, &port->skbs) {
> > +   entry = list_entry(pos, struct ocelot_skb, head);
> > +
> > +   list_del(pos);
> 
> list_for_each_safe

Right, I'll fix this for v2.

Thanks!
Antoine

-- 
Antoine Ténart, Bootlin
Embedded Linux and Kernel engineering
https://bootlin.com

Re: bug: tpacket_snd can cause data corruption

2019-07-05 Thread Frank de Brabander


On 05-07-19 00:59, Willem de Bruijn wrote:


Can you reproduce the issue when running the modified test in a
network namespace (./in_netns.sh ./txring_overwrite)?

But even when running the test with ./in_netns.sh it shows
"wrong pattern", this time without length mismatches:

wrong pattern: 0x62 != 0x61
wrong pattern: 0x62 != 0x61
wrong pattern: 0x62 != 0x61
wrong pattern: 0x62 != 0x61
wrong pattern: 0x62 != 0x61
wrong pattern: 0x62 != 0x61
wrong pattern: 0x62 != 0x61
wrong pattern: 0x62 != 0x61
wrong pattern: 0x62 != 0x61
wrong pattern: 0x62 != 0x61

As already mentioned, it seems to trigger mainly (only ?) when
a USB device is connected. The PC I'm testing this on has a
USB hub with many ports and connected devices. When connecting
this USB hub, the number of "wrong pattern" errors that are
shown seems to correlate to the number of new devices
that the kernel should detect. Connecting in a single USB device
also triggers the error, but not on every attempt.

Unfortunately have not found any other way to force the
error to trigger. E.g. running stress-ng to generate CPU load or
timer interrupts does not seem to have any impact.

Interesting, thanks for testing. No exact idea so far. The USB devices
are not necessarily network devices, I suppose? I don't immediately
have a setup to test the usb hotplug, so cannot yet reproduce the bug.

It triggers with different types of USB devices. Verified the
bug can trigger with a USB flash drive, mouse, USB-serial
adapter and USB hub (also with no devices connected).

It can trigger when the USB device is connected as well as when
it's disconnected. But there is a bit of luck needed, it can take
a bunch of times before it happens. Using a large USB hub with
many connected devices will trigger it much easier.

[PATCH net-next] nfp: tls: fix error return code in nfp_net_tls_add()

2019-07-05 Thread Wei Yongjun

Fix to return negative error code -EINVAL from the error handling
case instead of 0, as done elsewhere in this function.

Fixes: 1f35a56cf586 ("nfp: tls: add/delete TLS TX connections")
Signed-off-by: Wei Yongjun 
---
 drivers/net/ethernet/netronome/nfp/crypto/tls.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/netronome/nfp/crypto/tls.c 
b/drivers/net/ethernet/netronome/nfp/crypto/tls.c
index 3ee829d69c04..9f7ccb7da417 100644
--- a/drivers/net/ethernet/netronome/nfp/crypto/tls.c
+++ b/drivers/net/ethernet/netronome/nfp/crypto/tls.c
@@ -344,6 +344,7 @@ nfp_net_tls_add(struct net_device *netdev, struct sock *sk,
 
if (!reply->handle[0] && !reply->handle[1]) {
nn_dp_warn(&nn->dp, "FW returned NULL handle\n");
+   err = -EINVAL;
goto err_fw_remove;
}

Re: [PATCH bpf-next] tools: bpftool: add "prog run" subcommand to test-run programs

2019-07-05 Thread Quentin Monnet

2019-07-04 22:49 UTC-0700 ~ Y Song 
> On Thu, Jul 4, 2019 at 1:58 AM Quentin Monnet
>  wrote:
>>
>> Add a new "bpftool prog run" subcommand to run a loaded program on input
>> data (and possibly with input context) passed by the user.
>>
>> Print output data (and output context if relevant) into a file or into
>> the console. Print return value and duration for the test run into the
>> console.
>>
>> A "repeat" argument can be passed to run the program several times in a
>> row.
>>
>> The command does not perform any kind of verification based on program
>> type (Is this program type allowed to use an input context?) or on data
>> consistency (Can I work with empty input data?), this is left to the
>> kernel.
>>
>> Example invocation:
>>
>> # perl -e 'print "\x0" x 14' | ./bpftool prog run \
>> pinned /sys/fs/bpf/sample_ret0 \
>> data_in - data_out - repeat 5
>> 000         |  ..
>> Return value: 0, duration (average): 260ns
>>
>> When one of data_in or ctx_in is "-", bpftool reads from standard input,
>> in binary format. Other formats (JSON, hexdump) might be supported (via
>> an optional command line keyword like "data_fmt_in") in the future if
>> relevant, but this would require doing more parsing in bpftool.
>>
>> Signed-off-by: Quentin Monnet 
>> Reviewed-by: Jakub Kicinski 
>> ---

[...]

>> diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
>> index 9b0db5d14e31..8dcbaa0a8ab1 100644
>> --- a/tools/bpf/bpftool/prog.c
>> +++ b/tools/bpf/bpftool/prog.c
>> @@ -15,6 +15,7 @@
>>  #include 
>>
>>  #include 
>> +#include 
>>
>>  #include 
>>  #include 
>> @@ -748,6 +749,344 @@ static int do_detach(int argc, char **argv)
>> return 0;
>>  }
>>
>> +static int check_single_stdin(char *file_in, char *other_file_in)
>> +{
>> +   if (file_in && other_file_in &&
>> +   !strcmp(file_in, "-") && !strcmp(other_file_in, "-")) {
>> +   p_err("cannot use standard input for both data_in and 
>> ctx_in");
> 
> The error message says data_in and ctx_in.
> Maybe the input parameter should be file_data_in and file_ctx_in?


Hi Yonghong,

It's true those parameters should be file names. But having
"file_data_in", "file_data_out", "file_ctx_in" and "file_ctx_out" on a
command line seems a bit heavy to me? (And relying on keyword prefixing
for typing the command won't help much.)

My opinion is that it should be clear from the man page or the "help"
command that the parameters are file names. What do you think? I can
prefix all four arguments with "file_" if you believe this is better.

[...]

>> +static int do_run(int argc, char **argv)
>> +{
>> +   char *data_fname_in = NULL, *data_fname_out = NULL;
>> +   char *ctx_fname_in = NULL, *ctx_fname_out = NULL;
>> +   struct bpf_prog_test_run_attr test_attr = {0};
>> +   const unsigned int default_size = SZ_32K;
>> +   void *data_in = NULL, *data_out = NULL;
>> +   void *ctx_in = NULL, *ctx_out = NULL;
>> +   unsigned int repeat = 1;
>> +   int fd, err;
>> +
>> +   if (!REQ_ARGS(4))
>> +   return -1;
>> +
>> +   fd = prog_parse_fd(&argc, &argv);
>> +   if (fd < 0)
>> +   return -1;
>> +
>> +   while (argc) {
>> +   if (detect_common_prefix(*argv, "data_in", "data_out",
>> +"data_size_out", NULL))
>> +   return -1;
>> +   if (detect_common_prefix(*argv, "ctx_in", "ctx_out",
>> +"ctx_size_out", NULL))
>> +   return -1;
>> +
>> +   if (is_prefix(*argv, "data_in")) {
>> +   NEXT_ARG();
>> +   if (!REQ_ARGS(1))
>> +   return -1;
>> +
>> +   data_fname_in = GET_ARG();
>> +   if (check_single_stdin(data_fname_in, ctx_fname_in))
>> +   return -1;
>> +   } else if (is_prefix(*argv, "data_out")) {
> 
> Here, we all use is_prefix() to match "data_in", "data_out",
> "data_size_out" etc.
> That means users can use "data_i" instead of "data_in" as below
>... | ./bpftool prog run id 283 data_i - data_out - repeat 5
> is this expected?
Yes, this is expected. We use prefix matching as we do pretty much
everywhere else in bpftool. It's not as useful here because most of the
strings for the names are similar. I agree that typing "data_i" instead
of "data_in" brings little advantage, but I see no reason why we should
reject prefixing for those keywords. And we accept "data_s" instead of
"data_size_out", which is still shorter to type than the complete keyword.

Thanks for the review!
Quentin

RE: [EXT] Re: [PATCH net-next v2 4/4] qed*: Add devlink support for configuration attributes.

2019-07-05 Thread Sudarsana Reddy Kalluru

> -Original Message-
> From: Jakub Kicinski 
> Sent: Friday, July 5, 2019 3:38 AM
> To: Sudarsana Reddy Kalluru 
> Cc: da...@davemloft.net; netdev@vger.kernel.org; Michal Kalderon
> ; Ariel Elior ; Jiri Pirko
> 
> Subject: [EXT] Re: [PATCH net-next v2 4/4] qed*: Add devlink support for
> configuration attributes.
> 
> External Email
> 
> --
> On Thu, 4 Jul 2019 06:20:11 -0700, Sudarsana Reddy Kalluru wrote:
> > This patch adds implementation for devlink callbacks for reading and
> > configuring the device attributes.
> >
> > Signed-off-by: Sudarsana Reddy Kalluru 
> > Signed-off-by: Ariel Elior 
> > ---
> >  Documentation/networking/devlink-params-qede.txt |  72 
> >  drivers/net/ethernet/qlogic/qed/qed_main.c   |  38 +
> >  drivers/net/ethernet/qlogic/qede/qede.h  |   3 +
> >  drivers/net/ethernet/qlogic/qede/qede_devlink.c  | 202
> > ++-
> drivers/net/ethernet/qlogic/qede/qede_devlink.h  |  23 +++
> >  include/linux/qed/qed_if.h   |  16 ++
> >  6 files changed, 353 insertions(+), 1 deletion(-)  create mode 100644
> > Documentation/networking/devlink-params-qede.txt
> >
> > diff --git a/Documentation/networking/devlink-params-qede.txt
> > b/Documentation/networking/devlink-params-qede.txt
> > new file mode 100644
> > index 000..f78a993
> > --- /dev/null
> > +++ b/Documentation/networking/devlink-params-qede.txt
> > @@ -0,0 +1,72 @@
> > +enable_sriov   [DEVICE, GENERIC]
> > +   Configuration mode: Permanent
> > +
> > +iwarp_cmt  [DEVICE, DRIVER-SPECIFIC]
> > +   Enable iWARP support over 100G device (CMT
> mode).
> > +   Type: Boolean
> > +   Configuration mode: runtime
> > +
> > +entity_id  [DEVICE, DRIVER-SPECIFIC]
> > +   Set the entity ID value to be used for this device
> > +   while reading/configuring the devlink attributes.
> > +   Type: u8
> > +   Configuration mode: runtime
> 
> Can you explain what this is?
Hardware/mfw provides the option to modify/read the config of other PFs. A
non-zero entity id represents a partition number (or simply a PF-id) for which
the config needs to be read/updated.

> 
> > +device_capabilities[DEVICE, DRIVER-SPECIFIC]
> > +   Set the entity ID value to be used for this device
> > +   while reading/configuring the devlink attributes.
> > +   Type: u8
> > +   Configuration mode: runtime
> 
> Looks like you copied the previous text here.
Will update it, thanks.

> 
> > +mf_mode[DEVICE, DRIVER-SPECIFIC]
> > +   Configure Multi Function mode for the device.
> > +   Supported MF modes and the assoicated values are,
> > +   MF allowed(0), Default(1), SPIO4(2), NPAR1.0(3),
> > +   NPAR1.5(4), NPAR2.0(5), BD(6) and UFP(7)
> 
> NPAR should have a proper API in devlink port, what are the other modes?
> 
These are the different modes supported by the Marvell NIC. In our case the 
mf_mode is per adapter basis, e.g., it's not possible to configure one port in 
NPAR mode and the other in Default mode.

> > +   Type: u8
> > +   Configuration mode: Permanent
> > +
> > +dcbx_mode  [PORT, DRIVER-SPECIFIC]
> > +   Configure DCBX mode for the device.
> > +   Supported dcbx modes are,
> > +   Disabled(0), IEEE(1), CEE(2) and Dynamic(3)
> > +   Type: u8
> > +   Configuration mode: Permanent
> 
> Why is this a permanent parameter?
> 
This specifies the dcbx_mode to be configured in non-volatile memory. The value 
is persistent and is used in the next load of OS or the mfw.

> > +preboot_oprom  [PORT, DRIVER-SPECIFIC]
> > +   Enable Preboot Option ROM.
> > +   Type: Boolean
> > +   Configuration mode: Permanent
> 
> This should definitely not be a driver specific toggle.
> 
> > +preboot_boot_protocol  [PORT, DRIVER-SPECIFIC]
> > +   Configure preboot Boot protocol.
> > +   Possible values are,
> > +   PXE(0), iSCSI Boot(3), FCoE Boot(4) and NONE(7)
> > +   Type: u8
> > +   Configuration mode: Permanent
> 
> Ditto.
> 
> > +preboot_vlan   [PORT, DRIVER-SPECIFIC]
> > +   Preboot VLAN.
> > +   Type: u16
> > +   Configuration mode: Permanent
> > +
> > +preboot_vlan_value [PORT, DRIVER-SPECIFIC]
> > +   Configure Preboot VLAN value.
> > +   Type: u16
> > +   Configuration mode: Permanent
> 
> And these.
Sure, will add generic definitions for these.

> 
> > +mba_delay_

[PATCH 4/7] ipsec: select crypto ciphers for xfrm_algo

2019-07-05 Thread Steffen Klassert

From: Arnd Bergmann 

kernelci.org reports failed builds on arc because of what looks
like an old missed 'select' statement:

net/xfrm/xfrm_algo.o: In function `xfrm_probe_algs':
xfrm_algo.c:(.text+0x1e8): undefined reference to `crypto_has_ahash'

I don't see this in randconfig builds on other architectures, but
it's fairly clear we want to select the hash code for it, like we
do for all its other users. As Herbert points out, CRYPTO_BLKCIPHER
is also required even though it has not popped up in build tests.

Fixes: 17bc19702221 ("ipsec: Use skcipher and ahash when probing algorithms")
Signed-off-by: Arnd Bergmann 
Acked-by: Herbert Xu 
Signed-off-by: Steffen Klassert 
---
 net/xfrm/Kconfig | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig
index 1ec8071226b2..06a6928d0e62 100644
--- a/net/xfrm/Kconfig
+++ b/net/xfrm/Kconfig
@@ -14,6 +14,8 @@ config XFRM_ALGO
tristate
select XFRM
select CRYPTO
+   select CRYPTO_HASH
+   select CRYPTO_BLKCIPHER
 
 if INET
 config XFRM_USER
-- 
2.17.1

[PATCH 3/7] xfrm: fix sa selector validation

2019-07-05 Thread Steffen Klassert

From: Nicolas Dichtel 

After commit b38ff4075a80, the following command does not work anymore:
$ ip xfrm state add src 10.125.0.2 dst 10.125.0.1 proto esp spi 34 reqid 1 \
  mode tunnel enc 'cbc(aes)' 0xb0abdba8b782ad9d364ec81e3a7d82a1 auth-trunc \
  'hmac(sha1)' 0xe26609ebd00acb6a4d51fca13e49ea78a72c73e6 96 flag align4

In fact, the selector is not mandatory, allow the user to provide an empty
selector.

Fixes: b38ff4075a80 ("xfrm: Fix xfrm sel prefix length validation")
CC: Anirudh Gupta 
Signed-off-by: Nicolas Dichtel 
Acked-by: Herbert Xu 
Signed-off-by: Steffen Klassert 
---
 net/xfrm/xfrm_user.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 74a3d1e0ff63..6626564f1fb7 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -166,6 +166,9 @@ static int verify_newsa_info(struct xfrm_usersa_info *p,
}
 
switch (p->sel.family) {
+   case AF_UNSPEC:
+   break;
+
case AF_INET:
if (p->sel.prefixlen_d > 32 || p->sel.prefixlen_s > 32)
goto out;
-- 
2.17.1

[PATCH 5/7] xfrm: remove a duplicated assignment

2019-07-05 Thread Steffen Klassert

From: Cong Wang 

Fixes: 30846090a746 ("xfrm: policy: add sequence count to sync with hash 
resize")
Cc: Florian Westphal 
Cc: Steffen Klassert 
Signed-off-by: Cong Wang 
Signed-off-by: Steffen Klassert 
---
 net/xfrm/xfrm_policy.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 7a43ae6b2a44..7eefdc9be2a7 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -581,9 +581,6 @@ static void xfrm_bydst_resize(struct net *net, int dir)
spin_lock_bh(&net->xfrm.xfrm_policy_lock);
write_seqcount_begin(&xfrm_policy_hash_generation);
 
-   odst = rcu_dereference_protected(net->xfrm.policy_bydst[dir].table,
-   lockdep_is_held(&net->xfrm.xfrm_policy_lock));
-
odst = rcu_dereference_protected(net->xfrm.policy_bydst[dir].table,
lockdep_is_held(&net->xfrm.xfrm_policy_lock));
 
-- 
2.17.1

[PATCH 6/7] xfrm: policy: fix bydst hlist corruption on hash rebuild

2019-07-05 Thread Steffen Klassert

From: Florian Westphal 

syzbot reported the following splat:

BUG: KASAN: use-after-free in __write_once_size include/linux/compiler.h:221
BUG: KASAN: use-after-free in hlist_del_rcu include/linux/rculist.h:455
BUG: KASAN: use-after-free in xfrm_hash_rebuild+0xa0d/0x1000 
net/xfrm/xfrm_policy.c:1318
Write of size 8 at addr 888095e79c00 by task kworker/1:3/8066
Workqueue: events xfrm_hash_rebuild
Call Trace:
 __write_once_size include/linux/compiler.h:221 [inline]
 hlist_del_rcu include/linux/rculist.h:455 [inline]
 xfrm_hash_rebuild+0xa0d/0x1000 net/xfrm/xfrm_policy.c:1318
 process_one_work+0x814/0x1130 kernel/workqueue.c:2269
Allocated by task 8064:
 __kmalloc+0x23c/0x310 mm/slab.c:3669
 kzalloc include/linux/slab.h:742 [inline]
 xfrm_hash_alloc+0x38/0xe0 net/xfrm/xfrm_hash.c:21
 xfrm_policy_init net/xfrm/xfrm_policy.c:4036 [inline]
 xfrm_net_init+0x269/0xd60 net/xfrm/xfrm_policy.c:4120
 ops_init+0x336/0x420 net/core/net_namespace.c:130
 setup_net+0x212/0x690 net/core/net_namespace.c:316

The faulting address is the address of the old chain head,
free'd by xfrm_hash_resize().

In xfrm_hash_rebuild(), chain heads get re-initialized without
any hlist_del_rcu:

 for (i = hmask; i >= 0; i--)
INIT_HLIST_HEAD(odst + i);

Then, hlist_del_rcu() gets called on the about to-be-reinserted policy
when iterating the per-net list of policies.

hlist_del_rcu() will then make chain->first be nonzero again:

static inline void __hlist_del(struct hlist_node *n)
{
   struct hlist_node *next = n->next;   // address of next element in list
   struct hlist_node **pprev = n->pprev;// location of previous elem, this
// can point at chain->first
WRITE_ONCE(*pprev, next);   // chain->first points to next elem
if (next)
next->pprev = pprev;

Then, when we walk chainlist to find insertion point, we may find a
non-empty list even though we're supposedly reinserting the first
policy to an empty chain.

To fix this first unlink all exact and inexact policies instead of
zeroing the list heads.

Add the commands equivalent to the syzbot reproducer to xfrm_policy.sh.
Without the fix, KASAN catches the corruption as it happens; SLUB poisoning
detects it a bit later.

Reported-by: syzbot+0165480d4ef07360e...@syzkaller.appspotmail.com
Fixes: 1548bc4e0512 ("xfrm: policy: delete inexact policies from inexact list 
on hash rebuild")
Signed-off-by: Florian Westphal 
Signed-off-by: Steffen Klassert 
---
 net/xfrm/xfrm_policy.c | 12 ++
 tools/testing/selftests/net/xfrm_policy.sh | 27 +-
 2 files changed, 33 insertions(+), 6 deletions(-)

diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 7eefdc9be2a7..c411662141ae 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1276,13 +1276,17 @@ static void xfrm_hash_rebuild(struct work_struct *work)
 
hlist_for_each_entry_safe(policy, n,
  &net->xfrm.policy_inexact[dir],
- bydst_inexact_list)
+ bydst_inexact_list) {
+   hlist_del_rcu(&policy->bydst);
hlist_del_init(&policy->bydst_inexact_list);
+   }
 
hmask = net->xfrm.policy_bydst[dir].hmask;
odst = net->xfrm.policy_bydst[dir].table;
-   for (i = hmask; i >= 0; i--)
-   INIT_HLIST_HEAD(odst + i);
+   for (i = hmask; i >= 0; i--) {
+   hlist_for_each_entry_safe(policy, n, odst + i, bydst)
+   hlist_del_rcu(&policy->bydst);
+   }
if ((dir & XFRM_POLICY_MASK) == XFRM_POLICY_OUT) {
/* dir out => dst = remote, src = local */
net->xfrm.policy_bydst[dir].dbits4 = rbits4;
@@ -1311,8 +1315,6 @@ static void xfrm_hash_rebuild(struct work_struct *work)
chain = policy_hash_bysel(net, &policy->selector,
  policy->family, dir);
 
-   hlist_del_rcu(&policy->bydst);
-
if (!chain) {
void *p = xfrm_policy_inexact_insert(policy, dir, 0);
 
diff --git a/tools/testing/selftests/net/xfrm_policy.sh 
b/tools/testing/selftests/net/xfrm_policy.sh
index 71d7fdc513c1..5445943bf07f 100755
--- a/tools/testing/selftests/net/xfrm_policy.sh
+++ b/tools/testing/selftests/net/xfrm_policy.sh
@@ -257,6 +257,29 @@ check_exceptions()
return $lret
 }
 
+check_hthresh_repeat()
+{
+   local log=$1
+   i=0
+
+   for i in $(seq 1 10);do
+   ip -net ns1 xfrm policy update src e000:0001:: dst 
ff01::0014::0001 dir in tmpl src :: dst :: proto esp mode tunnel priority 
100 action allow || break
+   ip -net ns1 xfrm policy set hthresh6 0 28 || break
+
+   ip -net ns1 xfrm policy update src e000:0001

[PATCH 7/7] xfrm interface: fix memory leak on creation

2019-07-05 Thread Steffen Klassert

From: Nicolas Dichtel 

The following commands produce a backtrace and return an error but the xfrm
interface is created (in the wrong netns):
$ ip netns add foo
$ ip netns add bar
$ ip -n foo netns set bar 0
$ ip -n foo link add xfrmi0 link-netnsid 0 type xfrm dev lo if_id 23
RTNETLINK answers: Invalid argument
$ ip -n bar link ls xfrmi0
2: xfrmi0@lo:  mtu 1500 qdisc noop state DOWN mode DEFAULT group 
default qlen 1000
link/none 00:00:00:00:00:00 brd 00:00:00:00:00:00

Here is the backtrace:
[   79.879174] WARNING: CPU: 0 PID: 1178 at net/core/dev.c:8172 
rollback_registered_many+0x86/0x3c1
[   79.880260] Modules linked in: xfrm_interface nfsv3 nfs_acl auth_rpcgss 
nfsv4 nfs lockd grace sunrpc fscache button parport_pc parport serio_raw evdev 
pcspkr loop ext4 crc16 mbcache jbd2 crc32c_generic ide_cd_mod ide_gd_mod cdrom 
ata_$
eneric ata_piix libata scsi_mod 8139too piix psmouse i2c_piix4 ide_core 8139cp 
mii i2c_core floppy
[   79.883698] CPU: 0 PID: 1178 Comm: ip Not tainted 5.2.0-rc6+ #106
[   79.884462] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
1.10.2-1 04/01/2014
[   79.885447] RIP: 0010:rollback_registered_many+0x86/0x3c1
[   79.886120] Code: 01 e8 d7 7d c6 ff 0f 0b 48 8b 45 00 4c 8b 20 48 8d 58 90 
49 83 ec 70 48 8d 7b 70 48 39 ef 74 44 8a 83 d0 04 00 00 84 c0 75 1f <0f> 0b e8 
61 cd ff ff 48 b8 00 01 00 00 00 00 ad de 48 89 43 70 66
[   79.888667] RSP: 0018:c900015ab740 EFLAGS: 00010246
[   79.889339] RAX: 8882353e5700 RBX: 8882353e56a0 RCX: 8882353e5710
[   79.890174] RDX: c900015ab7e0 RSI: c900015ab7e0 RDI: 8882353e5710
[   79.891029] RBP: c900015ab7e0 R08: c900015ab7e0 R09: c900015ab7e0
[   79.891866] R10: c900015ab7a0 R11: 82233fec R12: c900015ab770
[   79.892728] R13: 81eb7ec0 R14: 88822ed6cf00 R15: ffea
[   79.893557] FS:  7ff350f31740() GS:888237a0() 
knlGS:
[   79.894581] CS:  0010 DS:  ES:  CR0: 80050033
[   79.895317] CR2: 006c8580 CR3: 00022c272000 CR4: 06f0
[   79.896137] Call Trace:
[   79.896464]  unregister_netdevice_many+0x12/0x6c
[   79.896998]  __rtnl_newlink+0x6e2/0x73b
[   79.897446]  ? __kmalloc_node_track_caller+0x15e/0x185
[   79.898039]  ? pskb_expand_head+0x5f/0x1fe
[   79.898556]  ? stack_access_ok+0xd/0x2c
[   79.899009]  ? deref_stack_reg+0x12/0x20
[   79.899462]  ? stack_access_ok+0xd/0x2c
[   79.899927]  ? stack_access_ok+0xd/0x2c
[   79.900404]  ? __module_text_address+0x9/0x4f
[   79.900910]  ? is_bpf_text_address+0x5/0xc
[   79.901390]  ? kernel_text_address+0x67/0x7b
[   79.901884]  ? __kernel_text_address+0x1a/0x25
[   79.902397]  ? unwind_get_return_address+0x12/0x23
[   79.903122]  ? __cmpxchg_double_slab.isra.37+0x46/0x77
[   79.903772]  rtnl_newlink+0x43/0x56
[   79.904217]  rtnetlink_rcv_msg+0x200/0x24c

In fact, each time a xfrm interface was created, a netdev was allocated
by __rtnl_newlink()/rtnl_create_link() and then another one by
xfrmi_newlink()/xfrmi_create(). Only the second one was registered, which
is why the previous commands produce a backtrace: dev_change_net_namespace()
was called on a netdev with reg_state set to NETREG_UNINITIALIZED (the
first one).

CC: Lorenzo Colitti 
CC: Benedict Wong 
CC: Steffen Klassert 
CC: Shannon Nelson 
CC: Antony Antony 
CC: Eyal Birger 
Fixes: f203b76d7809 ("xfrm: Add virtual xfrm interfaces")
Reported-by: Julien Floret 
Signed-off-by: Nicolas Dichtel 
Signed-off-by: Steffen Klassert 
---
 net/xfrm/xfrm_interface.c | 98 +++
 1 file changed, 28 insertions(+), 70 deletions(-)

diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c
index ad3a2555c517..7dbe0c608df5 100644
--- a/net/xfrm/xfrm_interface.c
+++ b/net/xfrm/xfrm_interface.c
@@ -133,7 +133,7 @@ static void xfrmi_dev_free(struct net_device *dev)
free_percpu(dev->tstats);
 }
 
-static int xfrmi_create2(struct net_device *dev)
+static int xfrmi_create(struct net_device *dev)
 {
struct xfrm_if *xi = netdev_priv(dev);
struct net *net = dev_net(dev);
@@ -156,54 +156,7 @@ static int xfrmi_create2(struct net_device *dev)
return err;
 }
 
-static struct xfrm_if *xfrmi_create(struct net *net, struct xfrm_if_parms *p)
-{
-   struct net_device *dev;
-   struct xfrm_if *xi;
-   char name[IFNAMSIZ];
-   int err;
-
-   if (p->name[0]) {
-   strlcpy(name, p->name, IFNAMSIZ);
-   } else {
-   err = -EINVAL;
-   goto failed;
-   }
-
-   dev = alloc_netdev(sizeof(*xi), name, NET_NAME_UNKNOWN, 
xfrmi_dev_setup);
-   if (!dev) {
-   err = -EAGAIN;
-   goto failed;
-   }
-
-   dev_net_set(dev, net);
-
-   xi = netdev_priv(dev);
-   xi->p = *p;
-   xi->net = net;
-   xi->dev = dev;
-   xi->phydev = dev_get_by_index(net, p->link);
-   if (!xi->phydev) {
-   err = -ENODEV;
-   go

[PATCH 1/7] xfrm: Fix xfrm sel prefix length validation

2019-07-05 Thread Steffen Klassert

From: Anirudh Gupta 

Family of src/dst can be different from family of selector src/dst.
Use xfrm selector family to validate address prefix length,
while verifying new sa from userspace.

Validated patch with this command:
ip xfrm state add src 1.1.6.1 dst 1.1.6.2 proto esp spi 4260196 \
reqid 20004 mode tunnel aead "rfc4106(gcm(aes))" \
0x01640001 128 \
sel src 1011:1:4::2/128 sel dst 1021:1:4::2/128 dev Port5

Fixes: 07bf7908950a ("xfrm: Validate address prefix lengths in the xfrm 
selector.")
Signed-off-by: Anirudh Gupta 
Acked-by: Herbert Xu 
Signed-off-by: Steffen Klassert 
---
 net/xfrm/xfrm_user.c | 16 
 1 file changed, 16 insertions(+)

diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index eb8d14389601..74a3d1e0ff63 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -150,6 +150,22 @@ static int verify_newsa_info(struct xfrm_usersa_info *p,
 
err = -EINVAL;
switch (p->family) {
+   case AF_INET:
+   break;
+
+   case AF_INET6:
+#if IS_ENABLED(CONFIG_IPV6)
+   break;
+#else
+   err = -EAFNOSUPPORT;
+   goto out;
+#endif
+
+   default:
+   goto out;
+   }
+
+   switch (p->sel.family) {
case AF_INET:
if (p->sel.prefixlen_d > 32 || p->sel.prefixlen_s > 32)
goto out;
-- 
2.17.1

pull request (net): ipsec 2019-07-05

2019-07-05 Thread Steffen Klassert

1)  Fix xfrm selector prefix length validation for
inter address family tunneling.
From Anirudh Gupta.

2) Fix a memleak in pfkey.
   From Jeremy Sowden.

3) Fix SA selector validation to allow empty selectors again.
   From Nicolas Dichtel.

4) Select crypto ciphers for xfrm_algo, this fixes some
   randconfig builds. From Arnd Bergmann.

5) Remove a duplicated assignment in xfrm_bydst_resize.
   From Cong Wang.

6) Fix a hlist corruption on hash rebuild.
   From Florian Westphal.

7) Fix a memory leak when creating xfrm interfaces.
   From Nicolas Dichtel.

Please pull or let me know if there are problems.

Thanks!

The following changes since commit af8f3fb7fb077c9df9fed97113a031e792163def:

  net: stmmac: dma channel control register need to be init first (2019-05-20 
20:55:39 -0400)

are available in the Git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/klassert/ipsec.git master

for you to fetch changes up to 56c5ee1a5823e9cf5288b84ae6364cb4112f8225:

  xfrm interface: fix memory leak on creation (2019-07-03 10:53:06 +0200)


Anirudh Gupta (1):
  xfrm: Fix xfrm sel prefix length validation

Arnd Bergmann (1):
  ipsec: select crypto ciphers for xfrm_algo

Cong Wang (1):
  xfrm: remove a duplicated assignment

Florian Westphal (1):
  xfrm: policy: fix bydst hlist corruption on hash rebuild

Jeremy Sowden (1):
  af_key: fix leaks in key_pol_get_resp and dump_sp.

Nicolas Dichtel (2):
  xfrm: fix sa selector validation
  xfrm interface: fix memory leak on creation

 net/key/af_key.c   |  8 ++-
 net/xfrm/Kconfig   |  2 +
 net/xfrm/xfrm_interface.c  | 98 +-
 net/xfrm/xfrm_policy.c | 15 +++--
 net/xfrm/xfrm_user.c   | 19 ++
 tools/testing/selftests/net/xfrm_policy.sh | 27 +++-
 6 files changed, 88 insertions(+), 81 deletions(-)

[PATCH 2/7] af_key: fix leaks in key_pol_get_resp and dump_sp.

2019-07-05 Thread Steffen Klassert

From: Jeremy Sowden 

In both functions, if pfkey_xfrm_policy2msg failed we leaked the newly
allocated sk_buff.  Free it on error.

Fixes: 55569ce256ce ("Fix conversion between IPSEC_MODE_xxx and XFRM_MODE_xxx.")
Reported-by: syzbot+4f0529365f7f2208d...@syzkaller.appspotmail.com
Signed-off-by: Jeremy Sowden 
Signed-off-by: Steffen Klassert 
---
 net/key/af_key.c | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/net/key/af_key.c b/net/key/af_key.c
index 4af1e1d60b9f..51c0f10bb131 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -2442,8 +2442,10 @@ static int key_pol_get_resp(struct sock *sk, struct 
xfrm_policy *xp, const struc
goto out;
}
err = pfkey_xfrm_policy2msg(out_skb, xp, dir);
-   if (err < 0)
+   if (err < 0) {
+   kfree_skb(out_skb);
goto out;
+   }
 
out_hdr = (struct sadb_msg *) out_skb->data;
out_hdr->sadb_msg_version = hdr->sadb_msg_version;
@@ -2694,8 +2696,10 @@ static int dump_sp(struct xfrm_policy *xp, int dir, int 
count, void *ptr)
return PTR_ERR(out_skb);
 
err = pfkey_xfrm_policy2msg(out_skb, xp, dir);
-   if (err < 0)
+   if (err < 0) {
+   kfree_skb(out_skb);
return err;
+   }
 
out_hdr = (struct sadb_msg *) out_skb->data;
out_hdr->sadb_msg_version = pfk->dump.msg_version;
-- 
2.17.1

Re: [PATCH net-next 8/8] net: mscc: PTP Hardware Clock (PHC) support

2019-07-05 Thread Antoine Tenart

Hello Eric,

On Mon, Jul 01, 2019 at 05:54:41PM +0200, Eric Dumazet wrote:
> On 7/1/19 8:12 AM, Willem de Bruijn wrote:
> > On Mon, Jul 1, 2019 at 6:05 AM Antoine Tenart
> >  wrote:
> >>
> >>  void ocelot_deinit(struct ocelot *ocelot)
> >>  {
> >> +   struct ocelot_port *port;
> >> +   struct ocelot_skb *entry;
> >> +   struct list_head *pos;
> >> +   int i;
> >> +
> >> destroy_workqueue(ocelot->stats_queue);
> >> mutex_destroy(&ocelot->stats_lock);
> >> ocelot_ace_deinit();
> >> +
> >> +   for (i = 0; i < ocelot->num_phys_ports; i++) {
> >> +   port = ocelot->ports[i];
> >> +
> >> +   list_for_each(pos, &port->skbs) {
> >> +   entry = list_entry(pos, struct ocelot_skb, head);
> >> +
> >> +   list_del(pos);
> > 
> > list_for_each_safe
> 
> Also entry->skb seems to be leaked ?
> 
> dev_kfree_skb_any(entry->skb) seems to be needed

That's right, thanks for spotting this!

Thanks,
Antoine

-- 
Antoine Ténart, Bootlin
Embedded Linux and Kernel engineering
https://bootlin.com

Re: [PATCH net-next 01/16] qlge: Remove irq_cnt

2019-07-05 Thread Benjamin Poirier

On 2019/06/26 13:21, Manish Chopra wrote:
> > In msix mode there's no need to explicitly disable completion interrupts, 
> > they
> > are reliably auto-masked, according to my observations.
> > I tested this on two QLE8142 adapters.
> > 
> > Do you have reason to believe this might not always be the case?
> 
> How did you check auto-masking of MSI-X interrupts ?
> I was just wondering about the below comment in 
> ql_disable_completion_interrupt(), where for MSI-X it does disable completion 
> intr for zeroth intr.
> Seems special case for zeroth intr in MSI-X particular to this device.
> 
> /* HW disables for us if we're MSIX multi interrupts and
>  * it's not the default (zeroeth) interrupt.
>  */
> if (likely(test_bit(QL_MSIX_ENABLED, &qdev->flags) && intr))
> return 0;
> 

I checked again and arrived at the same conclusion: in msix mode,
completion interrupts are masked automatically and the adapter does not
raise interrupts until they are enabled at the end of napi polling. That
includes queue 0.

I checked by adding some tracepoints and sending traffic using pktgen.
All udp traffic goes to queue 0 with qlge. Over a 100s interval I got
2970339 q0 interrupts. In all cases, INTR_EN_EN was unset for q0.
Moreover, there were no interrupts that were raised while we were sure
that interrupts were expected to be disabled. I also tested with icmp
and multiple streams of tcp traffic and got similar results.

The driver patch for tracing as well as the analysis script are at the
bottom of this mail. I use them like so:
root@dtest:~# trace-cmd record -C global -b 100 -s 100 -e 
qlge:compirq_* -f "intr == 0" -e qlge:q0_intr sleep 100
[...]
root@dtest:~# trace-cmd report -l | ./report.awk | awk '{print $1}' | sort | 
uniq -c

It took me a few days to reply because while doing that testing I
actually found another problem. It is present before this patch set. In
INTx mode, ql_disable_completion_interrupt() does not immediately
prevent the adapter from raising interrupts. Redoing a similar test as
the previous one while forcing INTx mode via qlge_irq_type=2, I get
something like this:
4966280 0x4300
   6565 0xc300
 137749 def_bad
   7094 ISR1_0

First, we can see what I already wrote in this patch:
+   /* Experience shows that when using INTx interrupts, the device does
+* not always auto-mask the interrupt.
(The 0xc300 values include INTR_EN_EN)
Second, we can see 137749 instances of interrupts while we were
expecting interrupt generation to be disabled.

If I disable interrupts using INTR_EN_EI instead, I get something like
this:
4672919 0x4300
 75 0xc300
  2 ISR1_0

I'll be including a patch for this in the next iteration of this
patchset.

 report.awk 
#!/usr/bin/awk -f
#
# Classify queue-0 interrupt trace events by the completion-interrupt enable
# state that was last observed before each of them.
#
# Input: text trace lines, one event per line, matched by event name.
# Output: for every q0_intr line, field $10 is printed, optionally followed
# by an "ISR1_0", "def_bad <$3>" or "maybe_bad <$3>" line.
#
# The "enabled" variable tracks the state:
#   -1  nothing seen yet
#    0  a q0_intr line was seen and no enable event has followed it
#    1  a compirq_enable_b line was the last enable event seen
#    2  a compirq_enable_a line was the last enable event seen
# NOTE(review): the _b/_a events presumably bracket the INTR_EN register
# write (before/after) -- confirm against the driver tracing patch.

BEGIN {
enabled = -1;
}

# Enable sequence started.
/compirq_enable_b/ {
enabled = 1;
next;
}

# Enable sequence finished.
/compirq_enable_a/ {
enabled = 2;
next;
}

/q0_intr/ {
# INTR_EN
# NOTE(review): $10 is taken to be the INTR_EN value of the event; the field
# position depends on the trace output format -- confirm.
print $10;

# NOTE(review): the "0x" literal looks truncated (possibly by the mail
# archive); presumably it was meant to match an all-zero ISR1 value -- verify.
if ($14 == "0x") {
print "ISR1_0";
}

# An interrupt arriving while enabled == 0 means no enable event happened
# since the previous interrupt: report it as definitely unexpected.
# With enabled == 1 only the start of an enable sequence was seen, so the
# interrupt is reported as possibly unexpected.
# NOTE(review): $3 is presumably the event timestamp -- confirm.
if (enabled == 0) {
printf "def_bad "
print $3;
} else if (enabled == 1) {
printf "maybe_bad "
print $3;
}
# at this point we expect the irq to be masked, either automatically
# or explicitly
enabled = 0;
next;
}

 driver patch 

diff --git a/drivers/net/ethernet/qlogic/qlge/qlge_main.c 
b/drivers/net/ethernet/qlogic/qlge/qlge_main.c
index 9a99e0938f08..ab306963eef1 100644
--- a/drivers/net/ethernet/qlogic/qlge/qlge_main.c
+++ b/drivers/net/ethernet/qlogic/qlge/qlge_main.c
@@ -43,6 +43,9 @@
 
 #include "qlge.h"
 
+#define CREATE_TRACE_POINTS
+#include "qlge_trace.h"
+
 char qlge_driver_name[] = DRV_NAME;
 const char qlge_driver_version[] = DRV_VERSION;
 
@@ -641,16 +644,20 @@ u32 ql_enable_completion_interrupt(struct ql_adapter 
*qdev, u32 intr)
/* Always enable if we're MSIX multi interrupts and
 * it's not the default (zeroeth) interrupt.
 */
+   trace_compirq_enable_b(qdev, intr);
ql_write32(qdev, INTR_EN,
   ctx->intr_en_mask);
+   trace_compirq_enable_a(qdev, intr);
var = ql_read32(qdev, STS);
return var;
}
 
spin_lock_irqsave(&qdev->hw_lock, hw_flags);
if (atomic_dec_and_test(&ctx->irq_cnt)) {
+   trace_compirq_enable_b(qdev, intr);
ql_write32(qdev, INTR_EN,
   ctx->intr_en_mask);
+   trace_compirq_enable_a(qdev, intr);
var = ql_read32(qdev, STS);
}
spin_unlock_irqrestore(&qdev->hw_lock, hw_flags);
@@ -671,8 +678,10 @@ static u32 ql_disable_completion_interrupt(struct 
ql_adapter *qdev, u32 intr)
ctx = qdev->intr_context + intr;
spin_lock(&qdev->hw_lock);

[PATCH 6/9] xfrm: remove type and offload_type map from xfrm_state_afinfo

2019-07-05 Thread Steffen Klassert

From: Florian Westphal 

Only a handful of xfrm_types exist, no need to have 512 pointers for them.

Reduces size of afinfo struct from 4k to 120 bytes on 64bit platforms.

Also, the unregister function doesn't need to return an error, no single
caller does anything useful with it.

Just place a WARN_ON() where needed instead.

Signed-off-by: Florian Westphal 
Signed-off-by: Steffen Klassert 
---
 include/net/xfrm.h  |  16 +++-
 net/ipv4/ah4.c  |   3 +-
 net/ipv4/esp4.c |   3 +-
 net/ipv4/esp4_offload.c |   4 +-
 net/ipv4/ipcomp.c   |   3 +-
 net/ipv4/xfrm4_tunnel.c |   3 +-
 net/ipv6/ah6.c  |   4 +-
 net/ipv6/esp6.c |   3 +-
 net/ipv6/esp6_offload.c |   4 +-
 net/ipv6/ipcomp6.c  |   3 +-
 net/ipv6/mip6.c |   6 +-
 net/xfrm/xfrm_state.c   | 179 
 12 files changed, 150 insertions(+), 81 deletions(-)

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 812994ad49ac..56b31676e330 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -348,8 +348,16 @@ int __xfrm_state_delete(struct xfrm_state *x);
 struct xfrm_state_afinfo {
u8  family;
u8  proto;
-   const struct xfrm_type  *type_map[IPPROTO_MAX];
-   const struct xfrm_type_offload  *type_offload_map[IPPROTO_MAX];
+
+   const struct xfrm_type_offload *type_offload_esp;
+
+   const struct xfrm_type  *type_esp;
+   const struct xfrm_type  *type_ipip;
+   const struct xfrm_type  *type_ipip6;
+   const struct xfrm_type  *type_comp;
+   const struct xfrm_type  *type_ah;
+   const struct xfrm_type  *type_routing;
+   const struct xfrm_type  *type_dstopts;
 
int (*output)(struct net *net, struct sock *sk, 
struct sk_buff *skb);
int (*output_finish)(struct sock *sk, struct 
sk_buff *skb);
@@ -401,7 +409,7 @@ struct xfrm_type {
 };
 
 int xfrm_register_type(const struct xfrm_type *type, unsigned short family);
-int xfrm_unregister_type(const struct xfrm_type *type, unsigned short family);
+void xfrm_unregister_type(const struct xfrm_type *type, unsigned short family);
 
 struct xfrm_type_offload {
char*description;
@@ -413,7 +421,7 @@ struct xfrm_type_offload {
 };
 
 int xfrm_register_type_offload(const struct xfrm_type_offload *type, unsigned 
short family);
-int xfrm_unregister_type_offload(const struct xfrm_type_offload *type, 
unsigned short family);
+void xfrm_unregister_type_offload(const struct xfrm_type_offload *type, 
unsigned short family);
 
 static inline int xfrm_af2proto(unsigned int family)
 {
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 9c3afd550612..974179b3b314 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -590,8 +590,7 @@ static void __exit ah4_fini(void)
 {
if (xfrm4_protocol_deregister(&ah4_protocol, IPPROTO_AH) < 0)
pr_info("%s: can't remove protocol\n", __func__);
-   if (xfrm_unregister_type(&ah_type, AF_INET) < 0)
-   pr_info("%s: can't remove xfrm type\n", __func__);
+   xfrm_unregister_type(&ah_type, AF_INET);
 }
 
 module_init(ah4_init);
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index b9ae95576084..c06562aded11 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -1066,8 +1066,7 @@ static void __exit esp4_fini(void)
 {
if (xfrm4_protocol_deregister(&esp4_protocol, IPPROTO_ESP) < 0)
pr_info("%s: can't remove protocol\n", __func__);
-   if (xfrm_unregister_type(&esp_type, AF_INET) < 0)
-   pr_info("%s: can't remove xfrm type\n", __func__);
+   xfrm_unregister_type(&esp_type, AF_INET);
 }
 
 module_init(esp4_init);
diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c
index 8edcfa66d1e5..6e5288aef71e 100644
--- a/net/ipv4/esp4_offload.c
+++ b/net/ipv4/esp4_offload.c
@@ -315,9 +315,7 @@ static int __init esp4_offload_init(void)
 
 static void __exit esp4_offload_exit(void)
 {
-   if (xfrm_unregister_type_offload(&esp_type_offload, AF_INET) < 0)
-   pr_info("%s: can't remove xfrm type offload\n", __func__);
-
+   xfrm_unregister_type_offload(&esp_type_offload, AF_INET);
inet_del_offload(&esp4_offload, IPPROTO_ESP);
 }
 
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index 9119d012ba46..ee03f0a55152 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -190,8 +190,7 @@ static void __exit ipcomp4_fini(void)
 {
if (xfrm4_protocol_deregister(&ipcomp4_protocol, IPPROTO_COMP) < 0)
pr_info("%s: can't remove protocol\n", __func__);
-   if (xfrm_unregister_type(&ipcomp_type, AF_INET) < 0)
-   pr_info("%s: can't remove xfrm type\n", __func__);
+   xfrm_unregister_type(&ipcomp_type, AF_INET);
 }
 
 module_init(ipcomp4_init);
diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c
index 5d00e54cd319..dc19aff

[PATCH 9/9] xfrm: remove get_mtu indirection from xfrm_type

2019-07-05 Thread Steffen Klassert

From: Florian Westphal 

esp4_get_mtu and esp6_get_mtu are exactly the same, the only difference
is a single sizeof() (ipv4 vs. ipv6 header).

Merge both into xfrm_state_mtu() and remove the indirection.

Signed-off-by: Florian Westphal 
Signed-off-by: Steffen Klassert 
---
 include/net/xfrm.h |  4 +---
 net/ipv4/esp4.c| 27 +--
 net/ipv6/esp6.c| 20 +---
 net/xfrm/xfrm_device.c |  5 ++---
 net/xfrm/xfrm_state.c  | 34 +-
 5 files changed, 34 insertions(+), 56 deletions(-)

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 56b31676e330..b22db30c3d88 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -404,8 +404,6 @@ struct xfrm_type {
int (*reject)(struct xfrm_state *, struct sk_buff *,
  const struct flowi *);
int (*hdr_offset)(struct xfrm_state *, struct 
sk_buff *, u8 **);
-   /* Estimate maximal size of result of transformation of a dgram */
-   u32 (*get_mtu)(struct xfrm_state *, int size);
 };
 
 int xfrm_register_type(const struct xfrm_type *type, unsigned short family);
@@ -1546,7 +1544,7 @@ void xfrm_sad_getinfo(struct net *net, struct 
xfrmk_sadinfo *si);
 void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si);
 u32 xfrm_replay_seqhi(struct xfrm_state *x, __be32 net_seq);
 int xfrm_init_replay(struct xfrm_state *x);
-int xfrm_state_mtu(struct xfrm_state *x, int mtu);
+u32 xfrm_state_mtu(struct xfrm_state *x, int mtu);
 int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload);
 int xfrm_init_state(struct xfrm_state *x);
 int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type);
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index c06562aded11..5c967764041f 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -33,8 +33,6 @@ struct esp_output_extra {
 
 #define ESP_SKB_CB(__skb) ((struct esp_skb_cb *)&((__skb)->cb[0]))
 
-static u32 esp4_get_mtu(struct xfrm_state *x, int mtu);
-
 /*
  * Allocate an AEAD request structure with extra space for SG and IV.
  *
@@ -506,7 +504,7 @@ static int esp_output(struct xfrm_state *x, struct sk_buff 
*skb)
struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb);
u32 padto;
 
-   padto = min(x->tfcpad, esp4_get_mtu(x, dst->child_mtu_cached));
+   padto = min(x->tfcpad, xfrm_state_mtu(x, 
dst->child_mtu_cached));
if (skb->len < padto)
esp.tfclen = padto - skb->len;
}
@@ -788,28 +786,6 @@ static int esp_input(struct xfrm_state *x, struct sk_buff 
*skb)
return err;
 }
 
-static u32 esp4_get_mtu(struct xfrm_state *x, int mtu)
-{
-   struct crypto_aead *aead = x->data;
-   u32 blksize = ALIGN(crypto_aead_blocksize(aead), 4);
-   unsigned int net_adj;
-
-   switch (x->props.mode) {
-   case XFRM_MODE_TRANSPORT:
-   case XFRM_MODE_BEET:
-   net_adj = sizeof(struct iphdr);
-   break;
-   case XFRM_MODE_TUNNEL:
-   net_adj = 0;
-   break;
-   default:
-   BUG();
-   }
-
-   return ((mtu - x->props.header_len - crypto_aead_authsize(aead) -
-net_adj) & ~(blksize - 1)) + net_adj - 2;
-}
-
 static int esp4_err(struct sk_buff *skb, u32 info)
 {
struct net *net = dev_net(skb->dev);
@@ -1035,7 +1011,6 @@ static const struct xfrm_type esp_type =
.flags  = XFRM_TYPE_REPLAY_PROT,
.init_state = esp_init_state,
.destructor = esp_destroy,
-   .get_mtu= esp4_get_mtu,
.input  = esp_input,
.output = esp_output,
 };
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index b6c6b3e08836..a3b403ba8f8f 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -41,8 +41,6 @@ struct esp_skb_cb {
 
 #define ESP_SKB_CB(__skb) ((struct esp_skb_cb *)&((__skb)->cb[0]))
 
-static u32 esp6_get_mtu(struct xfrm_state *x, int mtu);
-
 /*
  * Allocate an AEAD request structure with extra space for SG and IV.
  *
@@ -447,7 +445,7 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff 
*skb)
struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb);
u32 padto;
 
-   padto = min(x->tfcpad, esp6_get_mtu(x, dst->child_mtu_cached));
+   padto = min(x->tfcpad, xfrm_state_mtu(x, 
dst->child_mtu_cached));
if (skb->len < padto)
esp.tfclen = padto - skb->len;
}
@@ -687,21 +685,6 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff 
*skb)
return ret;
 }
 
-static u32 esp6_get_mtu(struct xfrm_state *x, int mtu)
-{
-   struct crypto_aead *aead = x->data;
-   u32 blksize = ALIGN(crypto_aead_blocksize(aead), 4);
-   unsigned int net_adj;
-
-   if (x->props.mode != XFRM_MODE_TUNNEL)
-   net_ad

[PATCH 7/9] xfrm: fix bogus WARN_ON with ipv6

2019-07-05 Thread Steffen Klassert

From: Florian Westphal 

net/xfrm/xfrm_input.c:378:17: warning: this statement may fall through 
[-Wimplicit-fallthrough=]
skb->protocol = htons(ETH_P_IPV6);

... the fallthrough then causes a bogus WARN_ON().

Reported-by: Stephen Rothwell 
Fixes: 4c203b0454b ("xfrm: remove eth_proto value from xfrm_state_afinfo")
Signed-off-by: Florian Westphal 
Signed-off-by: Steffen Klassert 
---
 net/xfrm/xfrm_input.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index 8a00cc94c32c..6088bc2dc11e 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -376,6 +376,7 @@ static int xfrm_prepare_input(struct xfrm_state *x, struct 
sk_buff *skb)
break;
case AF_INET6:
skb->protocol = htons(ETH_P_IPV6);
+   break;
default:
WARN_ON_ONCE(1);
break;
-- 
2.17.1

[PATCH 3/9] xfrm: remove init_flags indirection from xfrm_state_afinfo

2019-07-05 Thread Steffen Klassert

From: Florian Westphal 

There is only one implementation of this function; just call it directly.

Signed-off-by: Florian Westphal 
Signed-off-by: Steffen Klassert 
---
 include/net/xfrm.h |  1 -
 net/ipv4/xfrm4_state.c |  8 
 net/xfrm/xfrm_state.c  | 17 +++--
 3 files changed, 3 insertions(+), 23 deletions(-)

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index e8f676ce27be..61214f5c3205 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -353,7 +353,6 @@ struct xfrm_state_afinfo {
const struct xfrm_type  *type_map[IPPROTO_MAX];
const struct xfrm_type_offload  *type_offload_map[IPPROTO_MAX];
 
-   int (*init_flags)(struct xfrm_state *x);
int (*tmpl_sort)(struct xfrm_tmpl **dst, struct 
xfrm_tmpl **src, int n);
int (*state_sort)(struct xfrm_state **dst, struct 
xfrm_state **src, int n);
int (*output)(struct net *net, struct sock *sk, 
struct sk_buff *skb);
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index 018448e222af..62c96da38b4e 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -15,13 +15,6 @@
 #include 
 #include 
 
-static int xfrm4_init_flags(struct xfrm_state *x)
-{
-   if (xs_net(x)->ipv4.sysctl_ip_no_pmtu_disc)
-   x->props.flags |= XFRM_STATE_NOPMTUDISC;
-   return 0;
-}
-
 int xfrm4_extract_header(struct sk_buff *skb)
 {
const struct iphdr *iph = ip_hdr(skb);
@@ -43,7 +36,6 @@ static struct xfrm_state_afinfo xfrm4_state_afinfo = {
.proto  = IPPROTO_IPIP,
.eth_proto  = htons(ETH_P_IP),
.owner  = THIS_MODULE,
-   .init_flags = xfrm4_init_flags,
.output = xfrm4_output,
.output_finish  = xfrm4_output_finish,
.extract_input  = xfrm4_extract_input,
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 336d3f6a1a51..5c13a8021d4c 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -2263,25 +2263,14 @@ int xfrm_state_mtu(struct xfrm_state *x, int mtu)
 
 int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload)
 {
-   const struct xfrm_state_afinfo *afinfo;
const struct xfrm_mode *inner_mode;
const struct xfrm_mode *outer_mode;
int family = x->props.family;
int err;
 
-   err = -EAFNOSUPPORT;
-   afinfo = xfrm_state_get_afinfo(family);
-   if (!afinfo)
-   goto error;
-
-   err = 0;
-   if (afinfo->init_flags)
-   err = afinfo->init_flags(x);
-
-   rcu_read_unlock();
-
-   if (err)
-   goto error;
+   if (family == AF_INET &&
+   xs_net(x)->ipv4.sysctl_ip_no_pmtu_disc)
+   x->props.flags |= XFRM_STATE_NOPMTUDISC;
 
err = -EPROTONOSUPPORT;
 
-- 
2.17.1

pull request (net-next): ipsec-next 2019-07-05

2019-07-05 Thread Steffen Klassert

1) A lot of work to remove indirections from the xfrm code.
   From Florian Westphal.

2) Fix a WARN_ON with ipv6 that triggered because of a
   forgotten break statement. From Florian Westphal.

3)  Remove xfrmi_init_net, it is not needed.
From Li RongQing.

Please pull or let me know if there are problems.

Thanks!

The following changes since commit 2a99283cb7c1ef1bc61770a2a20ef88693687443:

  Merge branch 'net-dsa-mv88e6xxx-support-for-mv88e6250' (2019-06-04 20:07:57 
-0700)

are available in the Git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/klassert/ipsec-next.git master

for you to fetch changes up to c7b37c769d2a5e711106a3c793140a4f46768e04:

  xfrm: remove get_mtu indirection from xfrm_type (2019-07-01 06:16:40 +0200)


Florian Westphal (8):
  xfrm: remove init_tempsel indirection from xfrm_state_afinfo
  xfrm: remove init_temprop indirection from xfrm_state_afinfo
  xfrm: remove init_flags indirection from xfrm_state_afinfo
  xfrm: remove state and template sort indirections from xfrm_state_afinfo
  xfrm: remove eth_proto value from xfrm_state_afinfo
  xfrm: remove type and offload_type map from xfrm_state_afinfo
  xfrm: fix bogus WARN_ON with ipv6
  xfrm: remove get_mtu indirection from xfrm_type

Li RongQing (1):
  xfrm: remove empty xfrmi_init_net

 include/net/xfrm.h|  53 +++---
 net/ipv4/ah4.c|   3 +-
 net/ipv4/esp4.c   |  30 +---
 net/ipv4/esp4_offload.c   |   4 +-
 net/ipv4/ipcomp.c |   3 +-
 net/ipv4/xfrm4_state.c|  45 -
 net/ipv4/xfrm4_tunnel.c   |   3 +-
 net/ipv6/ah6.c|   4 +-
 net/ipv6/esp6.c   |  23 +--
 net/ipv6/esp6_offload.c   |   4 +-
 net/ipv6/ipcomp6.c|   3 +-
 net/ipv6/mip6.c   |   6 +-
 net/ipv6/xfrm6_state.c| 137 ---
 net/xfrm/xfrm_device.c|   5 +-
 net/xfrm/xfrm_input.c |  25 +--
 net/xfrm/xfrm_interface.c |   6 -
 net/xfrm/xfrm_policy.c|   2 +-
 net/xfrm/xfrm_state.c | 437 ++
 18 files changed, 381 insertions(+), 412 deletions(-)

[PATCH 2/9] xfrm: remove init_temprop indirection from xfrm_state_afinfo

2019-07-05 Thread Steffen Klassert

From: Florian Westphal 

Same as the previous patch: just place this in the caller, no need to
have an indirection for a structure initialization.

Signed-off-by: Florian Westphal 
Signed-off-by: Steffen Klassert 
---
 include/net/xfrm.h |  4 
 net/ipv4/xfrm4_state.c | 16 
 net/ipv6/xfrm6_state.c | 16 
 net/xfrm/xfrm_state.c  | 27 ---
 4 files changed, 20 insertions(+), 43 deletions(-)

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index ba65434b5293..e8f676ce27be 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -354,10 +354,6 @@ struct xfrm_state_afinfo {
const struct xfrm_type_offload  *type_offload_map[IPPROTO_MAX];
 
int (*init_flags)(struct xfrm_state *x);
-   void(*init_temprop)(struct xfrm_state *x,
-   const struct xfrm_tmpl *tmpl,
-   const xfrm_address_t *daddr,
-   const xfrm_address_t *saddr);
int (*tmpl_sort)(struct xfrm_tmpl **dst, struct 
xfrm_tmpl **src, int n);
int (*state_sort)(struct xfrm_state **dst, struct 
xfrm_state **src, int n);
int (*output)(struct net *net, struct sock *sk, 
struct sk_buff *skb);
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index da0fd9556d57..018448e222af 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -22,21 +22,6 @@ static int xfrm4_init_flags(struct xfrm_state *x)
return 0;
 }
 
-static void
-xfrm4_init_temprop(struct xfrm_state *x, const struct xfrm_tmpl *tmpl,
-  const xfrm_address_t *daddr, const xfrm_address_t *saddr)
-{
-   x->id = tmpl->id;
-   if (x->id.daddr.a4 == 0)
-   x->id.daddr.a4 = daddr->a4;
-   x->props.saddr = tmpl->saddr;
-   if (x->props.saddr.a4 == 0)
-   x->props.saddr.a4 = saddr->a4;
-   x->props.mode = tmpl->mode;
-   x->props.reqid = tmpl->reqid;
-   x->props.family = AF_INET;
-}
-
 int xfrm4_extract_header(struct sk_buff *skb)
 {
const struct iphdr *iph = ip_hdr(skb);
@@ -59,7 +44,6 @@ static struct xfrm_state_afinfo xfrm4_state_afinfo = {
.eth_proto  = htons(ETH_P_IP),
.owner  = THIS_MODULE,
.init_flags = xfrm4_init_flags,
-   .init_temprop   = xfrm4_init_temprop,
.output = xfrm4_output,
.output_finish  = xfrm4_output_finish,
.extract_input  = xfrm4_extract_input,
diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c
index 0e19ded3e33b..aa5d2c52cc31 100644
--- a/net/ipv6/xfrm6_state.c
+++ b/net/ipv6/xfrm6_state.c
@@ -21,21 +21,6 @@
 #include 
 #include 
 
-static void
-xfrm6_init_temprop(struct xfrm_state *x, const struct xfrm_tmpl *tmpl,
-  const xfrm_address_t *daddr, const xfrm_address_t *saddr)
-{
-   x->id = tmpl->id;
-   if (ipv6_addr_any((struct in6_addr *)&x->id.daddr))
-   memcpy(&x->id.daddr, daddr, sizeof(x->sel.daddr));
-   memcpy(&x->props.saddr, &tmpl->saddr, sizeof(x->props.saddr));
-   if (ipv6_addr_any((struct in6_addr *)&x->props.saddr))
-   memcpy(&x->props.saddr, saddr, sizeof(x->props.saddr));
-   x->props.mode = tmpl->mode;
-   x->props.reqid = tmpl->reqid;
-   x->props.family = AF_INET6;
-}
-
 /* distribution counting sort function for xfrm_state and xfrm_tmpl */
 static int
 __xfrm6_sort(void **dst, void **src, int n, int (*cmp)(void *p), int maxclass)
@@ -153,7 +138,6 @@ static struct xfrm_state_afinfo xfrm6_state_afinfo = {
.proto  = IPPROTO_IPV6,
.eth_proto  = htons(ETH_P_IPV6),
.owner  = THIS_MODULE,
-   .init_temprop   = xfrm6_init_temprop,
.tmpl_sort  = __xfrm6_tmpl_sort,
.state_sort = __xfrm6_state_sort,
.output = xfrm6_output,
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 66d9009fe9b5..336d3f6a1a51 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -812,8 +812,6 @@ xfrm_init_tempstate(struct xfrm_state *x, const struct 
flowi *fl,
const xfrm_address_t *daddr, const xfrm_address_t *saddr,
unsigned short family)
 {
-   struct xfrm_state_afinfo *afinfo = xfrm_state_afinfo_get_rcu(family);
-
switch (family) {
case AF_INET:
__xfrm4_init_tempsel(&x->sel, fl);
@@ -823,13 +821,28 @@ xfrm_init_tempstate(struct xfrm_state *x, const struct 
flowi *fl,
break;
}
 
-   if (family != tmpl->encap_family)
-   afinfo = xfrm_state_afinfo_get_rcu(tmpl->encap_family);
+   x->id = tmpl->id;
 
-   if (!afinfo)
-   return;
+   switch (tmpl->encap

[PATCH 5/9] xfrm: remove eth_proto value from xfrm_state_afinfo

2019-07-05 Thread Steffen Klassert

From: Florian Westphal 

xfrm_prepare_input needs to lookup the state afinfo backend again to fetch
the address family ethernet protocol value.

There are only two address families, so a switch statement is simpler.
While at it, use u8 for family and proto and remove the owner member --
it's not used anywhere.

Signed-off-by: Florian Westphal 
Signed-off-by: Steffen Klassert 
---
 include/net/xfrm.h |  6 ++
 net/ipv4/xfrm4_state.c |  2 --
 net/ipv6/xfrm6_state.c |  2 --
 net/xfrm/xfrm_input.c  | 24 
 4 files changed, 14 insertions(+), 20 deletions(-)

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 4325cb708ed4..812994ad49ac 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -346,10 +346,8 @@ void km_state_expired(struct xfrm_state *x, int hard, u32 
portid);
 int __xfrm_state_delete(struct xfrm_state *x);
 
 struct xfrm_state_afinfo {
-   unsigned intfamily;
-   unsigned intproto;
-   __be16  eth_proto;
-   struct module   *owner;
+   u8  family;
+   u8  proto;
const struct xfrm_type  *type_map[IPPROTO_MAX];
const struct xfrm_type_offload  *type_offload_map[IPPROTO_MAX];
 
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index 62c96da38b4e..f8ed3c3bb928 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -34,8 +34,6 @@ int xfrm4_extract_header(struct sk_buff *skb)
 static struct xfrm_state_afinfo xfrm4_state_afinfo = {
.family = AF_INET,
.proto  = IPPROTO_IPIP,
-   .eth_proto  = htons(ETH_P_IP),
-   .owner  = THIS_MODULE,
.output = xfrm4_output,
.output_finish  = xfrm4_output_finish,
.extract_input  = xfrm4_extract_input,
diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c
index 1782ebb22dd3..78daadecbdef 100644
--- a/net/ipv6/xfrm6_state.c
+++ b/net/ipv6/xfrm6_state.c
@@ -40,8 +40,6 @@ int xfrm6_extract_header(struct sk_buff *skb)
 static struct xfrm_state_afinfo xfrm6_state_afinfo = {
.family = AF_INET6,
.proto  = IPPROTO_IPV6,
-   .eth_proto  = htons(ETH_P_IPV6),
-   .owner  = THIS_MODULE,
.output = xfrm6_output,
.output_finish  = xfrm6_output_finish,
.extract_input  = xfrm6_extract_input,
diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index 314973aaa414..8a00cc94c32c 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -359,28 +359,28 @@ static int xfrm_prepare_input(struct xfrm_state *x, 
struct sk_buff *skb)
afinfo = xfrm_state_afinfo_get_rcu(x->outer_mode.family);
if (likely(afinfo))
err = afinfo->extract_input(x, skb);
+   rcu_read_unlock();
 
-   if (err) {
-   rcu_read_unlock();
+   if (err)
return err;
-   }
 
if (x->sel.family == AF_UNSPEC) {
inner_mode = xfrm_ip2inner_mode(x, 
XFRM_MODE_SKB_CB(skb)->protocol);
-   if (!inner_mode) {
-   rcu_read_unlock();
+   if (!inner_mode)
return -EAFNOSUPPORT;
-   }
}
 
-   afinfo = xfrm_state_afinfo_get_rcu(inner_mode->family);
-   if (unlikely(!afinfo)) {
-   rcu_read_unlock();
-   return -EAFNOSUPPORT;
+   switch (inner_mode->family) {
+   case AF_INET:
+   skb->protocol = htons(ETH_P_IP);
+   break;
+   case AF_INET6:
+   skb->protocol = htons(ETH_P_IPV6);
+   default:
+   WARN_ON_ONCE(1);
+   break;
}
 
-   skb->protocol = afinfo->eth_proto;
-   rcu_read_unlock();
return xfrm_inner_mode_encap_remove(x, inner_mode, skb);
 }
 
-- 
2.17.1

[PATCH 4/9] xfrm: remove state and template sort indirections from xfrm_state_afinfo

2019-07-05 Thread Steffen Klassert

From: Florian Westphal 

No module dependency, placing this in xfrm_state.c avoids need for
an indirection.

This also removes the state spinlock -- I don't see why we would need
to hold it during sorting.

This in turn allows to remove the 'net' argument passed to
xfrm_tmpl_sort.  Last, remove the EXPORT_SYMBOL, there are no modular
callers.

For the CONFIG_IPV6=m case, vmlinux size increase is about 300 byte.

Signed-off-by: Florian Westphal 
Signed-off-by: Steffen Klassert 
---
 include/net/xfrm.h |  18 +++---
 net/ipv6/xfrm6_state.c |  98 --
 net/xfrm/xfrm_policy.c |   2 +-
 net/xfrm/xfrm_state.c  | 132 -
 4 files changed, 113 insertions(+), 137 deletions(-)

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 61214f5c3205..4325cb708ed4 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -353,8 +353,6 @@ struct xfrm_state_afinfo {
const struct xfrm_type  *type_map[IPPROTO_MAX];
const struct xfrm_type_offload  *type_offload_map[IPPROTO_MAX];
 
-   int (*tmpl_sort)(struct xfrm_tmpl **dst, struct 
xfrm_tmpl **src, int n);
-   int (*state_sort)(struct xfrm_state **dst, struct 
xfrm_state **src, int n);
int (*output)(struct net *net, struct sock *sk, 
struct sk_buff *skb);
int (*output_finish)(struct sock *sk, struct 
sk_buff *skb);
int (*extract_input)(struct xfrm_state *x,
@@ -1501,21 +1499,19 @@ struct xfrm_state *xfrm_state_lookup_byaddr(struct net 
*net, u32 mark,
u8 proto,
unsigned short family);
 #ifdef CONFIG_XFRM_SUB_POLICY
-int xfrm_tmpl_sort(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n,
-  unsigned short family, struct net *net);
-int xfrm_state_sort(struct xfrm_state **dst, struct xfrm_state **src, int n,
+void xfrm_tmpl_sort(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n,
unsigned short family);
+void xfrm_state_sort(struct xfrm_state **dst, struct xfrm_state **src, int n,
+unsigned short family);
 #else
-static inline int xfrm_tmpl_sort(struct xfrm_tmpl **dst, struct xfrm_tmpl 
**src,
-int n, unsigned short family, struct net *net)
+static inline void xfrm_tmpl_sort(struct xfrm_tmpl **d, struct xfrm_tmpl **s,
+ int n, unsigned short family)
 {
-   return -ENOSYS;
 }
 
-static inline int xfrm_state_sort(struct xfrm_state **dst, struct xfrm_state 
**src,
- int n, unsigned short family)
+static inline void xfrm_state_sort(struct xfrm_state **d, struct xfrm_state 
**s,
+  int n, unsigned short family)
 {
-   return -ENOSYS;
 }
 #endif
 
diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c
index aa5d2c52cc31..1782ebb22dd3 100644
--- a/net/ipv6/xfrm6_state.c
+++ b/net/ipv6/xfrm6_state.c
@@ -21,102 +21,6 @@
 #include 
 #include 
 
-/* distribution counting sort function for xfrm_state and xfrm_tmpl */
-static int
-__xfrm6_sort(void **dst, void **src, int n, int (*cmp)(void *p), int maxclass)
-{
-   int count[XFRM_MAX_DEPTH] = { };
-   int class[XFRM_MAX_DEPTH];
-   int i;
-
-   for (i = 0; i < n; i++) {
-   int c;
-   class[i] = c = cmp(src[i]);
-   count[c]++;
-   }
-
-   for (i = 2; i < maxclass; i++)
-   count[i] += count[i - 1];
-
-   for (i = 0; i < n; i++) {
-   dst[count[class[i] - 1]++] = src[i];
-   src[i] = NULL;
-   }
-
-   return 0;
-}
-
-/*
- * Rule for xfrm_state:
- *
- * rule 1: select IPsec transport except AH
- * rule 2: select MIPv6 RO or inbound trigger
- * rule 3: select IPsec transport AH
- * rule 4: select IPsec tunnel
- * rule 5: others
- */
-static int __xfrm6_state_sort_cmp(void *p)
-{
-   struct xfrm_state *v = p;
-
-   switch (v->props.mode) {
-   case XFRM_MODE_TRANSPORT:
-   if (v->id.proto != IPPROTO_AH)
-   return 1;
-   else
-   return 3;
-#if IS_ENABLED(CONFIG_IPV6_MIP6)
-   case XFRM_MODE_ROUTEOPTIMIZATION:
-   case XFRM_MODE_IN_TRIGGER:
-   return 2;
-#endif
-   case XFRM_MODE_TUNNEL:
-   case XFRM_MODE_BEET:
-   return 4;
-   }
-   return 5;
-}
-
-static int
-__xfrm6_state_sort(struct xfrm_state **dst, struct xfrm_state **src, int n)
-{
-   return __xfrm6_sort((void **)dst, (void **)src, n,
-   __xfrm6_state_sort_cmp, 6);
-}
-
-/*
- * Rule for xfrm_tmpl:
- *
- * rule 1: select IPsec transport
- * rule 2: select MIPv6 RO or inbound trigger
- * rule 3: select IPsec tunnel
- * rule 4: others
- */
-static int __xfrm6_tmpl_sort_cmp(void *p)
-{
-   struct xfrm_tmpl *v = p;
-

[PATCH 8/9] xfrm: remove empty xfrmi_init_net

2019-07-05 Thread Steffen Klassert

From: Li RongQing 

Pointer members of an object with static storage duration, if not
explicitly initialized, will be initialized to a NULL pointer. The
net namespace API checks if this pointer is not NULL before using it,
so it is safe to remove the function.

Signed-off-by: Li RongQing 
Signed-off-by: Steffen Klassert 
---
 net/xfrm/xfrm_interface.c | 6 --
 1 file changed, 6 deletions(-)

diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c
index ad3a2555c517..f8eb9e342173 100644
--- a/net/xfrm/xfrm_interface.c
+++ b/net/xfrm/xfrm_interface.c
@@ -793,11 +793,6 @@ static void __net_exit xfrmi_destroy_interfaces(struct 
xfrmi_net *xfrmn)
unregister_netdevice_many(&list);
 }
 
-static int __net_init xfrmi_init_net(struct net *net)
-{
-   return 0;
-}
-
 static void __net_exit xfrmi_exit_net(struct net *net)
 {
struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id);
@@ -808,7 +803,6 @@ static void __net_exit xfrmi_exit_net(struct net *net)
 }
 
 static struct pernet_operations xfrmi_net_ops = {
-   .init = xfrmi_init_net,
.exit = xfrmi_exit_net,
.id   = &xfrmi_net_id,
.size = sizeof(struct xfrmi_net),
-- 
2.17.1

[PATCH 1/9] xfrm: remove init_tempsel indirection from xfrm_state_afinfo

2019-07-05 Thread Steffen Klassert

From: Florian Westphal 

Simple initialization, handle it in the caller.

Signed-off-by: Florian Westphal 
Signed-off-by: Steffen Klassert 
---
 include/net/xfrm.h |  2 --
 net/ipv4/xfrm4_state.c | 19 --
 net/ipv6/xfrm6_state.c | 21 
 net/xfrm/xfrm_state.c  | 56 --
 4 files changed, 49 insertions(+), 49 deletions(-)

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index a2907873ed56..ba65434b5293 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -354,8 +354,6 @@ struct xfrm_state_afinfo {
const struct xfrm_type_offload  *type_offload_map[IPPROTO_MAX];
 
int (*init_flags)(struct xfrm_state *x);
-   void(*init_tempsel)(struct xfrm_selector *sel,
-   const struct flowi *fl);
void(*init_temprop)(struct xfrm_state *x,
const struct xfrm_tmpl *tmpl,
const xfrm_address_t *daddr,
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index 80c40b4981bb..da0fd9556d57 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -22,24 +22,6 @@ static int xfrm4_init_flags(struct xfrm_state *x)
return 0;
 }
 
-static void
-__xfrm4_init_tempsel(struct xfrm_selector *sel, const struct flowi *fl)
-{
-   const struct flowi4 *fl4 = &fl->u.ip4;
-
-   sel->daddr.a4 = fl4->daddr;
-   sel->saddr.a4 = fl4->saddr;
-   sel->dport = xfrm_flowi_dport(fl, &fl4->uli);
-   sel->dport_mask = htons(0x);
-   sel->sport = xfrm_flowi_sport(fl, &fl4->uli);
-   sel->sport_mask = htons(0x);
-   sel->family = AF_INET;
-   sel->prefixlen_d = 32;
-   sel->prefixlen_s = 32;
-   sel->proto = fl4->flowi4_proto;
-   sel->ifindex = fl4->flowi4_oif;
-}
-
 static void
 xfrm4_init_temprop(struct xfrm_state *x, const struct xfrm_tmpl *tmpl,
   const xfrm_address_t *daddr, const xfrm_address_t *saddr)
@@ -77,7 +59,6 @@ static struct xfrm_state_afinfo xfrm4_state_afinfo = {
.eth_proto  = htons(ETH_P_IP),
.owner  = THIS_MODULE,
.init_flags = xfrm4_init_flags,
-   .init_tempsel   = __xfrm4_init_tempsel,
.init_temprop   = xfrm4_init_temprop,
.output = xfrm4_output,
.output_finish  = xfrm4_output_finish,
diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c
index 5bdca3d5d6b7..0e19ded3e33b 100644
--- a/net/ipv6/xfrm6_state.c
+++ b/net/ipv6/xfrm6_state.c
@@ -21,26 +21,6 @@
 #include 
 #include 
 
-static void
-__xfrm6_init_tempsel(struct xfrm_selector *sel, const struct flowi *fl)
-{
-   const struct flowi6 *fl6 = &fl->u.ip6;
-
-   /* Initialize temporary selector matching only
-* to current session. */
-   *(struct in6_addr *)&sel->daddr = fl6->daddr;
-   *(struct in6_addr *)&sel->saddr = fl6->saddr;
-   sel->dport = xfrm_flowi_dport(fl, &fl6->uli);
-   sel->dport_mask = htons(0x);
-   sel->sport = xfrm_flowi_sport(fl, &fl6->uli);
-   sel->sport_mask = htons(0x);
-   sel->family = AF_INET6;
-   sel->prefixlen_d = 128;
-   sel->prefixlen_s = 128;
-   sel->proto = fl6->flowi6_proto;
-   sel->ifindex = fl6->flowi6_oif;
-}
-
 static void
 xfrm6_init_temprop(struct xfrm_state *x, const struct xfrm_tmpl *tmpl,
   const xfrm_address_t *daddr, const xfrm_address_t *saddr)
@@ -173,7 +153,6 @@ static struct xfrm_state_afinfo xfrm6_state_afinfo = {
.proto  = IPPROTO_IPV6,
.eth_proto  = htons(ETH_P_IPV6),
.owner  = THIS_MODULE,
-   .init_tempsel   = __xfrm6_init_tempsel,
.init_temprop   = xfrm6_init_temprop,
.tmpl_sort  = __xfrm6_tmpl_sort,
.state_sort = __xfrm6_state_sort,
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 50621d982970..66d9009fe9b5 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -769,6 +769,43 @@ void xfrm_sad_getinfo(struct net *net, struct 
xfrmk_sadinfo *si)
 }
 EXPORT_SYMBOL(xfrm_sad_getinfo);
 
+static void
+__xfrm4_init_tempsel(struct xfrm_selector *sel, const struct flowi *fl)
+{
+   const struct flowi4 *fl4 = &fl->u.ip4;
+
+   sel->daddr.a4 = fl4->daddr;
+   sel->saddr.a4 = fl4->saddr;
+   sel->dport = xfrm_flowi_dport(fl, &fl4->uli);
+   sel->dport_mask = htons(0x);
+   sel->sport = xfrm_flowi_sport(fl, &fl4->uli);
+   sel->sport_mask = htons(0x);
+   sel->family = AF_INET;
+   sel->prefixlen_d = 32;
+   sel->prefixlen_s = 32;
+   sel->proto = fl4->flowi4_proto;
+   sel->ifindex = fl4->flowi4_oif;
+}
+
+static void
+__xfrm6_init_tempsel(struct xfrm_selector *sel, const struct flowi *fl)
+{
+   const str

[PATCH v3 1/2] Documentation: net: dsa: Describe DSA switch configuration

2019-07-05 Thread Benedikt Spranger

Document DSA tagged and VLAN based switch configuration by showcases.

Signed-off-by: Benedikt Spranger 
Reviewed-by: Andrew Lunn 
---
 .../networking/dsa/configuration.rst  | 292 ++
 Documentation/networking/dsa/index.rst|   1 +
 2 files changed, 293 insertions(+)
 create mode 100644 Documentation/networking/dsa/configuration.rst

diff --git a/Documentation/networking/dsa/configuration.rst 
b/Documentation/networking/dsa/configuration.rst
new file mode 100644
index ..3260ee81cb6e
--- /dev/null
+++ b/Documentation/networking/dsa/configuration.rst
@@ -0,0 +1,292 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=======================================
+DSA switch configuration from userspace
+=======================================
+
+The DSA switch configuration is not integrated into the main userspace
+network configuration suites by now and has to be performed manually.
+
+.. _dsa-config-showcases:
+
+Configuration showcases
+---
+
+To configure a DSA switch a couple of commands need to be executed. In this
+documentation some common configuration scenarios are handled as showcases:
+
+*single port*
+  Every switch port acts as a different configurable Ethernet port
+
+*bridge*
+  Every switch port is part of one configurable Ethernet bridge
+
+*gateway*
+  Every switch port except one upstream port is part of a configurable
+  Ethernet bridge.
+  The upstream port acts as a different configurable Ethernet port.
+
+All configurations are performed with tools from iproute2, which is available
+at https://www.kernel.org/pub/linux/utils/net/iproute2/
+
+Through DSA every port of a switch is handled like a normal linux Ethernet
+interface. The CPU port is the switch port connected to an Ethernet MAC chip.
+The corresponding linux Ethernet interface is called the master interface.
+All other corresponding linux interfaces are called slave interfaces.
+
+The slave interfaces depend on the master interface. They can only be brought
+up when the master interface is up.
+
+In this documentation the following Ethernet interfaces are used:
+
+*eth0*
+  the master interface
+
+*lan1*
+  a slave interface
+
+*lan2*
+  another slave interface
+
+*lan3*
+  a third slave interface
+
+*wan*
+  A slave interface dedicated for upstream traffic
+
+Further Ethernet interfaces can be configured similarly.
+The configured IPs and networks are:
+
+*single port*
+  * lan1: 192.0.2.1/30 (192.0.2.0 - 192.0.2.3)
+  * lan2: 192.0.2.5/30 (192.0.2.4 - 192.0.2.7)
+  * lan3: 192.0.2.9/30 (192.0.2.8 - 192.0.2.11)
+
+*bridge*
+  * br0: 192.0.2.129/25 (192.0.2.128 - 192.0.2.255)
+
+*gateway*
+  * br0: 192.0.2.129/25 (192.0.2.128 - 192.0.2.255)
+  * wan: 192.0.2.1/30 (192.0.2.0 - 192.0.2.3)
+
+.. _dsa-tagged-configuration:
+
+Configuration with tagging support
+--
+
+The tagging based configuration is desired and supported by the majority of
+DSA switches. These switches are capable of tagging incoming and outgoing traffic
+without using a VLAN based configuration.
+
+single port
+~~~
+
+.. code-block:: sh
+
+  # configure each interface
+  ip addr add 192.0.2.1/30 dev lan1
+  ip addr add 192.0.2.5/30 dev lan2
+  ip addr add 192.0.2.9/30 dev lan3
+
+  # The master interface needs to be brought up before the slave ports.
+  ip link set eth0 up
+
+  # bring up the slave interfaces
+  ip link set lan1 up
+  ip link set lan2 up
+  ip link set lan3 up
+
+bridge
+~~
+
+.. code-block:: sh
+
+  # The master interface needs to be brought up before the slave ports.
+  ip link set eth0 up
+
+  # bring up the slave interfaces
+  ip link set lan1 up
+  ip link set lan2 up
+  ip link set lan3 up
+
+  # create bridge
+  ip link add name br0 type bridge
+
+  # add ports to bridge
+  ip link set dev lan1 master br0
+  ip link set dev lan2 master br0
+  ip link set dev lan3 master br0
+
+  # configure the bridge
+  ip addr add 192.0.2.129/25 dev br0
+
+  # bring up the bridge
+  ip link set dev br0 up
+
+gateway
+~~~
+
+.. code-block:: sh
+
+  # The master interface needs to be brought up before the slave ports.
+  ip link set eth0 up
+
+  # bring up the slave interfaces
+  ip link set wan up
+  ip link set lan1 up
+  ip link set lan2 up
+
+  # configure the upstream port
+  ip addr add 192.0.2.1/30 dev wan
+
+  # create bridge
+  ip link add name br0 type bridge
+
+  # add ports to bridge
+  ip link set dev lan1 master br0
+  ip link set dev lan2 master br0
+
+  # configure the bridge
+  ip addr add 192.0.2.129/25 dev br0
+
+  # bring up the bridge
+  ip link set dev br0 up
+
+.. _dsa-vlan-configuration:
+
+Configuration without tagging support
+-
+
+A minority of switches are not capable of using a tagging protocol
+(DSA_TAG_PROTO_NONE). These switches can be configured by a VLAN based
+configuration.
+
+single port
+~~~
+The configuration can only be set up via VLAN tagging and bridge setup.
+
+.. code-block:: sh
+
+

[PATCH v3 0/2] Document the configuration of b53

2019-07-05 Thread Benedikt Spranger

this is the third round to document the configuration of a b53 supported
switch.

Thanks for the comments.

Regards
Bene Spranger

v3..v2:
- fix a typo
- improve b53 configuration in DSA_TAG_PROTO_NONE showcase.
- grade up from RFC to patch for mainline inclusion.

v1..v2:
- split out generic parts of the configuration.
- target comments by Andrew Lunn and Florian Fainelli.
- make changes visible to build system

Benedikt Spranger (2):
  Documentation: net: dsa: Describe DSA switch configuration
  Documentation: net: dsa: b53: Describe b53 configuration

 Documentation/networking/dsa/b53.rst  | 183 +++
 .../networking/dsa/configuration.rst  | 292 ++
 Documentation/networking/dsa/index.rst|   2 +
 3 files changed, 477 insertions(+)
 create mode 100644 Documentation/networking/dsa/b53.rst
 create mode 100644 Documentation/networking/dsa/configuration.rst

-- 
2.20.1

[PATCH v3 2/2] Documentation: net: dsa: b53: Describe b53 configuration

2019-07-05 Thread Benedikt Spranger

Document the different needs of documentation for the b53 driver.

Signed-off-by: Benedikt Spranger 
---
 Documentation/networking/dsa/b53.rst   | 183 +
 Documentation/networking/dsa/index.rst |   1 +
 2 files changed, 184 insertions(+)
 create mode 100644 Documentation/networking/dsa/b53.rst

diff --git a/Documentation/networking/dsa/b53.rst 
b/Documentation/networking/dsa/b53.rst
new file mode 100644
index ..b41637cdb82b
--- /dev/null
+++ b/Documentation/networking/dsa/b53.rst
@@ -0,0 +1,183 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==========================================
+Broadcom RoboSwitch Ethernet switch driver
+==========================================
+
+The Broadcom RoboSwitch Ethernet switch family is used in quite a range of
+xDSL routers, cable modems and other multimedia devices.
+
+The actual implementation supports the devices BCM5325E, BCM5365, BCM539x,
+BCM53115 and BCM53125 as well as BCM63XX.
+
+Implementation details
+==
+
+The driver is located in ``drivers/net/dsa/b53/`` and is implemented as a
+DSA driver; see ``Documentation/networking/dsa/dsa.rst`` for details on the
+subsystem and what it provides.
+
+The switch is, if possible, configured to enable a Broadcom specific 4-bytes
+switch tag which gets inserted by the switch for every packet forwarded to the
+CPU interface, conversely, the CPU network interface should insert a similar
+tag for packets entering the CPU port. The tag format is described in
+``net/dsa/tag_brcm.c``.
+
+The configuration of the device depends on whether or not tagging is
+supported.
+
+The interface names and example network configuration are used according to
+the configuration described in the :ref:`dsa-config-showcases`.
+
+Configuration with tagging support
+--
+
+The tagging based configuration is desired. It is not specific to the b53
+DSA driver and will work like all DSA drivers which support tagging.
+
+See :ref:`dsa-tagged-configuration`.
+
+Configuration without tagging support
+-
+
+Older models (5325, 5365) support a different tag format that is not supported
+yet. 539x and 531x5 require managed mode and some special handling, which is
+also not yet supported. The tagging support is disabled in these cases and the
+switch needs a different configuration.
+
+The configuration slightly differs from the :ref:`dsa-vlan-configuration`.
+
+The b53 tags the CPU port in all VLANs, since otherwise any PVID untagged
+VLAN programming would basically change the CPU port's default PVID and make
+it untagged, undesirable.
+
+In contrast to the configuration described in :ref:`dsa-vlan-configuration`
+the default VLAN 1 has to be removed from the slave interface configuration in
+single port and gateway configuration, while there is no need to add an extra
+VLAN configuration in the bridge showcase.
+
+single port
+~~~
+The configuration can only be set up via VLAN tagging and bridge setup.
+By default packets are tagged with vid 1:
+
+.. code-block:: sh
+
+  # tag traffic on CPU port
+  ip link add link eth0 name eth0.1 type vlan id 1
+  ip link add link eth0 name eth0.2 type vlan id 2
+  ip link add link eth0 name eth0.3 type vlan id 3
+
+  # The master interface needs to be brought up before the slave ports.
+  ip link set eth0 up
+  ip link set eth0.1 up
+  ip link set eth0.2 up
+  ip link set eth0.3 up
+
+  # bring up the slave interfaces
+  ip link set wan up
+  ip link set lan1 up
+  ip link set lan2 up
+
+  # create bridge
+  ip link add name br0 type bridge
+
+  # activate VLAN filtering
+  ip link set dev br0 type bridge vlan_filtering 1
+
+  # add ports to bridges
+  ip link set dev wan master br0
+  ip link set dev lan1 master br0
+  ip link set dev lan2 master br0
+
+  # tag traffic on ports
+  bridge vlan add dev lan1 vid 2 pvid untagged
+  bridge vlan del dev lan1 vid 1
+  bridge vlan add dev lan2 vid 3 pvid untagged
+  bridge vlan del dev lan2 vid 1
+
+  # configure the VLANs
+  ip addr add 192.0.2.1/30 dev eth0.1
+  ip addr add 192.0.2.5/30 dev eth0.2
+  ip addr add 192.0.2.9/30 dev eth0.3
+
+  # bring up the bridge devices
+  ip link set br0 up
+
+
+bridge
+~~
+
+.. code-block:: sh
+
+  # tag traffic on CPU port
+  ip link add link eth0 name eth0.1 type vlan id 1
+
+  # The master interface needs to be brought up before the slave ports.
+  ip link set eth0 up
+  ip link set eth0.1 up
+
+  # bring up the slave interfaces
+  ip link set wan up
+  ip link set lan1 up
+  ip link set lan2 up
+
+  # create bridge
+  ip link add name br0 type bridge
+
+  # activate VLAN filtering
+  ip link set dev br0 type bridge vlan_filtering 1
+
+  # add ports to bridge
+  ip link set dev wan master br0
+  ip link set dev lan1 master br0
+  ip link set dev lan2 master br0
+  ip link set eth0.1 master br0
+
+  # configure the bridge
+  ip addr add 192.0.2.129/25 dev br0
+
+  # bring up the bridge
+  ip link set dev br0 up

[net-next, PATCH, v3] net: netsec: Sync dma for device on buffer allocation

2019-07-05 Thread Ilias Apalodimas

Quoting Arnd,
We have to do a sync_single_for_device /somewhere/ before the
buffer is given to the device. On a non-cache-coherent machine with
a write-back cache, there may be dirty cache lines that get written back
after the device DMA's data into it (e.g. from a previous memset
from before the buffer got freed), so you absolutely need to flush any
dirty cache lines on it first.

Since the coherency is configurable in this device make sure we cover
all configurations by explicitly syncing the allocated buffer for the
device before refilling its descriptors

Signed-off-by: Ilias Apalodimas 
---
Changes since v2:
- Only sync for the portion of the packet owned by the NIC as suggested by 
  Jesper

 drivers/net/ethernet/socionext/netsec.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/ethernet/socionext/netsec.c 
b/drivers/net/ethernet/socionext/netsec.c
index 5544a722543f..6b954ad88842 100644
--- a/drivers/net/ethernet/socionext/netsec.c
+++ b/drivers/net/ethernet/socionext/netsec.c
@@ -727,6 +727,7 @@ static void *netsec_alloc_rx_data(struct netsec_priv *priv,
 {
 
struct netsec_desc_ring *dring = &priv->desc_ring[NETSEC_RING_RX];
+   enum dma_data_direction dma_dir;
struct page *page;
 
page = page_pool_dev_alloc_pages(dring->page_pool);
@@ -742,6 +743,8 @@ static void *netsec_alloc_rx_data(struct netsec_priv *priv,
 * cases and reserve enough space for headroom + skb_shared_info
 */
*desc_len = PAGE_SIZE - NETSEC_RX_BUF_NON_DATA;
+   dma_dir = page_pool_get_dma_dir(dring->page_pool);
+   dma_sync_single_for_device(priv->dev, *dma_handle, *desc_len, dma_dir);
 
return page_address(page);
 }
-- 
2.20.1

[PATCH] selftests: txring_overwrite: fix incorrect test of mmap() return value

2019-07-05 Thread Frank de Brabander

If mmap() fails it returns MAP_FAILED, which is defined as ((void *) -1).
The current if-statement incorrectly tests if *ring is NULL.

Signed-off-by: Frank de Brabander 
---
 tools/testing/selftests/net/txring_overwrite.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/net/txring_overwrite.c 
b/tools/testing/selftests/net/txring_overwrite.c
index fd8b1c6..7d9ea03 100644
--- a/tools/testing/selftests/net/txring_overwrite.c
+++ b/tools/testing/selftests/net/txring_overwrite.c
@@ -113,7 +113,7 @@ static int setup_tx(char **ring)
 
*ring = mmap(0, req.tp_block_size * req.tp_block_nr,
 PROT_READ | PROT_WRITE, MAP_SHARED, fdt, 0);
-   if (!*ring)
+   if (*ring == MAP_FAILED)
error(1, errno, "mmap");
 
return fdt;
-- 
2.7.4

Re: [net-next, PATCH, v3] net: netsec: Sync dma for device on buffer allocation

2019-07-05 Thread Jesper Dangaard Brouer

On Fri,  5 Jul 2019 13:47:47 +0300
Ilias Apalodimas  wrote:

> Quoting Arnd,
> We have to do a sync_single_for_device /somewhere/ before the
> buffer is given to the device. On a non-cache-coherent machine with
> a write-back cache, there may be dirty cache lines that get written back
> after the device DMA's data into it (e.g. from a previous memset
> from before the buffer got freed), so you absolutely need to flush any
> dirty cache lines on it first.
> 
> Since the coherency is configurable in this device make sure we cover
> all configurations by explicitly syncing the allocated buffer for the
> device before refilling its descriptors
> 
> Signed-off-by: Ilias Apalodimas 
> ---
> Changes since v2:
> - Only sync for the portion of the packet owned by the NIC as suggested by 
>   Jesper

Acked-by: Jesper Dangaard Brouer 

Some general comments below.

>  drivers/net/ethernet/socionext/netsec.c | 3 +++
>  1 file changed, 3 insertions(+)
> 
> diff --git a/drivers/net/ethernet/socionext/netsec.c 
> b/drivers/net/ethernet/socionext/netsec.c
> index 5544a722543f..6b954ad88842 100644
> --- a/drivers/net/ethernet/socionext/netsec.c
> +++ b/drivers/net/ethernet/socionext/netsec.c
> @@ -727,6 +727,7 @@ static void *netsec_alloc_rx_data(struct netsec_priv 
> *priv,
>  {
>  
>   struct netsec_desc_ring *dring = &priv->desc_ring[NETSEC_RING_RX];
> + enum dma_data_direction dma_dir;
>   struct page *page;
>  
>   page = page_pool_dev_alloc_pages(dring->page_pool);
> @@ -742,6 +743,8 @@ static void *netsec_alloc_rx_data(struct netsec_priv 
> *priv,
>* cases and reserve enough space for headroom + skb_shared_info
>*/
>   *desc_len = PAGE_SIZE - NETSEC_RX_BUF_NON_DATA;
> + dma_dir = page_pool_get_dma_dir(dring->page_pool);
> + dma_sync_single_for_device(priv->dev, *dma_handle, *desc_len, dma_dir);

Following the API this seems to turn into a noop if dev_is_dma_coherent().

Thus, I don't think it is worth optimizing further, as I suggested
earlier, with only sync of previous packet length.   This sync of the
"full" possible payload-data area (without headroom) is likely the best
and simplest option.  I don't think we should extend and complicate
the API for optimizing for non-coherent DMA hardware.

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

[PATCHv2] tools bpftool: Fix json dump crash on powerpc

2019-07-05 Thread Jiri Olsa

On Thu, Jul 04, 2019 at 01:42:10PM -0700, Jakub Kicinski wrote:
> On Thu,  4 Jul 2019 10:58:56 +0200, Jiri Olsa wrote:
> > Michael reported crash with by bpf program in json mode on powerpc:
> > 
> >   # bpftool prog -p dump jited id 14
> >   [{
> > "name": "0xda9aa760",
> > "insns": [{
> > "pc": "0x0",
> > "operation": "nop",
> > "operands": [null
> > ]
> > },{
> > "pc": "0x4",
> > "operation": "nop",
> > "operands": [null
> > ]
> > },{
> > "pc": "0x8",
> > "operation": "mflr",
> >   Segmentation fault (core dumped)
> > 
> > The code is assuming char pointers in format, which is not always
> > true at least for powerpc. Fixing this by dumping the whole string
> > into buffer based on its format.
> > 
> > Please note that libopcodes code does not check return values from
> > fprintf callback, so there's no point to return error in case of
> > allocation failure.
> 
> Well, it doesn't check it today, it may perhaps do it in the future?
> Let's flip the question - since it doesn't check it today, why not
> propagate the error? :)  We should stay close to how fprintf would
> behave, IMHO.
> 
> Fixes: 107f041212c1 ("tools: bpftool: add JSON output for `bpftool prog dump 
> jited *` command")

ok fair enough, v2 attached

thanks,
jirka


---
Michael reported a crash when dumping a bpf program in json mode on powerpc:

  # bpftool prog -p dump jited id 14
  [{
"name": "0xda9aa760",
"insns": [{
"pc": "0x0",
"operation": "nop",
"operands": [null
]
},{
"pc": "0x4",
"operation": "nop",
"operands": [null
]
},{
"pc": "0x8",
"operation": "mflr",
  Segmentation fault (core dumped)

The code is assuming char pointers in format, which is not always
true at least for powerpc. Fixing this by dumping the whole string
into buffer based on its format.

Please note that libopcodes code does not check return values from the
fprintf callback, but as per Jakub's suggestion we return -1 on allocation
failure, as a best effort to propagate the error.

Reported-by: Michael Petlan 
Signed-off-by: Jiri Olsa 
---
 tools/bpf/bpftool/jit_disasm.c | 11 +++
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/tools/bpf/bpftool/jit_disasm.c b/tools/bpf/bpftool/jit_disasm.c
index 3ef3093560ba..bfed711258ce 100644
--- a/tools/bpf/bpftool/jit_disasm.c
+++ b/tools/bpf/bpftool/jit_disasm.c
@@ -11,6 +11,8 @@
  * Licensed under the GNU General Public License, version 2.0 (GPLv2)
  */
 
+#define _GNU_SOURCE
+#include 
 #include 
 #include 
 #include 
@@ -44,11 +46,13 @@ static int fprintf_json(void *out, const char *fmt, ...)
char *s;
 
va_start(ap, fmt);
+   if (vasprintf(&s, fmt, ap) < 0)
+   return -1;
+   va_end(ap);
+
if (!oper_count) {
int i;
 
-   s = va_arg(ap, char *);
-
/* Strip trailing spaces */
i = strlen(s) - 1;
while (s[i] == ' ')
@@ -61,11 +65,10 @@ static int fprintf_json(void *out, const char *fmt, ...)
} else if (!strcmp(fmt, ",")) {
   /* Skip */
} else {
-   s = va_arg(ap, char *);
jsonw_string(json_wtr, s);
oper_count++;
}
-   va_end(ap);
+   free(s);
return 0;
 }
 
-- 
2.21.0

Re: i.mx6ul with DSA in multi chip addressing mode - no MDIO access

2019-07-05 Thread Benjamin Beckmeyer

>> &mdio0 {
>> interrupt-parent = <&gpio1>;
>> interrupts = <3 IRQ_TYPE_LEVEL_HIGH>;
>>
>> switch0: switch0@2 {
>> compatible = "marvell,mv88e6190";
>> reg = <2>;
>> pinctrl-0 = <&pinctrl_gpios>;
>> reset-gpios = <&gpio4 16 GPIO_ACTIVE_LOW>;
>> dsa,member = <0 0>;
> This is wrong. The interrupt is a switch property, not an MDIO bus
> property. So it belongs inside the switch node.
>
> Andrew

Hi Andrew,

in the documentation for Marvell DSA the interrupt properties are in 
the MDIO part. Maybe the documentation for device tree is wrong or 
unclear?

I switched to kernel 5.1.16 to take advantage of your new code.
For now I have deleted all interrupt properties from my device tree,
and if I understand you correctly the access should now be triggered every
100ms, but the trace shows accesses about 175 times a second.

Here is a snip from my trace without IRQ
2188000.etherne-223   [000]    109.932406: mdio_access: 2188000.ethernet-1 
read  phy:0x02 reg:0x01 val:0x40a8
 2188000.etherne-223   [000]    109.932501: mdio_access: 2188000.ethernet-1 
read  phy:0x02 reg:0x00 val:0x1b64
 2188000.etherne-223   [000]    109.933113: mdio_access: 2188000.ethernet-1 
write phy:0x02 reg:0x00 val:0x9b60
 2188000.etherne-223   [000]    109.933261: mdio_access: 2188000.ethernet-1 
read  phy:0x02 reg:0x00 val:0x1b60
 2188000.etherne-223   [000]    109.933359: mdio_access: 2188000.ethernet-1 
read  phy:0x02 reg:0x01 val:0xc801
 2188000.etherne-223   [000]    110.041683: mdio_access: 2188000.ethernet-1 
read  phy:0x02 reg:0x00 val:0x1b60
 2188000.etherne-223   [000]    110.041817: mdio_access: 2188000.ethernet-1 
write phy:0x02 reg:0x00 val:0x9b60
 2188000.etherne-223   [000]    110.041919: mdio_access: 2188000.ethernet-1 
read  phy:0x02 reg:0x00 val:0x1b60
 2188000.etherne-223   [000]    110.042025: mdio_access: 2188000.ethernet-1 
read  phy:0x02 reg:0x01 val:0xc801

Am I doing it right with the tracing points? I run just

echo 1 > /sys/kernel/debug/tracing/events/mdio/mdio_access/enable
cat /sys/kernel/debug/tracing/trace

Here is another device tree I tried, but with this one I get accesses
on the bus about every 50 microseconds!

--snip
&mdio0 {
switch0: switch0@2 {
compatible = "marvell,mv88e6190";
reg = <2>;
pinctrl-0 = <&pinctrl_switch_irq>;
interrupt-parent = <&gpio1>;
interrupts = <3 IRQ_TYPE_LEVEL_LOW>;
interrupt-controller;
#interrupt-cells = <2>;
dsa,member = <0 0>;

ports {
#address-cells = <1>;
#size-cells = <0>;
--snip

Here is a snip from my trace with IRQ.
irq/54-2188000.-223   [000]    958.940744: mdio_access: 2188000.ethernet-1 
read  phy:0x02 reg:0x00 val:0x1b64
 irq/54-2188000.-223   [000]    958.940800: mdio_access: 2188000.ethernet-1 
write phy:0x02 reg:0x00 val:0x9b60
 irq/54-2188000.-223   [000]    958.940857: mdio_access: 2188000.ethernet-1 
read  phy:0x02 reg:0x00 val:0x1b60
 irq/54-2188000.-223   [000]    958.940914: mdio_access: 2188000.ethernet-1 
read  phy:0x02 reg:0x01 val:0xc801
 irq/54-2188000.-223   [000]    958.940984: mdio_access: 2188000.ethernet-1 
read  phy:0x02 reg:0x00 val:0x1b60
 irq/54-2188000.-223   [000]    958.941043: mdio_access: 2188000.ethernet-1 
write phy:0x02 reg:0x00 val:0x9b60
 irq/54-2188000.-223   [000]    958.941100: mdio_access: 2188000.ethernet-1 
read  phy:0x02 reg:0x00 val:0x1b60
 irq/54-2188000.-223   [000]    958.941158: mdio_access: 2188000.ethernet-1 
read  phy:0x02 reg:0x01 val:0xc801
 irq/54-2188000.-223   [000]    958.941218: mdio_access: 2188000.ethernet-1 
read  phy:0x02 reg:0x00 val:0x1b60
 irq/54-2188000.-223   [000]    958.941276: mdio_access: 2188000.ethernet-1 
write phy:0x02 reg:0x00 val:0x9b64

Thanks,
Benny

[PATCH net-next] MAINTAINERS: Add page_pool maintainer entry

2019-07-05 Thread Jesper Dangaard Brouer

In this release cycle the number of NIC drivers using page_pool
will likely reach 4 drivers.  It is about time to add a maintainer
entry.  Add myself.

Signed-off-by: Jesper Dangaard Brouer 
---
 MAINTAINERS |7 +++
 1 file changed, 7 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 449e7cdb3303..1a8e0a01bf03 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -11902,6 +11902,13 @@ F: kernel/padata.c
 F: include/linux/padata.h
 F: Documentation/padata.txt
 
+PAGE POOL
+M: Jesper Dangaard Brouer 
+L: netdev@vger.kernel.org
+S: Supported
+F: net/core/page_pool.c
+F: include/net/page_pool.h
+
 PANASONIC LAPTOP ACPI EXTRAS DRIVER
 M: Harald Welte 
 L: platform-driver-...@vger.kernel.org

[PATCH net-next V2] MAINTAINERS: Add page_pool maintainer entry

2019-07-05 Thread Jesper Dangaard Brouer

In this release cycle the number of NIC drivers using page_pool
will likely reach 4 drivers.  It is about time to add a maintainer
entry.  Add myself and Ilias.

Signed-off-by: Jesper Dangaard Brouer 
---
V2: Ilias also volunteered to co-maintain over IRC

 MAINTAINERS |8 
 1 file changed, 8 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 449e7cdb3303..22655aa84a46 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -11902,6 +11902,14 @@ F: kernel/padata.c
 F: include/linux/padata.h
 F: Documentation/padata.txt
 
+PAGE POOL
+M: Jesper Dangaard Brouer 
+M: Ilias Apalodimas 
+L: netdev@vger.kernel.org
+S: Supported
+F: net/core/page_pool.c
+F: include/net/page_pool.h
+
 PANASONIC LAPTOP ACPI EXTRAS DRIVER
 M: Harald Welte 
 L: platform-driver-...@vger.kernel.org

Re: [PATCH net-next 1/8] Documentation/bindings: net: ocelot: document the PTP bank

2019-07-05 Thread Antoine Tenart

Hi Andrew,

On Mon, Jul 01, 2019 at 03:52:14PM +0200, Andrew Lunn wrote:
> On Mon, Jul 01, 2019 at 12:03:20PM +0200, Antoine Tenart wrote:
> > One additional register range needs to be described within the Ocelot
> > device tree node: the PTP. This patch documents the binding needed to do
> > so.
> 
> Are there any more register banks? Maybe just add them all?

I checked and there are (just a few) more. I also saw your other comment
about interrupts, and it's also true there.

Those definitions aren't related to the PHC so I'll prepare a patch for
a following series to add all the missing parts.

> Also, you should probably add a comment that despite it being in the
> Required part of the binding, it is actually optional.

I'm not sure about this: optional properties means some parts of the h/w
can be missing or not wired. It's not the case here, it's "optional" in
the driver only for dt compatibility (so that an older dt blob can work
with a newer kernel image), but it's now mandatory in the binding.

Thanks!
Antoine

-- 
Antoine Ténart, Bootlin
Embedded Linux and Kernel engineering
https://bootlin.com

Request for backport of 96125bf9985a75db00496dd2bc9249b777d2b19b

2019-07-05 Thread Loganaden Velvindron

Hi folks,

I read the guidelines for LTS/stable.
https://www.kernel.org/doc/html/latest/process/stable-kernel-rules.html


Although this is not a bugfix, I am humbly submitting a request so
that commit id
-- 96125bf9985a75db00496dd2bc9249b777d2b19b Allow 0.0.0.0/8 as a valid
address range --  is backported to all LTS kernels.

My motivation for such a request is that we need this patch to be as
widely deployed as possible and as early as possible for interop, and
hopefully to move towards better utilization of the IPv4 address space.
Hence my request for it to be added to -stable.

Kind regards,
//Logan

Re: NEIGH: BUG, double timer add, state is 8

2019-07-05 Thread David Ahern

On 7/4/19 3:59 PM, Marek Majkowski wrote:
> I found a way to hit an obscure BUG in the
> net/core/neighbour.c:neigh_add_timer(), by piping two carefully
> crafted messages into AF_NETLINK socket.
> 
> https://github.com/torvalds/linux/blob/v5.2-rc7/net/core/neighbour.c#L259
> 
> if (unlikely(mod_timer(&n->timer, when))) {
> printk("NEIGH: BUG, double timer add, state is %x\n", n->nud_state);
> dump_stack();
>  }
> 
> The repro is here:
> https://gist.github.com/majek/d70297b9d72bc2e2b82145e122722a0c
> 
> wget 
> https://gist.githubusercontent.com/majek/d70297b9d72bc2e2b82145e122722a0c/raw/9e140bcedecc28d722022f1da142a379a9b7a7b0/double_timer_add_bug.c

Thanks for the report - and the reproducer. I am on PTO through Monday;
I will take a look next week if no one else does.

Re: [net-next 14/14] net/mlx5e: Add kTLS TX HW offload support

2019-07-05 Thread Tariq Toukan



On 7/4/2019 11:12 PM, Jakub Kicinski wrote:
> On Thu, 4 Jul 2019 18:16:15 +, Saeed Mahameed wrote:
>> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c 
>> b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
>> index 483d321d2151..6854f132d505 100644
>> --- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
>> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
>> @@ -50,6 +50,15 @@ static const struct counter_desc sw_stats_desc[] = {
>>   #ifdef CONFIG_MLX5_EN_TLS
>>  { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_tls_ooo) },
>>  { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_tls_resync_bytes) },
>> +
>> +{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_ktls_ooo) },
> 
> Why do you call this stat tx_ktls_ooo, and not tx_tls_ooo (extra 'k')?
> 
> For nfp I used the stats' names from mlx5 FPGA to make sure we are all
> consistent.  I've added them to the tls-offload.rst doc and Boris has
> reviewed it.
> 
>   * ``rx_tls_decrypted`` - number of successfully decrypted TLS segments
>   * ``tx_tls_encrypted`` - number of in-order TLS segments passed to device
> for encryption
>   * ``tx_tls_ooo`` - number of TX packets which were part of a TLS stream
> but did not arrive in the expected order
>   * ``tx_tls_drop_no_sync_data`` - number of TX packets dropped because
> they arrived out of order and associated record could not be found
> 
> Why can't you use the same names for the stats as you used for your mlx5
> FPGA?
> 

Agree. Fixing.

What about having stats both for packets and bytes?
tx_tls_encrypted_packets
tx_tls_encrypted_bytes

>> +{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, 
>> tx_ktls_ooo_drop_no_sync_data) },
>> +{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, 
>> tx_ktls_ooo_drop_bypass_req) },
>> +{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_ktls_ooo_dump_bytes) },
>> +{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_ktls_ooo_dump_packets) },
>> +{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_ktls_enc_packets) },
>> +{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_ktls_enc_bytes) },
>> +{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_ktls_ctx) },
>>   #endif
>>   
>>  { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_lro_packets) },
> 
> Dave, please don't apply this, I will review in depth once I get
> through the earlier 200 emails ;)
>

Re: [PATCH bpf-next v3] libbpf: add xsk_ring_prod__nb_free() function

2019-07-05 Thread Daniel Borkmann

On 07/03/2019 02:52 PM, Eelco Chaudron wrote:
> When an AF_XDP application received X packets, it does not mean X
> frames can be stuffed into the producer ring. To make it easier for
> AF_XDP applications this API allows them to check how many frames can
> be added into the ring.
> 
> Signed-off-by: Eelco Chaudron 

The commit log as it is along with the code is a bit too confusing for
readers. After all you only do a rename below. It would need to additionally
state that the rename is as per libbpf convention (xyz__ prefix) in order to
denote that this API is exposed to be used by applications.

Given you are doing this for xsk_prod_nb_free(), should we do the same for
xsk_cons_nb_avail() as well? Extending XDP sample app would be reasonable
addition as well in this context.

> ---
> 
> v2 -> v3
>  - Removed cache bypass option
> 
> v1 -> v2
>  - Renamed xsk_ring_prod__free() to xsk_ring_prod__nb_free()
>  - Add caching so it will only touch global state when needed
> 
>  tools/lib/bpf/xsk.h | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/tools/lib/bpf/xsk.h b/tools/lib/bpf/xsk.h
> index 82ea71a0f3ec..3411556e04d9 100644
> --- a/tools/lib/bpf/xsk.h
> +++ b/tools/lib/bpf/xsk.h
> @@ -76,7 +76,7 @@ xsk_ring_cons__rx_desc(const struct xsk_ring_cons *rx, 
> __u32 idx)
>   return &descs[idx & rx->mask];
>  }
>  
> -static inline __u32 xsk_prod_nb_free(struct xsk_ring_prod *r, __u32 nb)
> +static inline __u32 xsk_prod__nb_free(struct xsk_ring_prod *r, __u32 nb)
>  {
>   __u32 free_entries = r->cached_cons - r->cached_prod;
>  
> @@ -110,7 +110,7 @@ static inline __u32 xsk_cons_nb_avail(struct 
> xsk_ring_cons *r, __u32 nb)
>  static inline size_t xsk_ring_prod__reserve(struct xsk_ring_prod *prod,
>   size_t nb, __u32 *idx)
>  {
> - if (xsk_prod_nb_free(prod, nb) < nb)
> + if (xsk_prod__nb_free(prod, nb) < nb)
>   return 0;
>  
>   *idx = prod->cached_prod;
>

Re: i.mx6ul with DSA in multi chip addressing mode - no MDIO access

2019-07-05 Thread Andrew Lunn

On Fri, Jul 05, 2019 at 02:41:43PM +0200, Benjamin Beckmeyer wrote:
> >> &mdio0 {
> >> interrupt-parent = <&gpio1>;
> >> interrupts = <3 IRQ_TYPE_LEVEL_HIGH>;
> >>
> >> switch0: switch0@2 {
> >> compatible = "marvell,mv88e6190";
> >> reg = <2>;
> >> pinctrl-0 = <&pinctrl_gpios>;
> >> reset-gpios = <&gpio4 16 GPIO_ACTIVE_LOW>;
> >> dsa,member = <0 0>;
> > This is wrong. The interrupt is a switch property, not an MDIO bus
> > property. So it belongs inside the switch node.
> >
> >   Andrew
> 
> Hi Andrew,
> 
> in the documentation for Marvell DSA the interrupt properties are in 
> the MDIO part. Maybe the documentation for device tree is wrong or 
> unclear?

Ah. Yes. The documentation is wrong. I will fix that.

> 
> I switched to the kernel 5.1.16 to take advantage of your new code.
> At the moment I deleted all interrupt properties from my device tree 
> and if I get you right now the access should be trigger all 100ms but 
> I have accesses within the tracing about 175 times a second.
> 
> Here is a snip from my trace without IRQ
> 2188000.etherne-223   [000]    109.932406: mdio_access: 
> 2188000.ethernet-1 read  phy:0x02 reg:0x01 val:0x40a8
>  2188000.etherne-223   [000]    109.932501: mdio_access: 
> 2188000.ethernet-1 read  phy:0x02 reg:0x00 val:0x1b64
>  2188000.etherne-223   [000]    109.933113: mdio_access: 
> 2188000.ethernet-1 write phy:0x02 reg:0x00 val:0x9b60
>  2188000.etherne-223   [000]    109.933261: mdio_access: 
> 2188000.ethernet-1 read  phy:0x02 reg:0x00 val:0x1b60
>  2188000.etherne-223   [000]    109.933359: mdio_access: 
> 2188000.ethernet-1 read  phy:0x02 reg:0x01 val:0xc801

>  2188000.etherne-223   [000]    110.041683: mdio_access: 
> 2188000.ethernet-1 read  phy:0x02 reg:0x00 val:0x1b60
>  2188000.etherne-223   [000]    110.041817: mdio_access: 
> 2188000.ethernet-1 write phy:0x02 reg:0x00 val:0x9b60
>  2188000.etherne-223   [000]    110.041919: mdio_access: 
> 2188000.ethernet-1 read  phy:0x02 reg:0x00 val:0x1b60
>  2188000.etherne-223   [000]    110.042025: mdio_access: 
> 2188000.ethernet-1 read  phy:0x02 reg:0x01 val:0xc801

These four accesses make up one switch register access. The first read
checks that the busy bit is not set. The second access, a write, sets up a
read of switch register 0x00 at device address 0x1b, i.e. global 1. So this
is the interrupt status register. The third access, a read, checks that the
busy bit has cleared. And the last read returns the actual value of the register.

> 
> Am I doing it right with the tracing points? I run just
> 
> echo 1 > /sys/kernel/debug/tracing/events/mdio/mdio_access/enable
> cat /sys/kernel/debug/tracing/trace

That looks correct.

I think you are going to have to parse the register writes/reads to
figure out what switch registers it is accessing. That should
hopefully make it clearer why it is making so many accesses.

> Here is the another device tree I tried, but with this I get accesses 
> on the bus in about every 50 microseconds!
> 
> --snip
> &mdio0 {
> switch0: switch0@2 {
> compatible = "marvell,mv88e6190";
> reg = <2>;
> pinctrl-0 = <&pinctrl_switch_irq>;
> interrupt-parent = <&gpio1>;
> interrupts = <3 IRQ_TYPE_LEVEL_LOW>;
> interrupt-controller;
> #interrupt-cells = <2>;
> dsa,member = <0 0>;
> 
> ports {
> #address-cells = <1>;
> #size-cells = <0>;
> --snip

That looks sensible.

 Andrew

Re: [PATCH net-next 1/8] Documentation/bindings: net: ocelot: document the PTP bank

2019-07-05 Thread Andrew Lunn

On Fri, Jul 05, 2019 at 03:30:16PM +0200, Antoine Tenart wrote:
> Hi Andrew,
> 
> On Mon, Jul 01, 2019 at 03:52:14PM +0200, Andrew Lunn wrote:
> > On Mon, Jul 01, 2019 at 12:03:20PM +0200, Antoine Tenart wrote:
> > > One additional register range needs to be described within the Ocelot
> > > device tree node: the PTP. This patch documents the binding needed to do
> > > so.
> > 
> > Are there any more register banks? Maybe just add them all?
> 
> I checked and there are (just a few) more. I also saw your other comment
> about interrupts, and it's also true there.
> 
> Those definitions aren't related to the PHC so I'll prepare a patch for
> a following series to add all the missing parts.

Thanks.
 
> > Also, you should probably add a comment that despite it being in the
> > Required part of the binding, it is actually optional.
> 
> I'm not sure about this: optional properties means some parts of the h/w
> can be missing or not wired. It's not the case here, it's "optional" in
> the driver only for dt compatibility (so that an older dt blob can work
> with a newer kernel image), but it's now mandatory in the binding.

Hi Antoine

If the driver can work without it, it is clearly optional. You just
get reduced functionality. That is the thing with DT. You can never
add more required properties after the first commit without breaking
backwards compatibility. To make the documentation fit the driver,
somewhere you need to state they are optional. Either by placing the
new properties in the optional section of the binding, or add a
comment.

Andrew

Re: [PATCH net-next v6 5/5] selftests: tc-tests: actions: add MPLS tests

2019-07-05 Thread Roman Mashak

John Hurley  writes:

> Add a new series of selftests to verify the functionality of act_mpls in
> TC.
>
> Signed-off-by: John Hurley 
> Reviewed-by: Simon Horman 
> Acked-by: Jakub Kicinski 
> ---
>  tools/testing/selftests/tc-testing/config  |   1 +
>  .../tc-testing/tc-tests/actions/mpls.json  | 812 
> +
>  2 files changed, 813 insertions(+)
>  create mode 100644 
> tools/testing/selftests/tc-testing/tc-tests/actions/mpls.json
>

[...]

Thanks for contributing tdc test cases. It would make sense to add tests
for max values and exceeding max allowed values, e.g. for mpls labels,
ttl and such, as we already do for other actions.

[PATCH net-next 11/12] net/mlx5e: Introduce a fenced NOP WQE posting function

2019-07-05 Thread Tariq Toukan

Similar to the existing mlx5e_post_nop(), but marks a fence
in the WQE control segment.

Added as a separate new function to not hurt the performance
of the common case.

To be used in a downstream patch of the series.

Signed-off-by: Tariq Toukan 
Reviewed-by: Boris Pismenny 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h | 18 ++
 1 file changed, 18 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h
index af6aec717d4e..ef16f9e41cf4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h
@@ -58,6 +58,24 @@
return wqe;
 }
 
+static inline struct mlx5e_tx_wqe *
+mlx5e_post_nop_fence(struct mlx5_wq_cyc *wq, u32 sqn, u16 *pc)
+{
+   u16 pi   = mlx5_wq_cyc_ctr2ix(wq, *pc);
+   struct mlx5e_tx_wqe*wqe  = mlx5_wq_cyc_get_wqe(wq, pi);
+   struct mlx5_wqe_ctrl_seg   *cseg = &wqe->ctrl;
+
+   memset(cseg, 0, sizeof(*cseg));
+
+   cseg->opmod_idx_opcode = cpu_to_be32((*pc << 8) | MLX5_OPCODE_NOP);
+   cseg->qpn_ds   = cpu_to_be32((sqn << 8) | 0x01);
+   cseg->fm_ce_se = MLX5_FENCE_MODE_INITIATOR_SMALL;
+
+   (*pc)++;
+
+   return wqe;
+}
+
 static inline void
 mlx5e_fill_sq_frag_edge(struct mlx5e_txqsq *sq, struct mlx5_wq_cyc *wq,
u16 pi, u16 nnops)
-- 
1.8.3.1

[PATCH net-next 04/12] net/mlx5: Accel, Add core TLS support for the Connect-X family

2019-07-05 Thread Tariq Toukan

Add support for the new TLS implementation of the Connect-X family.
Introduce a new compilation flag MLX5_TLS for it.

Signed-off-by: Tariq Toukan 
Signed-off-by: Eran Ben Elisha 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/Kconfig| 13 +-
 .../net/ethernet/mellanox/mlx5/core/accel/tls.c| 42 ++-
 .../net/ethernet/mellanox/mlx5/core/accel/tls.h| 49 +-
 3 files changed, 101 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig 
b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
index 6556490d809c..37fef8cd25e3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
@@ -134,10 +134,21 @@ config MLX5_FPGA_TLS
mlx5_core driver will include the Innova FPGA core and allow building
sandbox-specific client drivers.
 
+config MLX5_TLS
+   bool "Mellanox Technologies TLS Connect-X support"
+   depends on MLX5_CORE_EN
+   depends on TLS_DEVICE
+   depends on TLS=y || MLX5_CORE=m
+   select MLX5_ACCEL
+   default n
+   help
+   Build TLS support for the Connect-X family of network cards by Mellanox
+   Technologies.
+
 config MLX5_EN_TLS
bool "TLS cryptography-offload accelaration"
depends on MLX5_CORE_EN
-   depends on MLX5_FPGA_TLS
+   depends on MLX5_FPGA_TLS || MLX5_TLS
default y
help
Build support for TLS cryptography-offload accelaration in the NIC.
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.c 
b/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.c
index a2c9eda1ebf5..cab708af3422 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.c
@@ -35,6 +35,7 @@
 
 #include "accel/tls.h"
 #include "mlx5_core.h"
+#include "lib/mlx5.h"
 
 #ifdef CONFIG_MLX5_FPGA_TLS
 #include "fpga/tls.h"
@@ -63,7 +64,8 @@ int mlx5_accel_tls_resync_rx(struct mlx5_core_dev *mdev, u32 
handle, u32 seq,
 
 bool mlx5_accel_is_tls_device(struct mlx5_core_dev *mdev)
 {
-   return mlx5_fpga_is_tls_device(mdev);
+   return mlx5_fpga_is_tls_device(mdev) ||
+   mlx5_accel_is_ktls_device(mdev);
 }
 
 u32 mlx5_accel_tls_device_caps(struct mlx5_core_dev *mdev)
@@ -81,3 +83,41 @@ void mlx5_accel_tls_cleanup(struct mlx5_core_dev *mdev)
mlx5_fpga_tls_cleanup(mdev);
 }
 #endif
+
+#ifdef CONFIG_MLX5_TLS
+int mlx5_ktls_create_key(struct mlx5_core_dev *mdev,
+struct tls_crypto_info *crypto_info,
+u32 *p_key_id)
+{
+   u32 sz_bytes;
+   void *key;
+
+   switch (crypto_info->cipher_type) {
+   case TLS_CIPHER_AES_GCM_128: {
+   struct tls12_crypto_info_aes_gcm_128 *info =
+   (struct tls12_crypto_info_aes_gcm_128 *)crypto_info;
+
+   key  = info->key;
+   sz_bytes = sizeof(info->key);
+   break;
+   }
+   case TLS_CIPHER_AES_GCM_256: {
+   struct tls12_crypto_info_aes_gcm_256 *info =
+   (struct tls12_crypto_info_aes_gcm_256 *)crypto_info;
+
+   key  = info->key;
+   sz_bytes = sizeof(info->key);
+   break;
+   }
+   default:
+   return -EINVAL;
+   }
+
+   return mlx5_create_encryption_key(mdev, key, sz_bytes, p_key_id);
+}
+
+void mlx5_ktls_destroy_key(struct mlx5_core_dev *mdev, u32 key_id)
+{
+   mlx5_destroy_encryption_key(mdev, key_id);
+}
+#endif
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.h 
b/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.h
index e5d306ad7f91..879321b21616 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.h
@@ -37,6 +37,50 @@
 #include 
 #include 
 
+#ifdef CONFIG_MLX5_TLS
+int mlx5_ktls_create_key(struct mlx5_core_dev *mdev,
+struct tls_crypto_info *crypto_info,
+u32 *p_key_id);
+void mlx5_ktls_destroy_key(struct mlx5_core_dev *mdev, u32 key_id);
+
+static inline bool mlx5_accel_is_ktls_device(struct mlx5_core_dev *mdev)
+{
+   if (!MLX5_CAP_GEN(mdev, tls))
+   return false;
+
+   if (!MLX5_CAP_GEN(mdev, log_max_dek))
+   return false;
+
+   return MLX5_CAP_TLS(mdev, tls_1_2_aes_gcm_128);
+}
+
+static inline bool mlx5e_ktls_type_check(struct mlx5_core_dev *mdev,
+struct tls_crypto_info *crypto_info)
+{
+   switch (crypto_info->cipher_type) {
+   case TLS_CIPHER_AES_GCM_128:
+   if (crypto_info->version == TLS_1_2_VERSION)
+   return MLX5_CAP_TLS(mdev,  tls_1_2_aes_gcm_128);
+   break;
+   }
+
+   return false;
+}
+#else
+static inline int
+mlx5_ktls_create_key(struct mlx5_core_dev *mdev,
+struct tls_crypto_info *cry

[PATCH net-next 03/12] net/mlx5: Add crypto library to support create/destroy encryption key

2019-07-05 Thread Tariq Toukan

Encryption key create / destroy is done via
CREATE_GENERAL_OBJECT / DESTROY_GENERAL_OBJECT commands.

To be used in downstream patches by TLS API wrappers, to configure
the TIS context with the encryption key.

Signed-off-by: Tariq Toukan 
Signed-off-by: Eran Ben Elisha 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |  2 +-
 .../net/ethernet/mellanox/mlx5/core/lib/crypto.c   | 72 ++
 drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h |  5 ++
 3 files changed, 78 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/crypto.c

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile 
b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index d3409870646a..5a1ee9ec8659 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -55,7 +55,7 @@ mlx5_core-$(CONFIG_MLX5_CORE_IPOIB) += ipoib/ipoib.o 
ipoib/ethtool.o ipoib/ipoib
 #
 mlx5_core-$(CONFIG_MLX5_FPGA_IPSEC) += fpga/ipsec.o
 mlx5_core-$(CONFIG_MLX5_FPGA_TLS)   += fpga/tls.o
-mlx5_core-$(CONFIG_MLX5_ACCEL)  += accel/tls.o accel/ipsec.o
+mlx5_core-$(CONFIG_MLX5_ACCEL)  += lib/crypto.o accel/tls.o accel/ipsec.o
 
 mlx5_core-$(CONFIG_MLX5_FPGA) += fpga/cmd.o fpga/core.o fpga/conn.o fpga/sdk.o
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/crypto.c 
b/drivers/net/ethernet/mellanox/mlx5/core/lib/crypto.c
new file mode 100644
index ..ea9ee88491e5
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/crypto.c
@@ -0,0 +1,72 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+// Copyright (c) 2019 Mellanox Technologies.
+
+#include "mlx5_core.h"
+
+int mlx5_create_encryption_key(struct mlx5_core_dev *mdev,
+  void *key, u32 sz_bytes,
+  u32 *p_key_id)
+{
+   u32 in[MLX5_ST_SZ_DW(create_encryption_key_in)] = {};
+   u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)];
+   u32 sz_bits = sz_bytes * BITS_PER_BYTE;
+   u8  general_obj_key_size;
+   u64 general_obj_types;
+   void *obj, *key_p;
+   int err;
+
+   obj = MLX5_ADDR_OF(create_encryption_key_in, in, encryption_key_object);
+   key_p = MLX5_ADDR_OF(encryption_key_obj, obj, key);
+
+   general_obj_types = MLX5_CAP_GEN_64(mdev, general_obj_types);
+   if (!(general_obj_types &
+ MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_ENCRYPTION_KEY))
+   return -EINVAL;
+
+   switch (sz_bits) {
+   case 128:
+   general_obj_key_size =
+   MLX5_GENERAL_OBJECT_TYPE_ENCRYPTION_KEY_KEY_SIZE_128;
+   break;
+   case 256:
+   general_obj_key_size =
+   MLX5_GENERAL_OBJECT_TYPE_ENCRYPTION_KEY_KEY_SIZE_256;
+   break;
+   default:
+   return -EINVAL;
+   }
+
+   memcpy(key_p, key, sz_bytes);
+
+   MLX5_SET(encryption_key_obj, obj, key_size, general_obj_key_size);
+   MLX5_SET(encryption_key_obj, obj, key_type,
+MLX5_GENERAL_OBJECT_TYPE_ENCRYPTION_KEY_TYPE_DEK);
+   MLX5_SET(general_obj_in_cmd_hdr, in, opcode,
+MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
+   MLX5_SET(general_obj_in_cmd_hdr, in, obj_type,
+MLX5_GENERAL_OBJECT_TYPES_ENCRYPTION_KEY);
+   MLX5_SET(encryption_key_obj, obj, pd, mdev->mlx5e_res.pdn);
+
+   err = mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
+   if (!err)
+   *p_key_id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id);
+
+   /* avoid leaking key on the stack */
+   memzero_explicit(in, sizeof(in));
+
+   return err;
+}
+
+void mlx5_destroy_encryption_key(struct mlx5_core_dev *mdev, u32 key_id)
+{
+   u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {};
+   u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)];
+
+   MLX5_SET(general_obj_in_cmd_hdr, in, opcode,
+MLX5_CMD_OP_DESTROY_GENERAL_OBJECT);
+   MLX5_SET(general_obj_in_cmd_hdr, in, obj_type,
+MLX5_GENERAL_OBJECT_TYPES_ENCRYPTION_KEY);
+   MLX5_SET(general_obj_in_cmd_hdr, in, obj_id, key_id);
+
+   mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h 
b/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h
index d918e44491f4..b99d469e4e64 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h
@@ -79,4 +79,9 @@ struct mlx5_pme_stats {
 void mlx5_get_pme_stats(struct mlx5_core_dev *dev, struct mlx5_pme_stats 
*stats);
 int mlx5_notifier_call_chain(struct mlx5_events *events, unsigned int event, 
void *data);
 
+/* Crypto */
+int mlx5_create_encryption_key(struct mlx5_core_dev *mdev,
+  void *key, u32 sz_bytes, u32 *p_key_id);
+void mlx5_destroy_encryption_key(struct mlx5_core_dev *mdev, u32 key_id);
+
 #endif
-- 
1.

[PATCH net-next 10/12] net/mlx5e: Re-work TIS creation functions

2019-07-05 Thread Tariq Toukan

Let the EN TIS creation function (mlx5e_create_tis) be responsible
for applying common mdev related fields.
Other specific fields must be set by the caller and passed within
the inbox.

Signed-off-by: Tariq Toukan 
Reviewed-by: Boris Pismenny 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h|  3 +--
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c   | 17 ++---
 drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c   | 14 +-
 drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h   |  2 ++
 .../net/ethernet/mellanox/mlx5/core/ipoib/ipoib_vlan.c  |  2 +-
 5 files changed, 27 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 09c43c9f3b4a..d3d2733917ff 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -1053,8 +1053,7 @@ int mlx5e_open_drop_rq(struct mlx5e_priv *priv,
 void mlx5e_destroy_direct_tirs(struct mlx5e_priv *priv, struct mlx5e_tir 
*tirs);
 void mlx5e_destroy_rqt(struct mlx5e_priv *priv, struct mlx5e_rqt *rqt);
 
-int mlx5e_create_tis(struct mlx5_core_dev *mdev, int tc,
-u32 underlay_qpn, u32 *tisn);
+int mlx5e_create_tis(struct mlx5_core_dev *mdev, void *in, u32 *tisn);
 void mlx5e_destroy_tis(struct mlx5_core_dev *mdev, u32 tisn);
 
 int mlx5e_create_tises(struct mlx5e_priv *priv);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index edbedb1c85f8..075496de00e5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -3150,20 +3150,16 @@ void mlx5e_close_drop_rq(struct mlx5e_rq *drop_rq)
mlx5e_free_cq(&drop_rq->cq);
 }
 
-int mlx5e_create_tis(struct mlx5_core_dev *mdev, int tc,
-u32 underlay_qpn, u32 *tisn)
+int mlx5e_create_tis(struct mlx5_core_dev *mdev, void *in, u32 *tisn)
 {
-   u32 in[MLX5_ST_SZ_DW(create_tis_in)] = {0};
void *tisc = MLX5_ADDR_OF(create_tis_in, in, ctx);
 
-   MLX5_SET(tisc, tisc, prio, tc << 1);
-   MLX5_SET(tisc, tisc, underlay_qpn, underlay_qpn);
MLX5_SET(tisc, tisc, transport_domain, mdev->mlx5e_res.td.tdn);
 
if (mlx5_lag_is_lacp_owner(mdev))
MLX5_SET(tisc, tisc, strict_lag_tx_port_affinity, 1);
 
-   return mlx5_core_create_tis(mdev, in, sizeof(in), tisn);
+   return mlx5_core_create_tis(mdev, in, MLX5_ST_SZ_BYTES(create_tis_in), 
tisn);
 }
 
 void mlx5e_destroy_tis(struct mlx5_core_dev *mdev, u32 tisn)
@@ -3177,7 +3173,14 @@ int mlx5e_create_tises(struct mlx5e_priv *priv)
int tc;
 
for (tc = 0; tc < priv->profile->max_tc; tc++) {
-   err = mlx5e_create_tis(priv->mdev, tc, 0, &priv->tisn[tc]);
+   u32 in[MLX5_ST_SZ_DW(create_tis_in)] = {};
+   void *tisc;
+
+   tisc = MLX5_ADDR_OF(create_tis_in, in, ctx);
+
+   MLX5_SET(tisc, tisc, prio, tc << 1);
+
+   err = mlx5e_create_tis(priv->mdev, in, &priv->tisn[tc]);
if (err)
goto err_close_tises;
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c 
b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c
index 00e66c3772cc..faf197d53743 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c
@@ -258,6 +258,18 @@ void mlx5i_destroy_underlay_qp(struct mlx5_core_dev *mdev, 
struct mlx5_core_qp *
mlx5_core_destroy_qp(mdev, qp);
 }
 
+int mlx5i_create_tis(struct mlx5_core_dev *mdev, u32 underlay_qpn, u32 *tisn)
+{
+   u32 in[MLX5_ST_SZ_DW(create_tis_in)] = {};
+   void *tisc;
+
+   tisc = MLX5_ADDR_OF(create_tis_in, in, ctx);
+
+   MLX5_SET(tisc, tisc, underlay_qpn, underlay_qpn);
+
+   return mlx5e_create_tis(mdev, in, tisn);
+}
+
 static int mlx5i_init_tx(struct mlx5e_priv *priv)
 {
struct mlx5i_priv *ipriv = priv->ppriv;
@@ -269,7 +281,7 @@ static int mlx5i_init_tx(struct mlx5e_priv *priv)
return err;
}
 
-   err = mlx5e_create_tis(priv->mdev, 0 /* tc */, ipriv->qp.qpn, 
&priv->tisn[0]);
+   err = mlx5i_create_tis(priv->mdev, ipriv->qp.qpn, &priv->tisn[0]);
if (err) {
mlx5_core_warn(priv->mdev, "create tis failed, %d\n", err);
goto err_destroy_underlay_qp;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h 
b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h
index e19ba3fcd1b7..c87962cab921 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h
@@ -59,6 +59,8 @@ struct mlx5i_priv {
char  *mlx5e_priv[0];
 };
 
+int mlx5i_create_tis(struct mlx5_core_dev *mdev, u32 underlay_qpn, u32 *tisn);
+
 /* Underlay QP create/destroy functions */
 int mlx5i_cr

[PATCH net-next 01/12] net/mlx5: Accel, Expose accel wrapper for IPsec FPGA function

2019-07-05 Thread Tariq Toukan

Do not directly call fpga version of IPsec function from main.c.
Wrap it by an accel version, and call the wrapper.

This will allow deprecating the FPGA IPsec stubs in a downstream
patch.

Signed-off-by: Tariq Toukan 
Reviewed-by: Boris Pismenny 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.c | 5 +
 drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.h | 5 +
 drivers/net/ethernet/mellanox/mlx5/core/main.c| 2 +-
 3 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.c 
b/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.c
index 9f1b1939716a..d1e76d5a413b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.c
@@ -74,6 +74,11 @@ int mlx5_accel_ipsec_init(struct mlx5_core_dev *mdev)
return mlx5_fpga_ipsec_init(mdev);
 }
 
+void mlx5_accel_ipsec_build_fs_cmds(void)
+{
+   mlx5_fpga_ipsec_build_fs_cmds();
+}
+
 void mlx5_accel_ipsec_cleanup(struct mlx5_core_dev *mdev)
 {
mlx5_fpga_ipsec_cleanup(mdev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.h 
b/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.h
index 024dbd22a89b..93b3f5faddb5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.h
@@ -54,6 +54,7 @@ void *mlx5_accel_esp_create_hw_context(struct mlx5_core_dev 
*mdev,
 void mlx5_accel_esp_free_hw_context(void *context);
 
 int mlx5_accel_ipsec_init(struct mlx5_core_dev *mdev);
+void mlx5_accel_ipsec_build_fs_cmds(void);
 void mlx5_accel_ipsec_cleanup(struct mlx5_core_dev *mdev);
 
 #else
@@ -79,6 +80,10 @@ static inline int mlx5_accel_ipsec_init(struct mlx5_core_dev 
*mdev)
return 0;
 }
 
+static inline void mlx5_accel_ipsec_build_fs_cmds(void)
+{
+}
+
 static inline void mlx5_accel_ipsec_cleanup(struct mlx5_core_dev *mdev)
 {
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c 
b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 4084c4e74fb7..b15b27a497fc 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -1600,7 +1600,7 @@ static int __init init(void)
get_random_bytes(&sw_owner_id, sizeof(sw_owner_id));
 
mlx5_core_verify_params();
-   mlx5_fpga_ipsec_build_fs_cmds();
+   mlx5_accel_ipsec_build_fs_cmds();
mlx5_register_debugfs();
 
err = pci_register_driver(&mlx5_core_driver);
-- 
1.8.3.1

[PATCH net-next 06/12] net/mlx5e: Tx, Enforce L4 inline copy when needed

2019-07-05 Thread Tariq Toukan

When the ctrl->tisn field is set (non-zero), it indicates an operation
(HW offload) on the TCP payload.
For such WQEs, inline the headers up to L4.

This is in preparation for kTLS HW offload support, added in
a downstream patch.

Signed-off-by: Tariq Toukan 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h | 5 +
 drivers/net/ethernet/mellanox/mlx5/core/en_tx.c   | 5 -
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h
index 7fdf69e08d58..bd41f89afef1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h
@@ -77,6 +77,11 @@ static inline void mlx5e_sq_fetch_wqe(struct mlx5e_txqsq *sq,
mlx5_write64((__be32 *)ctrl, uar_map);
 }
 
+static inline bool mlx5e_transport_inline_tx_wqe(struct mlx5e_tx_wqe *wqe)
+{
+   return !!wqe->ctrl.tisn;
+}
+
 static inline void mlx5e_cq_arm(struct mlx5e_cq *cq)
 {
struct mlx5_core_cq *mcq;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
index dc77fe9ae367..b1a163e66053 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
@@ -304,9 +304,12 @@ netdev_tx_t mlx5e_sq_xmit(struct mlx5e_txqsq *sq, struct 
sk_buff *skb,
num_bytes = skb->len + (skb_shinfo(skb)->gso_segs - 1) * ihs;
stats->packets += skb_shinfo(skb)->gso_segs;
} else {
+   u8 mode = mlx5e_transport_inline_tx_wqe(wqe) ?
+   MLX5_INLINE_MODE_TCP_UDP : sq->min_inline_mode;
+
opcode= MLX5_OPCODE_SEND;
mss   = 0;
-   ihs   = mlx5e_calc_min_inline(sq->min_inline_mode, skb);
+   ihs   = mlx5e_calc_min_inline(mode, skb);
num_bytes = max_t(unsigned int, skb->len, ETH_ZLEN);
stats->packets++;
}
-- 
1.8.3.1

[PATCH net-next 09/12] net/mlx5e: Tx, Unconstify SQ stop room

2019-07-05 Thread Tariq Toukan

Use an SQ field for stop_room, and use the larger value only if TLS
is supported.

Signed-off-by: Tariq Toukan 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h  |  1 +
 drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h | 14 ++
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c |  5 -
 drivers/net/ethernet/mellanox/mlx5/core/en_tx.c   | 18 ++
 4 files changed, 21 insertions(+), 17 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 6e31b7c07f8e..09c43c9f3b4a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -390,6 +390,7 @@ struct mlx5e_txqsq {
void __iomem  *uar_map;
struct netdev_queue   *txq;
u32sqn;
+   u16stop_room;
u8 min_inline_mode;
struct device *pdev;
__be32 mkey_be;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h
index 1280f4163b53..af6aec717d4e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h
@@ -6,6 +6,20 @@
 
 #include "en.h"
 
+#define MLX5E_SQ_NOPS_ROOM  MLX5_SEND_WQE_MAX_WQEBBS
+#define MLX5E_SQ_STOP_ROOM (MLX5_SEND_WQE_MAX_WQEBBS +\
+   MLX5E_SQ_NOPS_ROOM)
+
+#ifndef CONFIG_MLX5_EN_TLS
+#define MLX5E_SQ_TLS_ROOM (0)
+#else
+/* TLS offload requires additional stop_room for:
+ *  - a resync SKB.
+ */
+#define MLX5E_SQ_TLS_ROOM  \
+   (MLX5_SEND_WQE_MAX_WQEBBS)
+#endif
+
 #define INL_HDR_START_SZ (sizeof(((struct mlx5_wqe_eth_seg 
*)NULL)->inline_hdr.start))
 
 static inline bool
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 0913be65a862..edbedb1c85f8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -1126,11 +1126,14 @@ static int mlx5e_alloc_txqsq(struct mlx5e_channel *c,
sq->uar_map   = mdev->mlx5e_res.bfreg.map;
sq->min_inline_mode = params->tx_min_inline_mode;
sq->stats = &c->priv->channel_stats[c->ix].sq[tc];
+   sq->stop_room = MLX5E_SQ_STOP_ROOM;
INIT_WORK(&sq->recover_work, mlx5e_tx_err_cqe_work);
if (MLX5_IPSEC_DEV(c->priv->mdev))
set_bit(MLX5E_SQ_STATE_IPSEC, &sq->state);
-   if (mlx5_accel_is_tls_device(c->priv->mdev))
+   if (mlx5_accel_is_tls_device(c->priv->mdev)) {
set_bit(MLX5E_SQ_STATE_TLS, &sq->state);
+   sq->stop_room += MLX5E_SQ_TLS_ROOM;
+   }
 
param->wq.db_numa_node = cpu_to_node(c->cpu);
err = mlx5_wq_cyc_create(mdev, ¶m->wq, sqc_wq, wq, &sq->wq_ctrl);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
index 9740ca51921d..200301d6bac5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
@@ -40,19 +40,6 @@
 #include "en_accel/en_accel.h"
 #include "lib/clock.h"
 
-#define MLX5E_SQ_NOPS_ROOM  MLX5_SEND_WQE_MAX_WQEBBS
-
-#ifndef CONFIG_MLX5_EN_TLS
-#define MLX5E_SQ_STOP_ROOM (MLX5_SEND_WQE_MAX_WQEBBS +\
-   MLX5E_SQ_NOPS_ROOM)
-#else
-/* TLS offload requires MLX5E_SQ_STOP_ROOM to have
- * enough room for a resync SKB, a normal SKB and a NOP
- */
-#define MLX5E_SQ_STOP_ROOM (2 * MLX5_SEND_WQE_MAX_WQEBBS +\
-   MLX5E_SQ_NOPS_ROOM)
-#endif
-
 static void mlx5e_dma_unmap_wqe_err(struct mlx5e_txqsq *sq, u8 num_dma)
 {
int i;
@@ -267,7 +254,7 @@ static inline void mlx5e_insert_vlan(void *start, struct 
sk_buff *skb, u16 ihs)
skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS;
 
sq->pc += wi->num_wqebbs;
-   if (unlikely(!mlx5e_wqc_has_room_for(wq, sq->cc, sq->pc, 
MLX5E_SQ_STOP_ROOM))) {
+   if (unlikely(!mlx5e_wqc_has_room_for(wq, sq->cc, sq->pc, 
sq->stop_room))) {
netif_tx_stop_queue(sq->txq);
sq->stats->stopped++;
}
@@ -528,8 +515,7 @@ bool mlx5e_poll_tx_cq(struct mlx5e_cq *cq, int napi_budget)
netdev_tx_completed_queue(sq->txq, npkts, nbytes);
 
if (netif_tx_queue_stopped(sq->txq) &&
-   mlx5e_wqc_has_room_for(&sq->wq, sq->cc, sq->pc,
-  MLX5E_SQ_STOP_ROOM) &&
+   mlx5e_wqc_has_room_for(&sq->wq, sq->cc, sq->pc, sq->stop_room) &&
!test_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state)) {
netif_tx_wake_queue(sq->txq);
stats->wake++;
-- 
1.8.3.1

[PATCH net-next 08/12] net/mlx5e: Tx, Don't implicitly assume SKB-less wqe has one WQEBB

2019-07-05 Thread Tariq Toukan

From: Eran Ben Elisha 

When polling a CQE of an SKB-less WQE, don't assume it consumed only
one WQEBB. Use wi->num_wqebbs directly instead.
In the downstream patch, SKB-less WQEs might have more than one WQEBB,
thus this change is needed.

Signed-off-by: Eran Ben Elisha 
Signed-off-by: Tariq Toukan 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en_tx.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
index 983ea6206a94..9740ca51921d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
@@ -485,8 +485,8 @@ bool mlx5e_poll_tx_cq(struct mlx5e_cq *cq, int napi_budget)
wi = &sq->db.wqe_info[ci];
skb = wi->skb;
 
-   if (unlikely(!skb)) { /* nop */
-   sqcc++;
+   if (unlikely(!skb)) {
+   sqcc += wi->num_wqebbs;
continue;
}
 
-- 
1.8.3.1

[PATCH net-next 02/12] net/mlx5: Kconfig, Better organize compilation flags

2019-07-05 Thread Tariq Toukan

Always keep all acceleration function declarations in
'accel' files, independent of the flags setting.
For this, introduce new flags CONFIG_MLX5_FPGA_{IPSEC,TLS} and use stubs
where needed.

This obsoletes the need for stubs in 'fpga' files. Remove them.

Also use the new flags in Makefile, to decide whether to compile
TLS-specific or IPSEC-specific objects, or not.

Signed-off-by: Tariq Toukan 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/Kconfig| 43 ++---
 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |  7 +-
 .../net/ethernet/mellanox/mlx5/core/accel/ipsec.c  |  4 ++
 .../net/ethernet/mellanox/mlx5/core/accel/ipsec.h  |  2 +-
 .../net/ethernet/mellanox/mlx5/core/accel/tls.c|  3 +
 .../net/ethernet/mellanox/mlx5/core/accel/tls.h|  4 +-
 .../net/ethernet/mellanox/mlx5/core/fpga/ipsec.h   | 75 --
 include/linux/mlx5/accel.h |  2 +-
 8 files changed, 47 insertions(+), 93 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig 
b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
index 7845aa5bf6be..6556490d809c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
@@ -97,26 +97,49 @@ config MLX5_CORE_IPOIB
---help---
  MLX5 IPoIB offloads & acceleration support.
 
+config MLX5_FPGA_IPSEC
+   bool "Mellanox Technologies IPsec Innova support"
+   depends on MLX5_CORE
+   depends on MLX5_FPGA
+   default n
+   help
+   Build IPsec support for the Innova family of network cards by Mellanox
+   Technologies. Innova network cards are comprised of a ConnectX chip
+   and an FPGA chip on one board. If you select this option, the
+   mlx5_core driver will include the Innova FPGA core and allow building
+   sandbox-specific client drivers.
+
 config MLX5_EN_IPSEC
bool "IPSec XFRM cryptography-offload accelaration"
-   depends on MLX5_ACCEL
depends on MLX5_CORE_EN
depends on XFRM_OFFLOAD
depends on INET_ESP_OFFLOAD || INET6_ESP_OFFLOAD
+   depends on MLX5_FPGA_IPSEC
default n
-   ---help---
+   help
  Build support for IPsec cryptography-offload accelaration in the NIC.
  Note: Support for hardware with this capability needs to be selected
  for this option to become available.
 
-config MLX5_EN_TLS
-   bool "TLS cryptography-offload accelaration"
-   depends on MLX5_CORE_EN
+config MLX5_FPGA_TLS
+   bool "Mellanox Technologies TLS Innova support"
depends on TLS_DEVICE
depends on TLS=y || MLX5_CORE=m
-   depends on MLX5_ACCEL
+   depends on MLX5_FPGA
default n
-   ---help---
- Build support for TLS cryptography-offload accelaration in the NIC.
- Note: Support for hardware with this capability needs to be selected
- for this option to become available.
+   help
+   Build TLS support for the Innova family of network cards by Mellanox
+   Technologies. Innova network cards are comprised of a ConnectX chip
+   and an FPGA chip on one board. If you select this option, the
+   mlx5_core driver will include the Innova FPGA core and allow building
+   sandbox-specific client drivers.
+
+config MLX5_EN_TLS
+   bool "TLS cryptography-offload accelaration"
+   depends on MLX5_CORE_EN
+   depends on MLX5_FPGA_TLS
+   default y
+   help
+   Build support for TLS cryptography-offload accelaration in the NIC.
+   Note: Support for hardware with this capability needs to be selected
+   for this option to become available.
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile 
b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index 8456b19d79cd..d3409870646a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -53,10 +53,11 @@ mlx5_core-$(CONFIG_MLX5_CORE_IPOIB) += ipoib/ipoib.o 
ipoib/ethtool.o ipoib/ipoib
 #
 # Accelerations & FPGA
 #
-mlx5_core-$(CONFIG_MLX5_ACCEL) += accel/ipsec.o accel/tls.o
+mlx5_core-$(CONFIG_MLX5_FPGA_IPSEC) += fpga/ipsec.o
+mlx5_core-$(CONFIG_MLX5_FPGA_TLS)   += fpga/tls.o
+mlx5_core-$(CONFIG_MLX5_ACCEL)  += accel/tls.o accel/ipsec.o
 
-mlx5_core-$(CONFIG_MLX5_FPGA) += fpga/cmd.o fpga/core.o fpga/conn.o fpga/sdk.o 
\
-fpga/ipsec.o fpga/tls.o
+mlx5_core-$(CONFIG_MLX5_FPGA) += fpga/cmd.o fpga/core.o fpga/conn.o fpga/sdk.o
 
 mlx5_core-$(CONFIG_MLX5_EN_IPSEC) += en_accel/ipsec.o en_accel/ipsec_rxtx.o \
 en_accel/ipsec_stats.o
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.c 
b/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.c
index d1e76d5a413b..eddc34e4a762 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.c
@@ -31,6 +31,8 @@
  *
  */
 
+#

[PATCH net-next 05/12] net/mlx5e: Move helper functions to a new txrx datapath header

2019-07-05 Thread Tariq Toukan

Move datapath helper functions to a new header file, en/txrx.h.

Signed-off-by: Tariq Toukan 
Signed-off-by: Eran Ben Elisha 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h   | 102 -
 drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h  | 163 +
 drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h   |   1 +
 .../mellanox/mlx5/core/en_accel/en_accel.h |   1 +
 .../mellanox/mlx5/core/en_accel/ipsec_rxtx.h   |   1 +
 .../mellanox/mlx5/core/en_accel/tls_rxtx.h |   1 +
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |   2 +
 drivers/net/ethernet/mellanox/mlx5/core/en_tx.c|  52 +--
 8 files changed, 170 insertions(+), 153 deletions(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index cd5afc6ef50b..6e31b7c07f8e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -549,12 +549,6 @@ struct mlx5e_icosq {
struct mlx5e_channel  *channel;
 } cacheline_aligned_in_smp;
 
-static inline bool
-mlx5e_wqc_has_room_for(struct mlx5_wq_cyc *wq, u16 cc, u16 pc, u16 n)
-{
-   return (mlx5_wq_cyc_ctr2ix(wq, cc - pc) >= n) || (cc == pc);
-}
-
 struct mlx5e_wqe_frag_info {
struct mlx5e_dma_info *di;
u32 offset;
@@ -1023,102 +1017,6 @@ static inline bool mlx5_tx_swp_supported(struct 
mlx5_core_dev *mdev)
MLX5_CAP_ETH(mdev, swp_csum) && MLX5_CAP_ETH(mdev, swp_lso);
 }
 
-struct mlx5e_swp_spec {
-   __be16 l3_proto;
-   u8 l4_proto;
-   u8 is_tun;
-   __be16 tun_l3_proto;
-   u8 tun_l4_proto;
-};
-
-static inline void
-mlx5e_set_eseg_swp(struct sk_buff *skb, struct mlx5_wqe_eth_seg *eseg,
-  struct mlx5e_swp_spec *swp_spec)
-{
-   /* SWP offsets are in 2-bytes words */
-   eseg->swp_outer_l3_offset = skb_network_offset(skb) / 2;
-   if (swp_spec->l3_proto == htons(ETH_P_IPV6))
-   eseg->swp_flags |= MLX5_ETH_WQE_SWP_OUTER_L3_IPV6;
-   if (swp_spec->l4_proto) {
-   eseg->swp_outer_l4_offset = skb_transport_offset(skb) / 2;
-   if (swp_spec->l4_proto == IPPROTO_UDP)
-   eseg->swp_flags |= MLX5_ETH_WQE_SWP_OUTER_L4_UDP;
-   }
-
-   if (swp_spec->is_tun) {
-   eseg->swp_inner_l3_offset = skb_inner_network_offset(skb) / 2;
-   if (swp_spec->tun_l3_proto == htons(ETH_P_IPV6))
-   eseg->swp_flags |= MLX5_ETH_WQE_SWP_INNER_L3_IPV6;
-   } else { /* typically for ipsec when xfrm mode != XFRM_MODE_TUNNEL */
-   eseg->swp_inner_l3_offset = skb_network_offset(skb) / 2;
-   if (swp_spec->l3_proto == htons(ETH_P_IPV6))
-   eseg->swp_flags |= MLX5_ETH_WQE_SWP_INNER_L3_IPV6;
-   }
-   switch (swp_spec->tun_l4_proto) {
-   case IPPROTO_UDP:
-   eseg->swp_flags |= MLX5_ETH_WQE_SWP_INNER_L4_UDP;
-   /* fall through */
-   case IPPROTO_TCP:
-   eseg->swp_inner_l4_offset = skb_inner_transport_offset(skb) / 2;
-   break;
-   }
-}
-
-static inline void mlx5e_sq_fetch_wqe(struct mlx5e_txqsq *sq,
- struct mlx5e_tx_wqe **wqe,
- u16 *pi)
-{
-   struct mlx5_wq_cyc *wq = &sq->wq;
-
-   *pi  = mlx5_wq_cyc_ctr2ix(wq, sq->pc);
-   *wqe = mlx5_wq_cyc_get_wqe(wq, *pi);
-   memset(*wqe, 0, sizeof(**wqe));
-}
-
-static inline
-struct mlx5e_tx_wqe *mlx5e_post_nop(struct mlx5_wq_cyc *wq, u32 sqn, u16 *pc)
-{
-   u16 pi   = mlx5_wq_cyc_ctr2ix(wq, *pc);
-   struct mlx5e_tx_wqe*wqe  = mlx5_wq_cyc_get_wqe(wq, pi);
-   struct mlx5_wqe_ctrl_seg   *cseg = &wqe->ctrl;
-
-   memset(cseg, 0, sizeof(*cseg));
-
-   cseg->opmod_idx_opcode = cpu_to_be32((*pc << 8) | MLX5_OPCODE_NOP);
-   cseg->qpn_ds   = cpu_to_be32((sqn << 8) | 0x01);
-
-   (*pc)++;
-
-   return wqe;
-}
-
-static inline
-void mlx5e_notify_hw(struct mlx5_wq_cyc *wq, u16 pc,
-void __iomem *uar_map,
-struct mlx5_wqe_ctrl_seg *ctrl)
-{
-   ctrl->fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE;
-   /* ensure wqe is visible to device before updating doorbell record */
-   dma_wmb();
-
-   *wq->db = cpu_to_be32(pc);
-
-   /* ensure doorbell record is visible to device before ringing the
-* doorbell
-*/
-   wmb();
-
-   mlx5_write64((__be32 *)ctrl, uar_map);
-}
-
-static inline void mlx5e_cq_arm(struct mlx5e_cq *cq)
-{
-   struct mlx5_core_cq *mcq;
-
-   mcq = &cq->mcq;
-   mlx5_cq_arm(mcq, MLX5_CQ_DB_REQ_NOT, mcq->uar->map, cq->wq.cc);
-}
-
 extern const struct ethtool_ops mlx5e_ethtool_ops;
 #ifdef CONFIG_MLX5_CORE_EN_DCB
 extern const struct dcbnl_rtnl_ops mlx5e_dcbnl_ops;
diff --git

[PATCH net-next 07/12] net/mlx5e: Tx, Make SQ WQE fetch function type generic

2019-07-05 Thread Tariq Toukan

Change mlx5e_sq_fetch_wqe to be agnostic to the Work Queue
Element (WQE) type.
Before this patch, it was specific to struct mlx5e_tx_wqe.

In order to allow the change, the function now returns a
generic void pointer, and takes the WQE size to use for the
zeroing memset.

Signed-off-by: Tariq Toukan 
Signed-off-by: Eran Ben Elisha 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h   | 12 +++-
 drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.c |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_tx.c |  4 ++--
 3 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h
index bd41f89afef1..1280f4163b53 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h
@@ -14,15 +14,17 @@
return (mlx5_wq_cyc_ctr2ix(wq, cc - pc) >= n) || (cc == pc);
 }
 
-static inline void mlx5e_sq_fetch_wqe(struct mlx5e_txqsq *sq,
- struct mlx5e_tx_wqe **wqe,
- u16 *pi)
+static inline void *
+mlx5e_sq_fetch_wqe(struct mlx5e_txqsq *sq, size_t size, u16 *pi)
 {
struct mlx5_wq_cyc *wq = &sq->wq;
+   void *wqe;
 
*pi  = mlx5_wq_cyc_ctr2ix(wq, sq->pc);
-   *wqe = mlx5_wq_cyc_get_wqe(wq, *pi);
-   memset(*wqe, 0, sizeof(**wqe));
+   wqe = mlx5_wq_cyc_get_wqe(wq, *pi);
+   memset(wqe, 0, size);
+
+   return wqe;
 }
 
 static inline struct mlx5e_tx_wqe *
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.c
index 439bf5953885..7d191d98ac94 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.c
@@ -248,7 +248,7 @@ static void mlx5e_tls_complete_sync_skb(struct sk_buff *skb,
mlx5e_tls_complete_sync_skb(skb, nskb, tcp_seq, headln,
cpu_to_be64(info.rcd_sn));
mlx5e_sq_xmit(sq, nskb, *wqe, *pi, true);
-   mlx5e_sq_fetch_wqe(sq, wqe, pi);
+   *wqe = mlx5e_sq_fetch_wqe(sq, sizeof(**wqe), pi);
return skb;
 
 err_out:
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
index b1a163e66053..983ea6206a94 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
@@ -335,7 +335,7 @@ netdev_tx_t mlx5e_sq_xmit(struct mlx5e_txqsq *sq, struct 
sk_buff *skb,
struct mlx5_wqe_eth_seg cur_eth = wqe->eth;
 #endif
mlx5e_fill_sq_frag_edge(sq, wq, pi, contig_wqebbs_room);
-   mlx5e_sq_fetch_wqe(sq, &wqe, &pi);
+   wqe = mlx5e_sq_fetch_wqe(sq, sizeof(*wqe), &pi);
 #ifdef CONFIG_MLX5_EN_IPSEC
wqe->eth = cur_eth;
 #endif
@@ -397,7 +397,7 @@ netdev_tx_t mlx5e_xmit(struct sk_buff *skb, struct 
net_device *dev)
u16 pi;
 
sq = priv->txq2sq[skb_get_queue_mapping(skb)];
-   mlx5e_sq_fetch_wqe(sq, &wqe, &pi);
+   wqe = mlx5e_sq_fetch_wqe(sq, sizeof(*wqe), &pi);
 
/* might send skbs and update wqe and pi */
skb = mlx5e_accel_handle_tx(skb, sq, dev, &wqe, &pi);
-- 
1.8.3.1

[PATCH net-next 12/12] net/mlx5e: Add kTLS TX HW offload support

2019-07-05 Thread Tariq Toukan

Add support for transmit side kernel-TLS acceleration.
Offload the crypto encryption to HW.

Per TLS connection:
- Use a separate TIS to maintain the HW context.
- Use a separate encryption key.
- Maintain static and progress HW contexts by posting the proper
  WQEs at creation time, or upon resync.
- Use a special DUMP opcode to replay the previous frags and sync
  the HW context.

To make sure the SQ is able to serve an xmit request, increase
SQ stop room to cover:
- static params WQE,
- progress params WQE, and
- resync DUMP per frag.

Currently supports TLS 1.2 with a 128-bit key size.

Tested over SimX simulator.

Signed-off-by: Tariq Toukan 
Signed-off-by: Eran Ben Elisha 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |   3 +-
 drivers/net/ethernet/mellanox/mlx5/core/en.h   |   8 +-
 drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h  |   8 +-
 .../ethernet/mellanox/mlx5/core/en_accel/ktls.c|  93 +
 .../ethernet/mellanox/mlx5/core/en_accel/ktls.h|  97 +
 .../ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c | 459 +
 .../net/ethernet/mellanox/mlx5/core/en_accel/tls.c |   5 +
 .../net/ethernet/mellanox/mlx5/core/en_accel/tls.h |  11 +-
 .../mellanox/mlx5/core/en_accel/tls_rxtx.c |   5 +
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |   3 +
 drivers/net/ethernet/mellanox/mlx5/core/en_stats.c |  28 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_stats.h |  14 +
 drivers/net/ethernet/mellanox/mlx5/core/en_tx.c|  15 +
 drivers/net/ethernet/mellanox/mlx5/core/fw.c   |   6 +
 14 files changed, 748 insertions(+), 7 deletions(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.h
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile 
b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index 5a1ee9ec8659..57d2cc666fe3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -62,4 +62,5 @@ mlx5_core-$(CONFIG_MLX5_FPGA) += fpga/cmd.o fpga/core.o 
fpga/conn.o fpga/sdk.o
 mlx5_core-$(CONFIG_MLX5_EN_IPSEC) += en_accel/ipsec.o en_accel/ipsec_rxtx.o \
 en_accel/ipsec_stats.o
 
-mlx5_core-$(CONFIG_MLX5_EN_TLS) += en_accel/tls.o en_accel/tls_rxtx.o 
en_accel/tls_stats.o
+mlx5_core-$(CONFIG_MLX5_EN_TLS) += en_accel/tls.o en_accel/tls_rxtx.o 
en_accel/tls_stats.o \
+  en_accel/ktls.o en_accel/ktls_tx.o
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index d3d2733917ff..263558875f20 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -209,7 +209,10 @@ struct mlx5e_umr_wqe {
struct mlx5_wqe_ctrl_seg   ctrl;
struct mlx5_wqe_umr_ctrl_seg   uctrl;
struct mlx5_mkey_seg   mkc;
-   struct mlx5_mtt inline_mtts[0];
+   union {
+   struct mlx5_mtt inline_mtts[0];
+   u8 tls_static_params_ctx[0];
+   };
 };
 
 extern const char mlx5e_self_tests[][ETH_GSTRING_LEN];
@@ -333,6 +336,9 @@ struct mlx5e_tx_wqe_info {
u32 num_bytes;
u8  num_wqebbs;
u8  num_dma;
+#ifdef CONFIG_MLX5_EN_TLS
+   skb_frag_t *resync_dump_frag;
+#endif
 };
 
 enum mlx5e_dma_map_type {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h
index ef16f9e41cf4..ddfe19adb3d9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h
@@ -15,9 +15,15 @@
 #else
 /* TLS offload requires additional stop_room for:
  *  - a resync SKB.
+ * kTLS offload requires additional stop_room for:
+ * - static params WQE,
+ * - progress params WQE, and
+ * - resync DUMP per frag.
  */
 #define MLX5E_SQ_TLS_ROOM  \
-   (MLX5_SEND_WQE_MAX_WQEBBS)
+   (MLX5_SEND_WQE_MAX_WQEBBS + \
+MLX5E_KTLS_STATIC_WQEBBS + MLX5E_KTLS_PROGRESS_WQEBBS + \
+MAX_SKB_FRAGS * MLX5E_KTLS_MAX_DUMP_WQEBBS)
 #endif
 
 #define INL_HDR_START_SZ (sizeof(((struct mlx5_wqe_eth_seg 
*)NULL)->inline_hdr.start))
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.c
new file mode 100644
index 000000000000..d2ff74d52720
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.c
@@ -0,0 +1,93 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+// Copyright (c) 2019 Mellanox Technologies.
+
+#include "en.h"
+#include "en_accel/ktls.h"
+
+static int mlx5e_ktls_create_tis(struct mlx5_core_dev *mdev, u32 *tisn)
+{
+   u32 in[MLX5_ST_SZ_DW(create_tis_in)] = {};
+   void *tisc;
+
+   tisc = MLX5_ADDR_OF(create_tis_in, in

[PATCH net-next 00/12] mlx5 TLS TX HW offload support

2019-07-05 Thread Tariq Toukan

Hi Dave,

This series, from Eran and me, adds TLS TX HW offload support to
the mlx5 driver.

This offloads the kTLS encryption process from kernel to the 
Mellanox NIC, saving CPU cycles and improving utilization.

Upon a new TLS connection request, the driver is responsible for creating
a dedicated HW context and configuring it according to the crypto info,
so the HW can do the encryption itself.

When the HW context gets out of sync (e.g. due to packet retransmission),
the driver is responsible for the re-sync process.
This is done by posting special resync descriptors to the HW.

Feature is supported on Mellanox Connect-X 6DX, and newer.
Series was tested on SimX simulator.

Series generated against net-next commit [1], with Saeed's request pulled [2]:

[1] c4cde5804d51 Merge 
git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
[2] git://git.kernel.org/pub/scm/linux/kernel/git/saeed/linux.git 
tags/mlx5-updates-2019-07-04-v2

Changes from last pull request:
Fixed comments from Jakub:
Patch 4:
- Replace zero memset with a call to memzero_explicit().
Patch 11:
- Fix stats counters names.
- Drop TLS SKB with non-matching netdev.

Regards,
Tariq

Eran Ben Elisha (1):
  net/mlx5e: Tx, Don't implicitly assume SKB-less wqe has one WQEBB

Tariq Toukan (11):
  net/mlx5: Accel, Expose accel wrapper for IPsec FPGA function
  net/mlx5: Kconfig, Better organize compilation flags
  net/mlx5: Add crypto library to support create/destroy encryption key
  net/mlx5: Accel, Add core TLS support for the Connect-X family
  net/mlx5e: Move helper functions to a new txrx datapath header
  net/mlx5e: Tx, Enforce L4 inline copy when needed
  net/mlx5e: Tx, Make SQ WQE fetch function type generic
  net/mlx5e: Tx, Unconstify SQ stop room
  net/mlx5e: Re-work TIS creation functions
  net/mlx5e: Introduce a fenced NOP WQE posting function
  net/mlx5e: Add kTLS TX HW offload support

 drivers/net/ethernet/mellanox/mlx5/core/Kconfig|  52 ++-
 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |  10 +-
 .../net/ethernet/mellanox/mlx5/core/accel/ipsec.c  |   9 +
 .../net/ethernet/mellanox/mlx5/core/accel/ipsec.h  |   7 +-
 .../net/ethernet/mellanox/mlx5/core/accel/tls.c|  45 +-
 .../net/ethernet/mellanox/mlx5/core/accel/tls.h|  51 ++-
 drivers/net/ethernet/mellanox/mlx5/core/en.h   | 114 +
 drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h  | 208 ++
 drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h   |   1 +
 .../mellanox/mlx5/core/en_accel/en_accel.h |   1 +
 .../mellanox/mlx5/core/en_accel/ipsec_rxtx.h   |   1 +
 .../ethernet/mellanox/mlx5/core/en_accel/ktls.c|  93 +
 .../ethernet/mellanox/mlx5/core/en_accel/ktls.h|  97 +
 .../ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c | 459 +
 .../net/ethernet/mellanox/mlx5/core/en_accel/tls.c |   5 +
 .../net/ethernet/mellanox/mlx5/core/en_accel/tls.h |  11 +-
 .../mellanox/mlx5/core/en_accel/tls_rxtx.c |   7 +-
 .../mellanox/mlx5/core/en_accel/tls_rxtx.h |   1 +
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |  27 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_stats.c |  28 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_stats.h |  14 +
 drivers/net/ethernet/mellanox/mlx5/core/en_tx.c|  98 ++---
 .../net/ethernet/mellanox/mlx5/core/fpga/ipsec.h   |  75 
 drivers/net/ethernet/mellanox/mlx5/core/fw.c   |   6 +
 .../net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c  |  14 +-
 .../net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h  |   2 +
 .../ethernet/mellanox/mlx5/core/ipoib/ipoib_vlan.c |   2 +-
 .../net/ethernet/mellanox/mlx5/core/lib/crypto.c   |  72 
 drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h |   5 +
 drivers/net/ethernet/mellanox/mlx5/core/main.c |   2 +-
 include/linux/mlx5/accel.h |   2 +-
 31 files changed, 1232 insertions(+), 287 deletions(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.h
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/crypto.c

-- 
1.8.3.1

Re: [PATCH net] ipv4: Fix NULL pointer dereference in ipv4_neigh_lookup()

2019-07-05 Thread David Ahern

On 7/4/19 1:24 PM, David Miller wrote:
> From: Ido Schimmel 
> Date: Thu,  4 Jul 2019 19:26:38 +0300
> 
>> Both ip_neigh_gw4() and ip_neigh_gw6() can return either a valid pointer
>> or an error pointer, but the code currently checks that the pointer is
>> not NULL.
>  ...
>> @@ -447,7 +447,7 @@ static struct neighbour *ipv4_neigh_lookup(const struct 
>> dst_entry *dst,
>>  n = ip_neigh_gw4(dev, pkey);
>>  }
>>  
>> -if (n && !refcount_inc_not_zero(&n->refcnt))
>> +if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
>>  n = NULL;
>>  
>>  rcu_read_unlock_bh();
> 
> Don't the callers expect only non-error pointers?
> 
> All of this stuff is so confusing and fragile...
> 

The intention was to fold the lookup and neigh_create calls into a
single helper.

The lookup can return NULL if an entry does not exist; the create can
return an ERR_PTR (variety of reasons in ___neigh_create). So the end
result is that the new helper (lookup + create) can return a valid neigh
entry or an ERR_PTR.

When I converted ipv4_neigh_lookup and folded in the refcount bump, I
missed updating the above check to account for ERR_PTR.

Ido's patch looks correct to me. Thanks, Ido.

Reviewed-by: David Ahern

Re: [selftests/bpf] 6135bdd95f: kernel_selftests.bpf.test_offload.py.fail

2019-07-05 Thread Andrii Nakryiko

On Fri, Jul 5, 2019 at 12:43 AM kernel test robot  wrote:
>
> FYI, we noticed the following commit (built with gcc-7):
>
> commit: 6135bdd95f26fe417db4e46d1e517de41e0ab9c1 ("[PATCH v2 bpf-next 4/4] 
> selftests/bpf: convert legacy BPF maps to BTF-defined ones")
> url: 
> https://github.com/0day-ci/linux/commits/Andrii-Nakryiko/capture-integers-in-BTF-type-info-for-map-defs/20190701-041153
> base: 
> https://kernel.googlesource.com/pub/scm/linux/kernel/git/bpf/bpf-next.git 
> master
>
> in testcase: kernel_selftests
> with following parameters:
>
> group: kselftests-00
>
> test-description: The kernel contains a set of "self tests" under the 
> tools/testing/selftests/ directory. These are intended to be small unit tests 
> to exercise individual code paths in the kernel.
> test-url: https://www.kernel.org/doc/Documentation/kselftest.txt
>
>
> on test machine: qemu-system-x86_64 -enable-kvm -cpu SandyBridge -smp 2 -m 8G
>
> caused below changes (please refer to attached dmesg/kmsg for entire 
> log/backtrace):
>
>
> If you fix the issue, kindly add following tag
> Reported-by: kernel test robot 
>
>
> # selftests: bpf: test_offload.py
> # Test destruction of generic XDP...
> # Test TC non-offloaded...
> # Test TC non-offloaded isn't getting bound...
> # Test TC offloads are off by default...
> # Test TC offload by default...
> # Test TC cBPF bytcode tries offload by default...
> # Test TC cBPF unbound bytecode doesn't offload...
> # Test non-0 chain offload...
> # Test TC replace...
> # Test TC replace bad flags...
> # Test spurious extack from the driver...
> # Test TC offloads work...
> # Test TC offload basics...
> # Test TC offload is device-bound...
> # Test disabling TC offloads is rejected while filters installed...
> # Test qdisc removal frees things...
> # Test disabling TC offloads is OK without filters...
> # Test destroying device gets rid of TC filters...
> # Test destroying device gets rid of XDP...
> # Test XDP prog reporting...
> # Test XDP prog replace without force...
> # Test XDP prog replace with force...
> # Test XDP prog replace with bad flags...
> # Test XDP prog remove with bad flags...
> # Test MTU restrictions...
> # Test non-offload XDP attaching to HW...
> # Test offload XDP attaching to drv...
> # Test XDP offload...
> # Test XDP offload is device bound...
> # Test removing XDP program many times...
> # Test attempt to use a program for a wrong device...
> # Test multi-attachment XDP - default + offload...
> # Test multi-attachment XDP - replace...
> # Test multi-attachment XDP - detach...
> # Test multi-attachment XDP - reattach...
> # Test multi-attachment XDP - device remove...
> # Test multi-attachment XDP - drv + offload...
> # Test multi-attachment XDP - replace...
> # Test multi-attachment XDP - detach...
> # Test multi-attachment XDP - reattach...
> # Test multi-attachment XDP - device remove...
> # Test multi-attachment XDP - generic + offload...
> # Test multi-attachment XDP - replace...
> # Test multi-attachment XDP - reattach...
> # Test multi-attachment XDP - device remove...
> # Test mixing of TC and XDP...
> # Test binding TC from pinned...
> # Test binding XDP from pinned...
> # Test offload of wrong type fails...
> # Test asking for TC offload of two filters...
> # Test if netdev removal waits for translation...
> # Test loading program with maps...
> # Traceback (most recent call last):
> #   File "./test_offload.py", line 1153, in 
> # sim.set_xdp(map_obj, "offload", JSON=False) # map fixup msg breaks JSON
> #   File "./test_offload.py", line 469, in set_xdp
> # fail=fail, include_stderr=include_stderr)
> #   File "./test_offload.py", line 230, in ip
> # fail=fail, include_stderr=include_stderr)
> #   File "./test_offload.py", line 155, in tool
> # fail=fail, include_stderr=False)
> #   File "./test_offload.py", line 108, in cmd
> # return cmd_result(proc, include_stderr=include_stderr, fail=fail)
> #   File "./test_offload.py", line 130, in cmd_result
> # raise Exception("Command failed: %s\n%s" % (proc.args, stderr))
> # Exception: Command failed: ip link set dev eth1 xdpoffload obj 
> /usr/src/perf_selftests-x86_64-rhel-7.6-6135bdd95f26fe417db4e46d1e517de41e0ab9c1/tools/testing/selftests/bpf/sample_map_ret0.o
>  sec .text

We can't yet convert BPF programs that are loaded with iproute2 to new
BTF-defined maps, until iproute2 uses libbpf as a loader. I missed that
sample_map_ret0.c is used with iproute2, will undo conversion for it.

Thanks!

> #
> #
> # BTF debug data section '.BTF' rejected: Invalid argument (22)!
> #  - Length:   811
> # Verifier analysis:
> #
> # magic: 0xeb9f
> # version: 1
> # flags: 0x0
> # hdr_len: 24
> # type_off: 0
> # type_len: 384
> # str_off: 384
> # str_len: 403
> # btf_total_size: 811
> # [1] FUNC_PROTO (anon) return=2 args=(void)
> # [2] INT int size=4 bits_offset=0 nr_bits=32 encoding=SIGNED
> # [3] FUNC func type_id=1
> # [4] STRUCT (anon) size=32 vlen=4
> #   type type_id=5 bits_offse

Re: [PATCH v3 1/2] Documentation: net: dsa: Describe DSA switch configuration

2019-07-05 Thread Florian Fainelli




On 7/5/2019 2:57 AM, Benedikt Spranger wrote:
> Document DSA tagged and VLAN based switch configuration by showcases.
> 
> Signed-off-by: Benedikt Spranger 
> Reviewed-by: Andrew Lunn 

Reviewed-by: Florian Fainelli 
-- 
Florian

Re: [PATCH v3 2/2] Documentation: net: dsa: b53: Describe b53 configuration

2019-07-05 Thread Florian Fainelli




On 7/5/2019 2:57 AM, Benedikt Spranger wrote:
> Document the different needs of documentation for the b53 driver.
> 
> Signed-off-by: Benedikt Spranger 

Reviewed-by: Florian Fainelli 
-- 
Florian

Re: [PATCH bpf-next] tools: bpftool: add "prog run" subcommand to test-run programs

2019-07-05 Thread Y Song

On Fri, Jul 5, 2019 at 1:21 AM Quentin Monnet
 wrote:
>
> 2019-07-04 22:49 UTC-0700 ~ Y Song 
> > On Thu, Jul 4, 2019 at 1:58 AM Quentin Monnet
> >  wrote:
> >>
> >> Add a new "bpftool prog run" subcommand to run a loaded program on input
> >> data (and possibly with input context) passed by the user.
> >>
> >> Print output data (and output context if relevant) into a file or into
> >> the console. Print return value and duration for the test run into the
> >> console.
> >>
> >> A "repeat" argument can be passed to run the program several times in a
> >> row.
> >>
> >> The command does not perform any kind of verification based on program
> >> type (Is this program type allowed to use an input context?) or on data
> >> consistency (Can I work with empty input data?), this is left to the
> >> kernel.
> >>
> >> Example invocation:
> >>
> >> # perl -e 'print "\x0" x 14' | ./bpftool prog run \
> >> pinned /sys/fs/bpf/sample_ret0 \
> >> data_in - data_out - repeat 5
> >> 000         |  ..
> >> Return value: 0, duration (average): 260ns
> >>
> >> When one of data_in or ctx_in is "-", bpftool reads from standard input,
> >> in binary format. Other formats (JSON, hexdump) might be supported (via
> >> an optional command line keyword like "data_fmt_in") in the future if
> >> relevant, but this would require doing more parsing in bpftool.
> >>
> >> Signed-off-by: Quentin Monnet 
> >> Reviewed-by: Jakub Kicinski 
> >> ---
>
> [...]
>
> >> diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
> >> index 9b0db5d14e31..8dcbaa0a8ab1 100644
> >> --- a/tools/bpf/bpftool/prog.c
> >> +++ b/tools/bpf/bpftool/prog.c
> >> @@ -15,6 +15,7 @@
> >>  #include 
> >>
> >>  #include 
> >> +#include 
> >>
> >>  #include 
> >>  #include 
> >> @@ -748,6 +749,344 @@ static int do_detach(int argc, char **argv)
> >> return 0;
> >>  }
> >>
> >> +static int check_single_stdin(char *file_in, char *other_file_in)
> >> +{
> >> +   if (file_in && other_file_in &&
> >> +   !strcmp(file_in, "-") && !strcmp(other_file_in, "-")) {
> >> +   p_err("cannot use standard input for both data_in and 
> >> ctx_in");
> >
> > The error message says data_in and ctx_in.
> > Maybe the input parameter should be file_data_in and file_ctx_in?
>
>
> Hi Yonghong,
>
> It's true those parameters should be file names. But having
> "file_data_in", "file_data_out", "file_ctx_in" and "file_ctx_out" on a
> command line seems a bit heavy to me? (And relying on keyword prefixing
> for typing the command won't help much.)
>
> My opinion is that it should be clear from the man page or the "help"
> command that the parameters are file names. What do you think? I can
> prefix all four arguments with "file_" if you believe this is better.

I think you misunderstood my question above. The command line
parameters are fine.
I am talking about the function parameter names. Since in the error message,
the input parameters are referred to as data_in and ctx_in
   p_err("cannot use standard input for both data_in and ctx_in")
maybe the function signature should be
  static int check_single_stdin(char *file_data_in, char *file_ctx_in)

If you are worried that later on the same function can be used in different
contexts, then alternatively, you can have signature like
  static int check_single_stdin(char *file_in, char *other_file_in,
const char *file_in_arg, const char *other_file_in_arg)
where file_in_arg will be passed in as "data_in" and other_file_in_arg
as "ctx_in".
I think we could delay this until it is really needed.

>
> [...]
>
> >> +static int do_run(int argc, char **argv)
> >> +{
> >> +   char *data_fname_in = NULL, *data_fname_out = NULL;
> >> +   char *ctx_fname_in = NULL, *ctx_fname_out = NULL;
> >> +   struct bpf_prog_test_run_attr test_attr = {0};
> >> +   const unsigned int default_size = SZ_32K;
> >> +   void *data_in = NULL, *data_out = NULL;
> >> +   void *ctx_in = NULL, *ctx_out = NULL;
> >> +   unsigned int repeat = 1;
> >> +   int fd, err;
> >> +
> >> +   if (!REQ_ARGS(4))
> >> +   return -1;
> >> +
> >> +   fd = prog_parse_fd(&argc, &argv);
> >> +   if (fd < 0)
> >> +   return -1;
> >> +
> >> +   while (argc) {
> >> +   if (detect_common_prefix(*argv, "data_in", "data_out",
> >> +"data_size_out", NULL))
> >> +   return -1;
> >> +   if (detect_common_prefix(*argv, "ctx_in", "ctx_out",
> >> +"ctx_size_out", NULL))
> >> +   return -1;
> >> +
> >> +   if (is_prefix(*argv, "data_in")) {
> >> +   NEXT_ARG();
> >> +   if (!REQ_ARGS(1))
> >> +   return -1;
> >> +
> >> +   data_fname_in = GET_ARG();
> >> +   if (check_sing

Re: [PATCH rdma-next v5 00/17] Statistics counter support

2019-07-05 Thread Jason Gunthorpe

On Tue, Jul 02, 2019 at 01:02:29PM +0300, Leon Romanovsky wrote:
> From: Leon Romanovsky 
> 
> Changelog:
>  v4 -> v5:
>  * Patch #6 and #14 - consolidated many counter release functions,
>removed mutex lock protection from dealloc_counter() call
>and simplified kref_put/kref_get operations.
>  * Added Saeed's ACK tags.
>  v3 -> v4:
>  * Add counter_dealloc() callback function
>  * Moved to kref implementation
>  * Fixed lock during spinlock
>  v2 -> v3:
>  * We didn't change use of atomics over kref for management of unbind
>counter from QP. The reason to it that bind and unbind are non-symmetric
>in regards of put and get, so we need to count differently memory
>release flows of HW objects (restrack) and SW bind operations.
>  * Everything else was addressed.
>  v1 -> v2:
>  * Rebased to latest rdma-next
>  v0 -> v1:
>  * Changed wording of counter comment
>  * Removed unneeded assignments
>  * Added extra patch to present global counters
> 
> 
> Hi,
> 
> This series from Mark provides dynamic statistics infrastructure.
> He uses netlink interface to configure and retrieve those counters.
> 
> This infrastructure allows users to monitor various objects by binding
> counters to them. To begin with, we used the QP object as the target for
> those counters, but future patches will include ODP MR information too.
> 
> Two binding modes are supported:
>  - Auto: This allows a user to build automatic set of objects to a counter
>according to common criteria. For example in a per-type scheme, where in
>one process all QPs with same QP type are bound automatically to a single
>counter.
>  - Manual: This allows a user to manually bind objects on a counter.
> 
> Those two modes are mutually exclusive, with separation between processes:
> objects created by different processes cannot be bound to the same counter.
> 
> For objects which don't support counter binding, we will return
> pre-allocated counters.
> 
> $ rdma statistic qp set link mlx5_2/1 auto type on
> $ rdma statistic qp set link mlx5_2/1 auto off
> $ rdma statistic qp bind link mlx5_2/1 lqpn 178
> $ rdma statistic qp unbind link mlx5_2/1 cntn 4 lqpn 178
> $ rdma statistic show
> $ rdma statistic qp mode
> 
> Thanks
> 
> 
> Mark Zhang (17):
>   net/mlx5: Add rts2rts_qp_counters_set_id field in hca cap
>   RDMA/restrack: Introduce statistic counter
>   RDMA/restrack: Add an API to attach a task to a resource
>   RDMA/restrack: Make is_visible_in_pid_ns() as an API
>   RDMA/counter: Add set/clear per-port auto mode support
>   RDMA/counter: Add "auto" configuration mode support
>   IB/mlx5: Support set qp counter
>   IB/mlx5: Add counter set id as a parameter for
> mlx5_ib_query_q_counters()
>   IB/mlx5: Support statistic q counter configuration
>   RDMA/nldev: Allow counter auto mode configration through RDMA netlink
>   RDMA/netlink: Implement counter dumpit calback
>   IB/mlx5: Add counter_alloc_stats() and counter_update_stats() support
>   RDMA/core: Get sum value of all counters when perform a sysfs stat
> read
>   RDMA/counter: Allow manual mode configuration support
>   RDMA/nldev: Allow counter manual mode configration through RDMA
> netlink
>   RDMA/nldev: Allow get counter mode through RDMA netlink
>   RDMA/nldev: Allow get default counter statistics through RDMA netlink

Okay, applied to for-next

Thanks,
Jason

[PATCH v5 bpf-next 1/4] libbpf: capture value in BTF type info for BTF-defined map defs

2019-07-05 Thread Andrii Nakryiko

Change BTF-defined map definitions to capture compile-time integer
values as part of BTF type definition, to avoid split of key/value type
information and actual type/size/flags initialization for maps.

Signed-off-by: Andrii Nakryiko 
Acked-by: Song Liu 
Acked-by: Yonghong Song 
---
 tools/lib/bpf/libbpf.c | 58 --
 1 file changed, 28 insertions(+), 30 deletions(-)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 4907997289e9..fad8901ee774 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -1028,40 +1028,40 @@ static const struct btf_type 
*skip_mods_and_typedefs(const struct btf *btf,
}
 }
 
-static bool get_map_field_int(const char *map_name,
- const struct btf *btf,
+/*
+ * Fetch integer attribute of BTF map definition. Such attributes are
+ * represented using a pointer to an array, in which dimensionality of array
+ * encodes specified integer value. E.g., int (*type)[BPF_MAP_TYPE_ARRAY];
+ * encodes `type => BPF_MAP_TYPE_ARRAY` key/value pair completely using BTF
+ * type definition, while using only sizeof(void *) space in ELF data section.
+ */
+static bool get_map_field_int(const char *map_name, const struct btf *btf,
  const struct btf_type *def,
- const struct btf_member *m,
- const void *data, __u32 *res) {
+ const struct btf_member *m, __u32 *res) {
const struct btf_type *t = skip_mods_and_typedefs(btf, m->type);
const char *name = btf__name_by_offset(btf, m->name_off);
-   __u32 int_info = *(const __u32 *)(const void *)(t + 1);
+   const struct btf_array *arr_info;
+   const struct btf_type *arr_t;
 
-   if (BTF_INFO_KIND(t->info) != BTF_KIND_INT) {
-   pr_warning("map '%s': attr '%s': expected INT, got %u.\n",
+   if (BTF_INFO_KIND(t->info) != BTF_KIND_PTR) {
+   pr_warning("map '%s': attr '%s': expected PTR, got %u.\n",
   map_name, name, BTF_INFO_KIND(t->info));
return false;
}
-   if (t->size != 4 || BTF_INT_BITS(int_info) != 32 ||
-   BTF_INT_OFFSET(int_info)) {
-   pr_warning("map '%s': attr '%s': expected 32-bit non-bitfield 
integer, "
-  "got %u-byte (%d-bit) one with bit offset %d.\n",
-  map_name, name, t->size, BTF_INT_BITS(int_info),
-  BTF_INT_OFFSET(int_info));
-   return false;
-   }
-   if (BTF_INFO_KFLAG(def->info) && BTF_MEMBER_BITFIELD_SIZE(m->offset)) {
-   pr_warning("map '%s': attr '%s': bitfield is not supported.\n",
-  map_name, name);
+
+   arr_t = btf__type_by_id(btf, t->type);
+   if (!arr_t) {
+   pr_warning("map '%s': attr '%s': type [%u] not found.\n",
+  map_name, name, t->type);
return false;
}
-   if (m->offset % 32) {
-   pr_warning("map '%s': attr '%s': unaligned fields are not 
supported.\n",
-  map_name, name);
+   if (BTF_INFO_KIND(arr_t->info) != BTF_KIND_ARRAY) {
+   pr_warning("map '%s': attr '%s': expected ARRAY, got %u.\n",
+  map_name, name, BTF_INFO_KIND(arr_t->info));
return false;
}
-
-   *res = *(const __u32 *)(data + m->offset / 8);
+   arr_info = (const void *)(arr_t + 1);
+   *res = arr_info->nelems;
return true;
 }
 
@@ -1074,7 +1074,6 @@ static int bpf_object__init_user_btf_map(struct 
bpf_object *obj,
const struct btf_var_secinfo *vi;
const struct btf_var *var_extra;
const struct btf_member *m;
-   const void *def_data;
const char *map_name;
struct bpf_map *map;
int vlen, i;
@@ -1131,7 +1130,6 @@ static int bpf_object__init_user_btf_map(struct 
bpf_object *obj,
pr_debug("map '%s': at sec_idx %d, offset %zu.\n",
 map_name, map->sec_idx, map->sec_offset);
 
-   def_data = data->d_buf + vi->offset;
vlen = BTF_INFO_VLEN(def->info);
m = (const void *)(def + 1);
for (i = 0; i < vlen; i++, m++) {
@@ -1144,19 +1142,19 @@ static int bpf_object__init_user_btf_map(struct 
bpf_object *obj,
}
if (strcmp(name, "type") == 0) {
if (!get_map_field_int(map_name, obj->btf, def, m,
-  def_data, &map->def.type))
+  &map->def.type))
return -EINVAL;
pr_debug("map '%s': found type = %u.\n",
 map_name, map->def.type);
} else if (strcmp(name, "max_entries") == 0) {
if (!get_map_field_int(map_name, obj->btf, def, m,
-

[PATCH v5 bpf-next 0/4] capture integers in BTF type info for map defs

2019-07-05 Thread Andrii Nakryiko

This patch set implements an update to how BTF-defined maps are specified. The
change is in how integer attributes, e.g., type, max_entries, map_flags, are
specified: now they are captured as part of map definition struct's BTF type
information (using array dimension), eliminating the need for compile-time
data initialization and keeping all the metadata in one place.

All existing selftests that were using BTF-defined maps are updated, along
with some other selftests, that were switched to new syntax.

v4->v5:
- revert sample_map_ret0.c, which is loaded with iproute2 (kernel test robot);
v3->v4:
- add acks;
- fix int -> uint type in commit message;
v2->v3:
- rename __int into __uint (Yonghong);
v1->v2:
- split bpf_helpers.h change from libbpf change (Song).

Andrii Nakryiko (4):
  libbpf: capture value in BTF type info for BTF-defined map defs
  selftests/bpf: add __uint and __type macro for BTF-defined maps
  selftests/bpf: convert selftests using BTF-defined maps to new syntax
  selftests/bpf: convert legacy BPF maps to BTF-defined ones

 tools/lib/bpf/libbpf.c|  58 +
 tools/testing/selftests/bpf/bpf_helpers.h |   3 +
 tools/testing/selftests/bpf/progs/bpf_flow.c  |  28 ++---
 .../selftests/bpf/progs/get_cgroup_id_kern.c  |  26 ++---
 .../testing/selftests/bpf/progs/netcnt_prog.c |  20 ++--
 tools/testing/selftests/bpf/progs/pyperf.h|  90 +++---
 .../selftests/bpf/progs/socket_cookie_prog.c  |  13 +--
 .../bpf/progs/sockmap_verdict_prog.c  |  48 
 .../testing/selftests/bpf/progs/strobemeta.h  |  68 +--
 .../selftests/bpf/progs/test_btf_newkv.c  |  13 +--
 .../bpf/progs/test_get_stack_rawtp.c  |  39 +++
 .../selftests/bpf/progs/test_global_data.c|  37 +++---
 tools/testing/selftests/bpf/progs/test_l4lb.c |  65 ---
 .../selftests/bpf/progs/test_l4lb_noinline.c  |  65 ---
 .../selftests/bpf/progs/test_map_in_map.c |  30 ++---
 .../selftests/bpf/progs/test_map_lock.c   |  26 ++---
 .../testing/selftests/bpf/progs/test_obj_id.c |  12 +-
 .../bpf/progs/test_select_reuseport_kern.c|  67 ---
 .../bpf/progs/test_send_signal_kern.c |  26 ++---
 .../bpf/progs/test_sock_fields_kern.c |  78 +
 .../selftests/bpf/progs/test_spin_lock.c  |  36 +++---
 .../bpf/progs/test_stacktrace_build_id.c  |  55 -
 .../selftests/bpf/progs/test_stacktrace_map.c |  52 +++--
 .../selftests/bpf/progs/test_tcp_estats.c |  13 +--
 .../selftests/bpf/progs/test_tcpbpf_kern.c|  26 ++---
 .../selftests/bpf/progs/test_tcpnotify_kern.c |  28 ++---
 tools/testing/selftests/bpf/progs/test_xdp.c  |  26 ++---
 .../selftests/bpf/progs/test_xdp_loop.c   |  26 ++---
 .../selftests/bpf/progs/test_xdp_noinline.c   |  81 +
 .../selftests/bpf/progs/xdp_redirect_map.c|  12 +-
 .../testing/selftests/bpf/progs/xdping_kern.c |  12 +-
 .../selftests/bpf/test_queue_stack_map.h  |  30 ++---
 .../testing/selftests/bpf/test_sockmap_kern.h | 110 +-
 33 files changed, 559 insertions(+), 760 deletions(-)

-- 
2.17.1

[PATCH v5 bpf-next 2/4] selftests/bpf: add uint and type macro for BTF-defined maps

2019-07-05 Thread Andrii Nakryiko

Add simple __uint and __type macros that hide details of how type and
integer values are captured in BTF-defined maps.

Signed-off-by: Andrii Nakryiko 
Acked-by: Song Liu 
Acked-by: Yonghong Song 
---
 tools/testing/selftests/bpf/bpf_helpers.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tools/testing/selftests/bpf/bpf_helpers.h 
b/tools/testing/selftests/bpf/bpf_helpers.h
index 1a5b1accf091..5a3d92c8bec8 100644
--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@@ -8,6 +8,9 @@
  */
 #define SEC(NAME) __attribute__((section(NAME), used))
 
+#define __uint(name, val) int (*name)[val]
+#define __type(name, val) val *name
+
 /* helper macro to print out debug messages */
 #define bpf_printk(fmt, ...)   \
 ({ \
-- 
2.17.1

[PATCH v5 bpf-next 4/4] selftests/bpf: convert legacy BPF maps to BTF-defined ones

2019-07-05 Thread Andrii Nakryiko

Convert selftests that were originally left out and new ones added
recently to consistently use BTF-defined maps.

Reported-by: kernel test robot 
Signed-off-by: Andrii Nakryiko 
Acked-by: Song Liu 
Acked-by: Yonghong Song 
---
 .../selftests/bpf/progs/get_cgroup_id_kern.c  |  26 ++---
 tools/testing/selftests/bpf/progs/pyperf.h|  90 +++---
 .../bpf/progs/sockmap_verdict_prog.c  |  48 
 .../testing/selftests/bpf/progs/strobemeta.h  |  68 +--
 .../selftests/bpf/progs/test_map_in_map.c |  30 ++---
 .../testing/selftests/bpf/progs/test_obj_id.c |  12 +-
 .../selftests/bpf/progs/test_xdp_loop.c   |  26 ++---
 .../selftests/bpf/progs/xdp_redirect_map.c|  12 +-
 .../testing/selftests/bpf/progs/xdping_kern.c |  12 +-
 .../selftests/bpf/test_queue_stack_map.h  |  30 ++---
 .../testing/selftests/bpf/test_sockmap_kern.h | 110 +-
 11 files changed, 228 insertions(+), 236 deletions(-)

diff --git a/tools/testing/selftests/bpf/progs/get_cgroup_id_kern.c 
b/tools/testing/selftests/bpf/progs/get_cgroup_id_kern.c
index 014dba10b8a5..16c54ade6888 100644
--- a/tools/testing/selftests/bpf/progs/get_cgroup_id_kern.c
+++ b/tools/testing/selftests/bpf/progs/get_cgroup_id_kern.c
@@ -4,19 +4,19 @@
 #include 
 #include "bpf_helpers.h"
 
-struct bpf_map_def SEC("maps") cg_ids = {
-   .type = BPF_MAP_TYPE_ARRAY,
-   .key_size = sizeof(__u32),
-   .value_size = sizeof(__u64),
-   .max_entries = 1,
-};
-
-struct bpf_map_def SEC("maps") pidmap = {
-   .type = BPF_MAP_TYPE_ARRAY,
-   .key_size = sizeof(__u32),
-   .value_size = sizeof(__u32),
-   .max_entries = 1,
-};
+struct {
+   __uint(type, BPF_MAP_TYPE_ARRAY);
+   __uint(max_entries, 1);
+   __type(key, __u32);
+   __type(value, __u64);
+} cg_ids SEC(".maps");
+
+struct {
+   __uint(type, BPF_MAP_TYPE_ARRAY);
+   __uint(max_entries, 1);
+   __type(key, __u32);
+   __type(value, __u32);
+} pidmap SEC(".maps");
 
 SEC("tracepoint/syscalls/sys_enter_nanosleep")
 int trace(void *ctx)
diff --git a/tools/testing/selftests/bpf/progs/pyperf.h 
b/tools/testing/selftests/bpf/progs/pyperf.h
index abf6224649be..003fe106fc70 100644
--- a/tools/testing/selftests/bpf/progs/pyperf.h
+++ b/tools/testing/selftests/bpf/progs/pyperf.h
@@ -58,14 +58,6 @@ typedef struct {
 } Event;
 
 
-struct bpf_elf_map {
-   __u32 type;
-   __u32 size_key;
-   __u32 size_value;
-   __u32 max_elem;
-   __u32 flags;
-};
-
 typedef int pid_t;
 
 typedef struct {
@@ -118,47 +110,47 @@ static __always_inline bool get_frame_data(void 
*frame_ptr, PidData *pidData,
return true;
 }
 
-struct bpf_elf_map SEC("maps") pidmap = {
-   .type = BPF_MAP_TYPE_HASH,
-   .size_key = sizeof(int),
-   .size_value = sizeof(PidData),
-   .max_elem = 1,
-};
-
-struct bpf_elf_map SEC("maps") eventmap = {
-   .type = BPF_MAP_TYPE_HASH,
-   .size_key = sizeof(int),
-   .size_value = sizeof(Event),
-   .max_elem = 1,
-};
-
-struct bpf_elf_map SEC("maps") symbolmap = {
-   .type = BPF_MAP_TYPE_HASH,
-   .size_key = sizeof(Symbol),
-   .size_value = sizeof(int),
-   .max_elem = 1,
-};
-
-struct bpf_elf_map SEC("maps") statsmap = {
-   .type = BPF_MAP_TYPE_ARRAY,
-   .size_key = sizeof(Stats),
-   .size_value = sizeof(int),
-   .max_elem = 1,
-};
-
-struct bpf_elf_map SEC("maps") perfmap = {
-   .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
-   .size_key = sizeof(int),
-   .size_value = sizeof(int),
-   .max_elem = 32,
-};
-
-struct bpf_elf_map SEC("maps") stackmap = {
-   .type = BPF_MAP_TYPE_STACK_TRACE,
-   .size_key = sizeof(int),
-   .size_value = sizeof(long long) * 127,
-   .max_elem = 1000,
-};
+struct {
+   __uint(type, BPF_MAP_TYPE_HASH);
+   __uint(max_entries, 1);
+   __type(key, int);
+   __type(value, PidData);
+} pidmap SEC(".maps");
+
+struct {
+   __uint(type, BPF_MAP_TYPE_HASH);
+   __uint(max_entries, 1);
+   __type(key, int);
+   __type(value, Event);
+} eventmap SEC(".maps");
+
+struct {
+   __uint(type, BPF_MAP_TYPE_HASH);
+   __uint(max_entries, 1);
+   __type(key, Symbol);
+   __type(value, int);
+} symbolmap SEC(".maps");
+
+struct {
+   __uint(type, BPF_MAP_TYPE_ARRAY);
+   __uint(max_entries, 1);
+   __type(key, int);
+   __type(value, Stats);
+} statsmap SEC(".maps");
+
+struct {
+   __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
+   __uint(max_entries, 32);
+   __uint(key_size, sizeof(int));
+   __uint(value_size, sizeof(int));
+} perfmap SEC(".maps");
+
+struct {
+   __uint(type, BPF_MAP_TYPE_STACK_TRACE);
+   __uint(max_entries, 1000);
+   __uint(key_size, sizeof(int));
+   __uint(value_size, sizeof(long long) * 127);
+} stackmap SEC(".maps");
 
 static __always_inline int __on_event(struct pt_regs *ctx)
 {
diff --git a/tools/testing/selftests/bpf/progs/sockmap_verdict_prog.c 
b/t

[PATCH v5 bpf-next 3/4] selftests/bpf: convert selftests using BTF-defined maps to new syntax

2019-07-05 Thread Andrii Nakryiko

Convert all the existing selftests that are already using BTF-defined
maps to use new syntax (with no static data initialization).

Signed-off-by: Andrii Nakryiko 
Acked-by: Song Liu 
Acked-by: Yonghong Song 
---
 tools/testing/selftests/bpf/progs/bpf_flow.c  | 28 +++
 .../testing/selftests/bpf/progs/netcnt_prog.c | 20 ++---
 .../selftests/bpf/progs/socket_cookie_prog.c  | 13 ++-
 .../selftests/bpf/progs/test_btf_newkv.c  | 13 ++-
 .../bpf/progs/test_get_stack_rawtp.c  | 39 -
 .../selftests/bpf/progs/test_global_data.c| 37 -
 tools/testing/selftests/bpf/progs/test_l4lb.c | 65 ++-
 .../selftests/bpf/progs/test_l4lb_noinline.c  | 65 ++-
 .../selftests/bpf/progs/test_map_lock.c   | 26 +++---
 .../bpf/progs/test_select_reuseport_kern.c| 67 ++-
 .../bpf/progs/test_send_signal_kern.c | 26 +++---
 .../bpf/progs/test_sock_fields_kern.c | 78 +++---
 .../selftests/bpf/progs/test_spin_lock.c  | 36 -
 .../bpf/progs/test_stacktrace_build_id.c  | 55 +
 .../selftests/bpf/progs/test_stacktrace_map.c | 52 +---
 .../selftests/bpf/progs/test_tcp_estats.c | 13 ++-
 .../selftests/bpf/progs/test_tcpbpf_kern.c| 26 +++---
 .../selftests/bpf/progs/test_tcpnotify_kern.c | 28 +++
 tools/testing/selftests/bpf/progs/test_xdp.c  | 26 +++---
 .../selftests/bpf/progs/test_xdp_noinline.c   | 81 +++
 20 files changed, 300 insertions(+), 494 deletions(-)

diff --git a/tools/testing/selftests/bpf/progs/bpf_flow.c 
b/tools/testing/selftests/bpf/progs/bpf_flow.c
index 849f42e548b5..5ae485a6af3f 100644
--- a/tools/testing/selftests/bpf/progs/bpf_flow.c
+++ b/tools/testing/selftests/bpf/progs/bpf_flow.c
@@ -58,26 +58,18 @@ struct frag_hdr {
 };
 
 struct {
-   __u32 type;
-   __u32 max_entries;
-   __u32 key_size;
-   __u32 value_size;
-} jmp_table SEC(".maps") = {
-   .type = BPF_MAP_TYPE_PROG_ARRAY,
-   .max_entries = 8,
-   .key_size = sizeof(__u32),
-   .value_size = sizeof(__u32),
-};
+   __uint(type, BPF_MAP_TYPE_PROG_ARRAY);
+   __uint(max_entries, 8);
+   __uint(key_size, sizeof(__u32));
+   __uint(value_size, sizeof(__u32));
+} jmp_table SEC(".maps");
 
 struct {
-   __u32 type;
-   __u32 max_entries;
-   __u32 *key;
-   struct bpf_flow_keys *value;
-} last_dissection SEC(".maps") = {
-   .type = BPF_MAP_TYPE_ARRAY,
-   .max_entries = 1,
-};
+   __uint(type, BPF_MAP_TYPE_ARRAY);
+   __uint(max_entries, 1);
+   __type(key, __u32);
+   __type(value, struct bpf_flow_keys);
+} last_dissection SEC(".maps");
 
 static __always_inline int export_flow_keys(struct bpf_flow_keys *keys,
int ret)
diff --git a/tools/testing/selftests/bpf/progs/netcnt_prog.c 
b/tools/testing/selftests/bpf/progs/netcnt_prog.c
index a25c82a5b7c8..38a997852cad 100644
--- a/tools/testing/selftests/bpf/progs/netcnt_prog.c
+++ b/tools/testing/selftests/bpf/progs/netcnt_prog.c
@@ -11,20 +11,16 @@
 #define NS_PER_SEC 1000000000
 
 struct {
-   __u32 type;
-   struct bpf_cgroup_storage_key *key;
-   struct percpu_net_cnt *value;
-} percpu_netcnt SEC(".maps") = {
-   .type = BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE,
-};
+   __uint(type, BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE);
+   __type(key, struct bpf_cgroup_storage_key);
+   __type(value, struct percpu_net_cnt);
+} percpu_netcnt SEC(".maps");
 
 struct {
-   __u32 type;
-   struct bpf_cgroup_storage_key *key;
-   struct net_cnt *value;
-} netcnt SEC(".maps") = {
-   .type = BPF_MAP_TYPE_CGROUP_STORAGE,
-};
+   __uint(type, BPF_MAP_TYPE_CGROUP_STORAGE);
+   __type(key, struct bpf_cgroup_storage_key);
+   __type(value, struct net_cnt);
+} netcnt SEC(".maps");
 
 SEC("cgroup/skb")
 int bpf_nextcnt(struct __sk_buff *skb)
diff --git a/tools/testing/selftests/bpf/progs/socket_cookie_prog.c 
b/tools/testing/selftests/bpf/progs/socket_cookie_prog.c
index 6aabb681fb9a..e4440fdd94cb 100644
--- a/tools/testing/selftests/bpf/progs/socket_cookie_prog.c
+++ b/tools/testing/selftests/bpf/progs/socket_cookie_prog.c
@@ -13,14 +13,11 @@ struct socket_cookie {
 };
 
 struct {
-   __u32 type;
-   __u32 map_flags;
-   int *key;
-   struct socket_cookie *value;
-} socket_cookies SEC(".maps") = {
-   .type = BPF_MAP_TYPE_SK_STORAGE,
-   .map_flags = BPF_F_NO_PREALLOC,
-};
+   __uint(type, BPF_MAP_TYPE_SK_STORAGE);
+   __uint(map_flags, BPF_F_NO_PREALLOC);
+   __type(key, int);
+   __type(value, struct socket_cookie);
+} socket_cookies SEC(".maps");
 
 SEC("cgroup/connect6")
 int set_cookie(struct bpf_sock_addr *ctx)
diff --git a/tools/testing/selftests/bpf/progs/test_btf_newkv.c 
b/tools/testing/selftests/bpf/progs/test_btf_newkv.c
index 28c16bb583b6..5ee3622ddebb 100644
--- a/tools/testing/selftests/bpf/progs/test_btf_newkv.c
+++ b/tools/testing/selftests/bpf

Re: [RFC net-next] net: dsa: add support for MC_DISABLED attribute

2019-07-05 Thread Vivien Didelot

Hi Ido,

On Sun, 23 Jun 2019 07:09:52 +, Ido Schimmel  wrote:
> > Russell, Ido, Florian, so far I understand that a multicast-unaware
> > bridge must flood unknown traffic everywhere (CPU included);
> > and a multicast-aware bridge must only flood its ports if their
> > mcast_flood is on, and known traffic targeting the bridge must be
> > offloaded accordingly. Do you guys agree with this?
> 
> When multicast snooping is enabled unregistered multicast traffic should
> only be flooded to mrouter ports.

I've figured out that this is what I need to prevent the flooding of undesired
multicast traffic to the CPU port of the switch. The bridge itself has a
multicast_router attribute which can be disabled, that is when I should drop
unknown multicast traffic.

However with SWITCHDEV_ATTR_ID_BRIDGE_MROUTER implemented, this
attribute is always called with .mrouter=0, regardless of the value of
/sys/class/net/br0/bridge/multicast_router. Am I missing something here?

Thanks,

Vivien

Re: [PATCH bpf-next] tools: bpftool: add "prog run" subcommand to test-run programs

2019-07-05 Thread Quentin Monnet

2019-07-05 08:42 UTC-0700 ~ Y Song 
> On Fri, Jul 5, 2019 at 1:21 AM Quentin Monnet
>  wrote:
>>
>> 2019-07-04 22:49 UTC-0700 ~ Y Song 
>>> On Thu, Jul 4, 2019 at 1:58 AM Quentin Monnet
>>>  wrote:

 Add a new "bpftool prog run" subcommand to run a loaded program on input
 data (and possibly with input context) passed by the user.

 Print output data (and output context if relevant) into a file or into
 the console. Print return value and duration for the test run into the
 console.

 A "repeat" argument can be passed to run the program several times in a
 row.

 The command does not perform any kind of verification based on program
 type (Is this program type allowed to use an input context?) or on data
 consistency (Can I work with empty input data?), this is left to the
 kernel.

 Example invocation:

 # perl -e 'print "\x0" x 14' | ./bpftool prog run \
 pinned /sys/fs/bpf/sample_ret0 \
 data_in - data_out - repeat 5
 000         |  ..
 Return value: 0, duration (average): 260ns

 When one of data_in or ctx_in is "-", bpftool reads from standard input,
 in binary format. Other formats (JSON, hexdump) might be supported (via
 an optional command line keyword like "data_fmt_in") in the future if
 relevant, but this would require doing more parsing in bpftool.

 Signed-off-by: Quentin Monnet 
 Reviewed-by: Jakub Kicinski 
 ---
>>
>> [...]
>>
 diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
 index 9b0db5d14e31..8dcbaa0a8ab1 100644
 --- a/tools/bpf/bpftool/prog.c
 +++ b/tools/bpf/bpftool/prog.c
 @@ -15,6 +15,7 @@
  #include 

  #include 
 +#include 

  #include 
  #include 
 @@ -748,6 +749,344 @@ static int do_detach(int argc, char **argv)
 return 0;
  }

 +static int check_single_stdin(char *file_in, char *other_file_in)
 +{
 +   if (file_in && other_file_in &&
 +   !strcmp(file_in, "-") && !strcmp(other_file_in, "-")) {
 +   p_err("cannot use standard input for both data_in and 
 ctx_in");
>>>
>>> The error message says data_in and ctx_in.
>>> Maybe the input parameter should be file_data_in and file_ctx_in?
>>
>>
>> Hi Yonghong,
>>
>> It's true those parameters should be file names. But having
>> "file_data_in", "file_data_out", "file_ctx_in" and "file_ctx_out" on a
>> command line seems a bit heavy to me? (And relying on keyword prefixing
>> for typing the command won't help much.)
>>
>> My opinion is that it should be clear from the man page or the "help"
>> command that the parameters are file names. What do you think? I can
>> prefix all four arguments with "file_" if you believe this is better.
> 
> I think you misunderstood my question above.

Totally did, sorry :/.

> The command line parameters are fine.
> I am talking about the function parameter names. Since in the error message,
> the input parameters are referred for data_in and ctx_in
>p_err("cannot use standard input for both data_in and ctx_in")
> maybe the function signature should be
>   static int check_single_stdin(char *file_data_in, char *file_ctx_in)
> 
> If you are worried that later on the same function can be used in different
> contexts, then alternatively, you can have signature like
>   static int check_single_stdin(char *file_in, char *other_file_in,
> const char *file_in_arg, const char *other_file_in_arg)
> where file_in_arg will be passed in as "data_in" and other_file_in_arg
> as "ctx_in".
> I think we could delay this until it is really needed.

As a matter of fact, the opposite thing happened. I first used the
function for data_in/ctx_in, and also for data_out/ctx_out. But I
changed my mind eventually because there is no real reason not to print
both data_out and ctx_out to stdout if we want to do so. So I updated
the name of the parameters in the error messages, but forgot to change
the arguments for the function. Silly me.

So I totally agree, I'll respin and change the argument names for the
function. And yes, we could also pass the names to print in the error
message, but I agree that this is not needed, and not helpful at the moment.

Thanks for catching this!

>>
>> [...]
>>
 +static int do_run(int argc, char **argv)
 +{
 +   char *data_fname_in = NULL, *data_fname_out = NULL;
 +   char *ctx_fname_in = NULL, *ctx_fname_out = NULL;
 +   struct bpf_prog_test_run_attr test_attr = {0};
 +   const unsigned int default_size = SZ_32K;
 +   void *data_in = NULL, *data_out = NULL;
 +   void *ctx_in = NULL, *ctx_out = NULL;
 +   unsigned int repeat = 1;
 +   int fd, err;
 +
 +   if (!REQ_ARGS(4))
 +   return -1;
 +
 +   fd = prog_parse_fd(&argc, &argv);
 +   if

[PATCH net-next] net: openvswitch: use netif_is_ovs_port() instead of opencode

2019-07-05 Thread Taehee Yoo

Use the netif_is_ovs_port() helper instead of open-coding the check.
This patch doesn't change the logic.

Signed-off-by: Taehee Yoo 
---
 net/openvswitch/dp_notify.c| 2 +-
 net/openvswitch/vport-netdev.c | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/net/openvswitch/dp_notify.c b/net/openvswitch/dp_notify.c
index 53cf07d141b4..7af0cde8b293 100644
--- a/net/openvswitch/dp_notify.c
+++ b/net/openvswitch/dp_notify.c
@@ -48,7 +48,7 @@ void ovs_dp_notify_wq(struct work_struct *work)
if (vport->ops->type == OVS_VPORT_TYPE_INTERNAL)
continue;
 
-   if (!(vport->dev->priv_flags & 
IFF_OVS_DATAPATH))
+   if (!(netif_is_ovs_port(vport->dev)))
dp_detach_port_notify(vport);
}
}
diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c
index 52a1ed9633ec..57d6436e6f6a 100644
--- a/net/openvswitch/vport-netdev.c
+++ b/net/openvswitch/vport-netdev.c
@@ -156,7 +156,7 @@ void ovs_netdev_detach_dev(struct vport *vport)
 static void netdev_destroy(struct vport *vport)
 {
rtnl_lock();
-   if (vport->dev->priv_flags & IFF_OVS_DATAPATH)
+   if (netif_is_ovs_port(vport->dev))
ovs_netdev_detach_dev(vport);
rtnl_unlock();
 
@@ -166,7 +166,7 @@ static void netdev_destroy(struct vport *vport)
 void ovs_netdev_tunnel_destroy(struct vport *vport)
 {
rtnl_lock();
-   if (vport->dev->priv_flags & IFF_OVS_DATAPATH)
+   if (netif_is_ovs_port(vport->dev))
ovs_netdev_detach_dev(vport);
 
/* We can be invoked by both explicit vport deletion and
@@ -186,7 +186,7 @@ EXPORT_SYMBOL_GPL(ovs_netdev_tunnel_destroy);
 /* Returns null if this device is not attached to a datapath. */
 struct vport *ovs_netdev_get_vport(struct net_device *dev)
 {
-   if (likely(dev->priv_flags & IFF_OVS_DATAPATH))
+   if (likely(netif_is_ovs_port(dev)))
return (struct vport *)
rcu_dereference_rtnl(dev->rx_handler_data);
else
-- 
2.17.1

[PATCH net-next] net: openvswitch: do not update max_headroom if new headroom is equal to old headroom

2019-07-05 Thread Taehee Yoo

When a vport is deleted, the maximum headroom size would be changed.
If the vport which has the largest headroom is deleted,
the new max_headroom would be set.
But, if the new headroom size is equal to the old headroom size,
updating routine is unnecessary.

Signed-off-by: Taehee Yoo 
---
 net/openvswitch/datapath.c | 39 +++---
 1 file changed, 28 insertions(+), 11 deletions(-)

diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 33b388103741..892287d06c17 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -1958,10 +1958,9 @@ static struct vport *lookup_vport(struct net *net,
 
 }
 
-/* Called with ovs_mutex */
-static void update_headroom(struct datapath *dp)
+static unsigned int ovs_get_max_headroom(struct datapath *dp)
 {
-   unsigned dev_headroom, max_headroom = 0;
+   unsigned int dev_headroom, max_headroom = 0;
struct net_device *dev;
struct vport *vport;
int i;
@@ -1975,10 +1974,19 @@ static void update_headroom(struct datapath *dp)
}
}
 
-   dp->max_headroom = max_headroom;
+   return max_headroom;
+}
+
+/* Called with ovs_mutex */
+static void ovs_update_headroom(struct datapath *dp, unsigned int new_headroom)
+{
+   struct vport *vport;
+   int i;
+
+   dp->max_headroom = new_headroom;
for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++)
hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node)
-   netdev_set_rx_headroom(vport->dev, max_headroom);
+   netdev_set_rx_headroom(vport->dev, new_headroom);
 }
 
 static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info)
@@ -1989,6 +1997,7 @@ static int ovs_vport_cmd_new(struct sk_buff *skb, struct 
genl_info *info)
struct sk_buff *reply;
struct vport *vport;
struct datapath *dp;
+   unsigned int new_headroom;
u32 port_no;
int err;
 
@@ -2050,8 +2059,10 @@ static int ovs_vport_cmd_new(struct sk_buff *skb, struct 
genl_info *info)
  info->snd_portid, info->snd_seq, 0,
  OVS_VPORT_CMD_NEW);
 
-   if (netdev_get_fwd_headroom(vport->dev) > dp->max_headroom)
-   update_headroom(dp);
+   new_headroom = netdev_get_fwd_headroom(vport->dev);
+
+   if (new_headroom > dp->max_headroom)
+   ovs_update_headroom(dp, new_headroom);
else
netdev_set_rx_headroom(vport->dev, dp->max_headroom);
 
@@ -2122,11 +2133,12 @@ static int ovs_vport_cmd_set(struct sk_buff *skb, 
struct genl_info *info)
 
 static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info)
 {
-   bool must_update_headroom = false;
+   bool update_headroom = false;
struct nlattr **a = info->attrs;
struct sk_buff *reply;
struct datapath *dp;
struct vport *vport;
+   unsigned int new_headroom;
int err;
 
reply = ovs_vport_cmd_alloc_info();
@@ -2152,12 +2164,17 @@ static int ovs_vport_cmd_del(struct sk_buff *skb, 
struct genl_info *info)
/* the vport deletion may trigger dp headroom update */
dp = vport->dp;
if (netdev_get_fwd_headroom(vport->dev) == dp->max_headroom)
-   must_update_headroom = true;
+   update_headroom = true;
+
netdev_reset_rx_headroom(vport->dev);
ovs_dp_detach_port(vport);
 
-   if (must_update_headroom)
-   update_headroom(dp);
+   if (update_headroom) {
+   new_headroom = ovs_get_max_headroom(dp);
+
+   if (new_headroom < dp->max_headroom)
+   ovs_update_headroom(dp, new_headroom);
+   }
ovs_unlock();
 
ovs_notify(&dp_vport_genl_family, reply, info);
-- 
2.17.1

Re: [PATCH net-next] nfp: tls: fix error return code in nfp_net_tls_add()

2019-07-05 Thread Jakub Kicinski

On Fri, 5 Jul 2019 08:26:25 +, Wei Yongjun wrote:
> Fix to return negative error code -EINVAL from the error handling
> case instead of 0, as done elsewhere in this function.
> 
> Fixes: 1f35a56cf586 ("nfp: tls: add/delete TLS TX connections")
> Signed-off-by: Wei Yongjun 

Acked-by: Jakub Kicinski

RE: [net-next 1/3] ice: Initialize and register platform device to provide RDMA

2019-07-05 Thread Saleem, Shiraz

> Subject: Re: [net-next 1/3] ice: Initialize and register platform device to 
> provide
> RDMA
> 
> On Thu, Jul 04, 2019 at 12:48:29PM +, Jason Gunthorpe wrote:
> > On Thu, Jul 04, 2019 at 02:42:47PM +0200, Greg KH wrote:
> > > On Thu, Jul 04, 2019 at 12:37:33PM +, Jason Gunthorpe wrote:
> > > > On Thu, Jul 04, 2019 at 02:29:50PM +0200, Greg KH wrote:
> > > > > On Thu, Jul 04, 2019 at 12:16:41PM +, Jason Gunthorpe wrote:
> > > > > > On Wed, Jul 03, 2019 at 07:12:50PM -0700, Jeff Kirsher wrote:
> > > > > > > From: Tony Nguyen 
> > > > > > >
> > > > > > > The RDMA block does not advertise on the PCI bus or any other bus.
> > > > > > > Thus the ice driver needs to provide access to the RDMA
> > > > > > > hardware block via a virtual bus; utilize the platform bus to 
> > > > > > > provide this
> access.
> > > > > > >
> > > > > > > This patch initializes the driver to support RDMA as well as
> > > > > > > creates and registers a platform device for the RDMA driver
> > > > > > > to register to. At this point the driver is fully
> > > > > > > initialized to register a platform driver, however, can not
> > > > > > > yet register as the ops have not been implemented.
> > > > > >
> > > > > > I think you need Greg's ack on all this driver stuff -
> > > > > > particularly that a platform_device is OK.
> > > > >
> > > > > A platform_device is almost NEVER ok.
> > > > >
> > > > > Don't abuse it, make a real device on a real bus.  If you don't
> > > > > have a real bus and just need to create a device to hang other
> > > > > things off of, then use the virtual one, that's what it is there for.
> > > >
> > > > Ideally I'd like to see all the RDMA drivers that connect to
> > > > ethernet drivers use some similar scheme.
> > >
> > > Why?  They should be attached to a "real" device, why make any up?
> >
> > ? A "real" device, like struct pci_device, can only bind to one
> > driver. How can we bind it concurrently to net, rdma, scsi, etc?
> 
> MFD was designed for this very problem.
> 
> > > > This is for a PCI device that plugs into multiple subsystems in
> > > > the kernel, ie it has net driver functionality, rdma
> > > > functionality, some even have SCSI functionality
> > >
> > > Sounds like a MFD device, why aren't you using that functionality
> > > instead?
> >
> > This was also my advice, but in another email Jeff says:
> >
> >   MFD architecture was also considered, and we selected the simpler
> >   platform model. Supporting a MFD architecture would require an
> >   additional MFD core driver, individual platform netdev, RDMA function
> >   drivers, and stripping a large portion of the netdev drivers into
> >   MFD core. The sub-devices registered by MFD core for function
> >   drivers are indeed platform devices.
> 
> So, "mfd is too hard, let's abuse a platform device" is ok?
> 
> People have been wanting to do MFD drivers for PCI devices for a long time, 
> it's
> about time someone actually did the work for it, I bet it will not be all 
> that complex
> if tiny embedded drivers can do it :)
> 
Hi Greg - Thanks for your feedback!

We currently have 2 PCI function netdev drivers in the kernel (i40e & ice) that 
support devices (x722 & e810)
which are RDMA capable. Our objective is to add a single unified RDMA driver
(as this is a subsystem-specific requirement) which needs to access HW resources 
from the
netdev PF drivers. Attaching platform devices from the netdev drivers to the 
platform bus
and having a single RDMA platform driver bind to them and access these 
resources seemed
like a simple approach to realize our objective. But it seems like attaching 
platform devices is
wrong. I would like to understand why. 

Are platform sub devices only to be added from an MFD core driver? I am also 
wondering if MFD arch.
would allow for realizing a single RDMA driver and whether we need an MFD core 
driver for
each device, x722 & e810 or whether it can be a single driver.

Shiraz

Re: [PATCH net-next 1/8] Documentation/bindings: net: ocelot: document the PTP bank

2019-07-05 Thread Antoine Tenart

Hi Andrew,

On Fri, Jul 05, 2019 at 04:45:17PM +0200, Andrew Lunn wrote:
> On Fri, Jul 05, 2019 at 03:30:16PM +0200, Antoine Tenart wrote:
> > 
> > I'm not sure about this: optional properties means some parts of the h/w
> > can be missing or not wired. It's not the case here, it's "optional" in
> > the driver only for dt compatibility (so that an older dt blob can work
> > with a newer kernel image), but it's now mandatory in the binding.
> 
> If the driver can work without it, it is clearly optional. You just
> get reduced functionality. That is the thing with DT. You can never
> add more required properties after the first commit without breaking
> backwards compatibility. To make the documentation fit the driver,
> somewhere you need to state they are optional. Either by placing the
> new properties in the optional section of the binding, or add a
> comment.

The documentation is unrelated to the driver. It's the documentation of
the binding itself, which is only describing the h/w.

But I discussed this with someone and came to the same conclusion as
your statement, because there can be old dt blobs in the wild and the
binding documentation can be used to make new code. That code should be
aware of required/optional properties.

I'll fix this in v2.

Thanks!
Antoine

-- 
Antoine Ténart, Bootlin
Embedded Linux and Kernel engineering
https://bootlin.com

RE: [rdma 14/16] RDMA/irdma: Add ABI definitions

2019-07-05 Thread Saleem, Shiraz

> Subject: Re: [rdma 14/16] RDMA/irdma: Add ABI definitions
> 
> On Thu, Jul 04, 2019 at 10:40:21AM +0300, Leon Romanovsky wrote:
> > On Wed, Jul 03, 2019 at 07:12:57PM -0700, Jeff Kirsher wrote:
> > > From: Mustafa Ismail 
> > >
> > > Add ABI definitions for irdma.
> > >
> > > Signed-off-by: Mustafa Ismail 
> > > Signed-off-by: Shiraz Saleem 
> > > include/uapi/rdma/irdma-abi.h | 130
> > > ++
> > >  1 file changed, 130 insertions(+)
> > >  create mode 100644 include/uapi/rdma/irdma-abi.h
> > >
> > > diff --git a/include/uapi/rdma/irdma-abi.h
> > > b/include/uapi/rdma/irdma-abi.h new file mode 100644 index
> > > ..bdfbda4c829e
> > > +++ b/include/uapi/rdma/irdma-abi.h
> > > @@ -0,0 +1,130 @@
> > > +/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */
> > > +/* Copyright (c) 2006 - 2019 Intel Corporation.  All rights reserved.
> > > + * Copyright (c) 2005 Topspin Communications.  All rights reserved.
> > > + * Copyright (c) 2005 Cisco Systems.  All rights reserved.
> > > + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
> > > + */
> > > +
> > > +#ifndef IRDMA_ABI_H
> > > +#define IRDMA_ABI_H
> > > +
> > > +#include 
> > > +
> > > +/* irdma must support legacy GEN_1 i40iw kernel
> > > + * and user-space whose last ABI ver is 5  */ #define IRDMA_ABI_VER
> > > +6
> >
> > Can you please elaborate about it more?
> > There is no irdma code in RDMA yet, so it makes me wonder why new
> > define shouldn't start from 1.
> 
> It is because they are ABI compatible with the current user space, which 
> raises the
> question why we even have this confusing header file..

It is because we need to support current providers/i40iw user-space.
Our user-space patch series will introduce a new provider (irdma) whose ABI
ver. is also 6 (capable of supporting X722 and which will work with i40iw driver
on older kernels) and removes providers/i40iw from rdma-core.

Re: [PATCH net-next 8/8] net: mscc: PTP Hardware Clock (PHC) support

2019-07-05 Thread Richard Cochran

On Mon, Jul 01, 2019 at 12:03:27PM +0200, Antoine Tenart wrote:

> +void ocelot_get_hwtimestamp(struct ocelot *ocelot, struct timespec64 *ts)
> +{
> + /* Read current PTP time to get seconds */
> + u32 val = ocelot_read_rix(ocelot, PTP_PIN_CFG, TOD_ACC_PIN);

This register is protected by ocelot->ptp_clock_lock from other code
paths, but not in this one!

> + val &= ~(PTP_PIN_CFG_SYNC | PTP_PIN_CFG_ACTION_MASK | PTP_PIN_CFG_DOM);
> + val |= PTP_PIN_CFG_ACTION(PTP_PIN_ACTION_SAVE);
> + ocelot_write_rix(ocelot, val, PTP_PIN_CFG, TOD_ACC_PIN);
> + ts->tv_sec = ocelot_read_rix(ocelot, PTP_PIN_TOD_SEC_LSB, TOD_ACC_PIN);
...
> +}


> +static int ocelot_init_timestamp(struct ocelot *ocelot)
> +{
> + ocelot->ptp_info = ocelot_ptp_clock_info;
> +
> + ocelot->ptp_clock = ptp_clock_register(&ocelot->ptp_info, ocelot->dev);
> + if (IS_ERR(ocelot->ptp_clock))
> + return PTR_ERR(ocelot->ptp_clock);

You need to handle the NULL case:

ptp_clock_register() - register a PTP hardware clock driver

@info:   Structure describing the new clock.
@parent: Pointer to the parent device of the new clock.

Returns a valid pointer on success or PTR_ERR on failure.  If PHC
support is missing at the configuration level, this function
returns NULL, and drivers are expected to gracefully handle that
case separately.

> +
> + ocelot_write(ocelot, SYS_PTP_CFG_PTP_STAMP_WID(30), SYS_PTP_CFG);
> + ocelot_write(ocelot, 0xffffffff, ANA_TABLES_PTP_ID_LOW);
> + ocelot_write(ocelot, 0xffffffff, ANA_TABLES_PTP_ID_HIGH);
> +
> + ocelot_write(ocelot, PTP_CFG_MISC_PTP_EN, PTP_CFG_MISC);
> +
> + return 0;
> +}

Thanks,
Richard

Re: [PATCH bpf-next] tools: bpftool: add "prog run" subcommand to test-run programs

2019-07-05 Thread Y Song

On Fri, Jul 5, 2019 at 9:03 AM Quentin Monnet
 wrote:
>
> 2019-07-05 08:42 UTC-0700 ~ Y Song 
> > On Fri, Jul 5, 2019 at 1:21 AM Quentin Monnet
> >  wrote:
> >>
> >> 2019-07-04 22:49 UTC-0700 ~ Y Song 
> >>> On Thu, Jul 4, 2019 at 1:58 AM Quentin Monnet
> >>>  wrote:
> 
>  Add a new "bpftool prog run" subcommand to run a loaded program on input
>  data (and possibly with input context) passed by the user.
> 
>  Print output data (and output context if relevant) into a file or into
>  the console. Print return value and duration for the test run into the
>  console.
> 
>  A "repeat" argument can be passed to run the program several times in a
>  row.
> 
>  The command does not perform any kind of verification based on program
>  type (Is this program type allowed to use an input context?) or on data
>  consistency (Can I work with empty input data?), this is left to the
>  kernel.
> 
>  Example invocation:
> 
>  # perl -e 'print "\x0" x 14' | ./bpftool prog run \
>  pinned /sys/fs/bpf/sample_ret0 \
>  data_in - data_out - repeat 5
>  000         |  ..
>  Return value: 0, duration (average): 260ns
> 
>  When one of data_in or ctx_in is "-", bpftool reads from standard input,
>  in binary format. Other formats (JSON, hexdump) might be supported (via
>  an optional command line keyword like "data_fmt_in") in the future if
>  relevant, but this would require doing more parsing in bpftool.
> 
>  Signed-off-by: Quentin Monnet 
>  Reviewed-by: Jakub Kicinski 
>  ---
> >>
> >> [...]
> >>
>  diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
>  index 9b0db5d14e31..8dcbaa0a8ab1 100644
>  --- a/tools/bpf/bpftool/prog.c
>  +++ b/tools/bpf/bpftool/prog.c
>  @@ -15,6 +15,7 @@
>   #include 
> 
>   #include 
>  +#include 
> 
>   #include 
>   #include 
>  @@ -748,6 +749,344 @@ static int do_detach(int argc, char **argv)
>  return 0;
>   }
> 
>  +static int check_single_stdin(char *file_in, char *other_file_in)
>  +{
>  +   if (file_in && other_file_in &&
>  +   !strcmp(file_in, "-") && !strcmp(other_file_in, "-")) {
>  +   p_err("cannot use standard input for both data_in and 
>  ctx_in");
> >>>
> >>> The error message says data_in and ctx_in.
> >>> Maybe the input parameter should be file_data_in and file_ctx_in?
> >>
> >>
> >> Hi Yonghong,
> >>
> >> It's true those parameters should be file names. But having
> >> "file_data_in", "file_data_out", "file_ctx_in" and "file_ctx_out" on a
> >> command line seems a bit heavy to me? (And relying on keyword prefixing
> >> for typing the command won't help much.)
> >>
> >> My opinion is that it should be clear from the man page or the "help"
> >> command that the parameters are file names. What do you think? I can
> >> prefix all four arguments with "file_" if you believe this is better.
> >
> > I think you misunderstood my question above.
>
> Totally did, sorry :/.
>
> > The command line parameters are fine.
> > I am talking about the function parameter names. Since in the error message,
> > the input parameters are referred to as data_in and ctx_in
> >p_err("cannot use standard input for both data_in and ctx_in")
> > maybe the function signature should be
> >   static int check_single_stdin(char *file_data_in, char *file_ctx_in)
> >
> > If you are worried that later on the same function can be used in different
> > contexts, then alternatively, you can have signature like
> >   static int check_single_stdin(char *file_in, char *other_file_in,
> > const char *file_in_arg, const char *other_file_in_arg)
> > where file_in_arg will be passed in as "data_in" and other_file_in_arg
> > as "ctx_in".
> > I think we could delay this until it is really needed.
>
> As a matter of fact, the opposite thing happened. I first used the
> function for data_in/ctx_in, and also for data_out/ctx_out. But I
> changed my mind eventually because there is no real reason not to print
> both data_out and ctx_out to stdout if we want to do so. So I updated
> the name of the parameters in the error messages, but forgot to change
> the arguments for the function. Silly me.
>
> So I totally agree, I'll respin and change the argument names for the
> function. And yes, we could also pass the names to print in the error
> message, but I agree that this is not needed, and not helpful at the moment.
>
> Thanks for catching this!
>
> >>
> >> [...]
> >>
>  +static int do_run(int argc, char **argv)
>  +{
>  +   char *data_fname_in = NULL, *data_fname_out = NULL;
>  +   char *ctx_fname_in = NULL, *ctx_fname_out = NULL;
>  +   struct bpf_prog_test_run_attr test_attr = {0};
>  +   const unsigned int default_size = SZ_32K;
>  +   vo

Re: [PATCH net-next 8/8] net: mscc: PTP Hardware Clock (PHC) support

2019-07-05 Thread Antoine Tenart

Hello Richard,

On Fri, Jul 05, 2019 at 09:47:36AM -0700, Richard Cochran wrote:
> On Mon, Jul 01, 2019 at 12:03:27PM +0200, Antoine Tenart wrote:
> 
> > +void ocelot_get_hwtimestamp(struct ocelot *ocelot, struct timespec64 *ts)
> > +{
> > +   /* Read current PTP time to get seconds */
> > +   u32 val = ocelot_read_rix(ocelot, PTP_PIN_CFG, TOD_ACC_PIN);
> 
> This register is protected by ocelot->ptp_clock_lock from other code
> paths, but not in this one!

Oops. I'll fix it.

> > +static int ocelot_init_timestamp(struct ocelot *ocelot)
> > +{
> > +   ocelot->ptp_info = ocelot_ptp_clock_info;
> > +
> > +   ocelot->ptp_clock = ptp_clock_register(&ocelot->ptp_info, ocelot->dev);
> > +   if (IS_ERR(ocelot->ptp_clock))
> > +   return PTR_ERR(ocelot->ptp_clock);
> 
> You need to handle the NULL case:

Will do.

> ptp_clock_register() - register a PTP hardware clock driver
> 
> @info:   Structure describing the new clock.
> @parent: Pointer to the parent device of the new clock.
> 
> Returns a valid pointer on success or PTR_ERR on failure.  If PHC
> support is missing at the configuration level, this function
> returns NULL, and drivers are expected to gracefully handle that
> case separately.

Thanks,
Antoine

-- 
Antoine Ténart, Bootlin
Embedded Linux and Kernel engineering
https://bootlin.com

Re: [rdma 14/16] RDMA/irdma: Add ABI definitions

2019-07-05 Thread Jason Gunthorpe

On Fri, Jul 05, 2019 at 04:42:19PM +, Saleem, Shiraz wrote:
> > Subject: Re: [rdma 14/16] RDMA/irdma: Add ABI definitions
> > 
> > On Thu, Jul 04, 2019 at 10:40:21AM +0300, Leon Romanovsky wrote:
> > > On Wed, Jul 03, 2019 at 07:12:57PM -0700, Jeff Kirsher wrote:
> > > > From: Mustafa Ismail 
> > > >
> > > > Add ABI definitions for irdma.
> > > >
> > > > Signed-off-by: Mustafa Ismail 
> > > > Signed-off-by: Shiraz Saleem 
> > > > include/uapi/rdma/irdma-abi.h | 130
> > > > ++
> > > >  1 file changed, 130 insertions(+)
> > > >  create mode 100644 include/uapi/rdma/irdma-abi.h
> > > >
> > > > diff --git a/include/uapi/rdma/irdma-abi.h
> > > > b/include/uapi/rdma/irdma-abi.h new file mode 100644 index
> > > > ..bdfbda4c829e
> > > > +++ b/include/uapi/rdma/irdma-abi.h
> > > > @@ -0,0 +1,130 @@
> > > > +/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */
> > > > +/* Copyright (c) 2006 - 2019 Intel Corporation.  All rights reserved.
> > > > + * Copyright (c) 2005 Topspin Communications.  All rights reserved.
> > > > + * Copyright (c) 2005 Cisco Systems.  All rights reserved.
> > > > + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
> > > > + */
> > > > +
> > > > +#ifndef IRDMA_ABI_H
> > > > +#define IRDMA_ABI_H
> > > > +
> > > > +#include 
> > > > +
> > > > +/* irdma must support legacy GEN_1 i40iw kernel
> > > > + * and user-space whose last ABI ver is 5  */ #define IRDMA_ABI_VER
> > > > +6
> > >
> > > Can you please elaborate about it more?
> > > There is no irdma code in RDMA yet, so it makes me wonder why new
> > > define shouldn't start from 1.
> > 
> > It is because they are ABI compatible with the current user space, which 
> > raises the
> > question why we even have this confusing header file..
> 
> It is because we need to support current providers/i40iw user-space.
> Our user-space patch series will introduce a new provider (irdma) whose ABI
> ver. is also 6 (capable of supporting X722 and which will work with i40iw 
> driver
> on older kernels) and removes providers/i40iw from rdma-core.

Why on earth would we do that?

Jason

Re: [PATCHv2] tools bpftool: Fix json dump crash on powerpc

2019-07-05 Thread Jakub Kicinski

On Fri, 5 Jul 2019 14:10:31 +0200, Jiri Olsa wrote:
> Michael reported a crash with a bpf program in json mode on powerpc:
> 
>   # bpftool prog -p dump jited id 14
>   [{
> "name": "0xda9aa760",
> "insns": [{
> "pc": "0x0",
> "operation": "nop",
> "operands": [null
> ]
> },{
> "pc": "0x4",
> "operation": "nop",
> "operands": [null
> ]
> },{
> "pc": "0x8",
> "operation": "mflr",
>   Segmentation fault (core dumped)
> 
> The code is assuming char pointers in format, which is not always
> true at least for powerpc. Fixing this by dumping the whole string
> into buffer based on its format.
> 
> Please note that the libopcodes code does not check return values from
> the fprintf callback, but as per Jakub's suggestion we return -1 on
> allocation failure, so we make a best effort to propagate the error.
> 
> Reported-by: Michael Petlan 
> Signed-off-by: Jiri Olsa 

Thanks, let me repost all the tags (Quentin, please shout if you're
not ok with this :)):

Fixes: 107f041212c1 ("tools: bpftool: add JSON output for `bpftool prog dump 
jited *` command")
Reviewed-by: Quentin Monnet 
Reviewed-by: Jakub Kicinski

Re: [PATCHv2] tools bpftool: Fix json dump crash on powerpc

2019-07-05 Thread Quentin Monnet

2019-07-05 10:24 UTC-0700 ~ Jakub Kicinski 
> On Fri, 5 Jul 2019 14:10:31 +0200, Jiri Olsa wrote:
>> Michael reported a crash with a bpf program in json mode on powerpc:
>>
>>   # bpftool prog -p dump jited id 14
>>   [{
>> "name": "0xda9aa760",
>> "insns": [{
>> "pc": "0x0",
>> "operation": "nop",
>> "operands": [null
>> ]
>> },{
>> "pc": "0x4",
>> "operation": "nop",
>> "operands": [null
>> ]
>> },{
>> "pc": "0x8",
>> "operation": "mflr",
>>   Segmentation fault (core dumped)
>>
>> The code is assuming char pointers in format, which is not always
>> true at least for powerpc. Fixing this by dumping the whole string
>> into buffer based on its format.
>>
>> Please note that the libopcodes code does not check return values from
>> the fprintf callback, but as per Jakub's suggestion we return -1 on
>> allocation failure, so we make a best effort to propagate the error.
>>
>> Reported-by: Michael Petlan 
>> Signed-off-by: Jiri Olsa 
> 
> Thanks, let me repost all the tags (Quentin, please shout if you're
> not ok with this :)):

I confirm it's all good for me, thanks :)

> 
> Fixes: 107f041212c1 ("tools: bpftool: add JSON output for `bpftool prog dump 
> jited *` command")
> Reviewed-by: Quentin Monnet 
> Reviewed-by: Jakub Kicinski 
>

Re: loss of connectivity after enabling vlan_filtering

2019-07-05 Thread Vivien Didelot

On Sun, 30 Jun 2019 01:23:02 +0200, vto...@googlemail.com wrote:
> A simple soul might infer that mv88e6xxx includes MV88E6060, at least
> that happened to me apparently (being said simpleton).

I agree that is confusing, that is why I don't like the 'xxx' naming
convention in general, found in many drivers. I'd prefer to stick with a
reference model, or product category, like soho in this case. But it was
initially written like this, so no reason to change its name now. I still
plan to merge mv88e6060 into mv88e6xxx, but it is unfortunately low priority
because I still don't have a platform with a 88E6060 on it.

Thanks,

Vivien

Re: NEIGH: BUG, double timer add, state is 8

2019-07-05 Thread Lorenzo Bianconi

On Jul 05, David Ahern wrote:
> On 7/4/19 3:59 PM, Marek Majkowski wrote:
> > I found a way to hit an obscure BUG in the
> > net/core/neighbour.c:neigh_add_timer(), by piping two carefully
> > crafted messages into AF_NETLINK socket.
> > 
> > https://github.com/torvalds/linux/blob/v5.2-rc7/net/core/neighbour.c#L259
> > 
> > if (unlikely(mod_timer(&n->timer, when))) {
> > printk("NEIGH: BUG, double timer add, state is %x\n", n->nud_state);
> > dump_stack();
> >  }
> > 
> > The repro is here:
> > https://gist.github.com/majek/d70297b9d72bc2e2b82145e122722a0c
> > 
> > wget 
> > https://gist.githubusercontent.com/majek/d70297b9d72bc2e2b82145e122722a0c/raw/9e140bcedecc28d722022f1da142a379a9b7a7b0/double_timer_add_bug.c
> 
> Thanks for the report - and the reproducer. I am on PTO through Monday;
> I will take a look next week if no one else does.

Hi David and Marek,

looking at the reproducer it seems to me the issue is due to the use of
'NTF_USE' from userspace.
Should we unschedule the neigh timer if we are in IN_TIMER receiving this
flag from userspace? (taking appropriate locking)

Regards,
Lorenzo


signature.asc
Description: PGP signature

Re: [PATCH rdma-next 0/2] DEVX VHCA tunnel support

2019-07-05 Thread Jason Gunthorpe

On Mon, Jul 01, 2019 at 09:14:00PM +0300, Leon Romanovsky wrote:
> From: Leon Romanovsky 
> 
> Hi,
> 
> Those two patches introduce VHCA tunnel mechanism to DEVX interface
> needed for Bluefield SOC. See extensive commit messages for more
> information.
> 
> Thanks
> 
> Max Gurtovoy (2):
>   net/mlx5: Introduce VHCA tunnel device capability
>   IB/mlx5: Implement VHCA tunnel mechanism in DEVX
> 
>  drivers/infiniband/hw/mlx5/devx.c | 24 
>  include/linux/mlx5/mlx5_ifc.h | 10 --
>  2 files changed, 28 insertions(+), 6 deletions(-)

This looks OK. Can you apply the mlx5-next patch, please?

Thanks,
Jason

Re: [PATCH bpf-next] tools: bpftool: add "prog run" subcommand to test-run programs

2019-07-05 Thread Quentin Monnet

2019-07-05 10:08 UTC-0700 ~ Y Song 
> On Fri, Jul 5, 2019 at 9:03 AM Quentin Monnet
>  wrote:
>>
>> 2019-07-05 08:42 UTC-0700 ~ Y Song 
>>> On Fri, Jul 5, 2019 at 1:21 AM Quentin Monnet
>>>  wrote:

 2019-07-04 22:49 UTC-0700 ~ Y Song 
> On Thu, Jul 4, 2019 at 1:58 AM Quentin Monnet
>  wrote:
>>
>> Add a new "bpftool prog run" subcommand to run a loaded program on input
>> data (and possibly with input context) passed by the user.
>>
>> Print output data (and output context if relevant) into a file or into
>> the console. Print return value and duration for the test run into the
>> console.
>>
>> A "repeat" argument can be passed to run the program several times in a
>> row.
>>
>> The command does not perform any kind of verification based on program
>> type (Is this program type allowed to use an input context?) or on data
>> consistency (Can I work with empty input data?), this is left to the
>> kernel.
>>
>> Example invocation:
>>
>> # perl -e 'print "\x0" x 14' | ./bpftool prog run \
>> pinned /sys/fs/bpf/sample_ret0 \
>> data_in - data_out - repeat 5
>> 000         |  ..
>> Return value: 0, duration (average): 260ns
>>
>> When one of data_in or ctx_in is "-", bpftool reads from standard input,
>> in binary format. Other formats (JSON, hexdump) might be supported (via
>> an optional command line keyword like "data_fmt_in") in the future if
>> relevant, but this would require doing more parsing in bpftool.
>>
>> Signed-off-by: Quentin Monnet 
>> Reviewed-by: Jakub Kicinski 
>> ---

 [...]

>> diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
>> index 9b0db5d14e31..8dcbaa0a8ab1 100644
>> --- a/tools/bpf/bpftool/prog.c
>> +++ b/tools/bpf/bpftool/prog.c
>> @@ -15,6 +15,7 @@
>>  #include 
>>
>>  #include 
>> +#include 
>>
>>  #include 
>>  #include 
>> @@ -748,6 +749,344 @@ static int do_detach(int argc, char **argv)
>> return 0;
>>  }
>>
>> +static int check_single_stdin(char *file_in, char *other_file_in)
>> +{
>> +   if (file_in && other_file_in &&
>> +   !strcmp(file_in, "-") && !strcmp(other_file_in, "-")) {
>> +   p_err("cannot use standard input for both data_in and 
>> ctx_in");
>
> The error message says data_in and ctx_in.
> Maybe the input parameter should be file_data_in and file_ctx_in?


 Hi Yonghong,

 It's true those parameters should be file names. But having
 "file_data_in", "file_data_out", "file_ctx_in" and "file_ctx_out" on a
 command line seems a bit heavy to me? (And relying on keyword prefixing
 for typing the command won't help much.)

 My opinion is that it should be clear from the man page or the "help"
 command that the parameters are file names. What do you think? I can
 prefix all four arguments with "file_" if you believe this is better.
>>>
>>> I think you misunderstood my question above.
>>
>> Totally did, sorry :/.
>>
>>> The command line parameters are fine.
>>> I am talking about the function parameter names. Since in the error message,
>>> the input parameters are referred to as data_in and ctx_in
>>>p_err("cannot use standard input for both data_in and ctx_in")
>>> maybe the function signature should be
>>>   static int check_single_stdin(char *file_data_in, char *file_ctx_in)
>>>
>>> If you are worried that later on the same function can be used in different
>>> contexts, then alternatively, you can have signature like
>>>   static int check_single_stdin(char *file_in, char *other_file_in,
>>> const char *file_in_arg, const char *other_file_in_arg)
>>> where file_in_arg will be passed in as "data_in" and other_file_in_arg
>>> as "ctx_in".
>>> I think we could delay this until it is really needed.
>>
>> As a matter of fact, the opposite thing happened. I first used the
>> function for data_in/ctx_in, and also for data_out/ctx_out. But I
>> changed my mind eventually because there is no real reason not to print
>> both data_out and ctx_out to stdout if we want to do so. So I updated
>> the name of the parameters in the error messages, but forgot to change
>> the arguments for the function. Silly me.
>>
>> So I totally agree, I'll respin and change the argument names for the
>> function. And yes, we could also pass the names to print in the error
>> message, but I agree that this is not needed, and not helpful at the moment.
>>
>> Thanks for catching this!
>>

 [...]

>> +static int do_run(int argc, char **argv)
>> +{
>> +   char *data_fname_in = NULL, *data_fname_out = NULL;
>> +   char *ctx_fname_in = NULL, *ctx_fname_out = NULL;
>> +   struct bpf_prog_test_run_attr test_attr = {0};
>> +

[PATCH bpf-next v2] tools: bpftool: add "prog run" subcommand to test-run programs

2019-07-05 Thread Quentin Monnet

Add a new "bpftool prog run" subcommand to run a loaded program on input
data (and possibly with input context) passed by the user.

Print output data (and output context if relevant) into a file or into
the console. Print return value and duration for the test run into the
console.

A "repeat" argument can be passed to run the program several times in a
row.

The command does not perform any kind of verification based on program
type (Is this program type allowed to use an input context?) or on data
consistency (Can I work with empty input data?), this is left to the
kernel.

Example invocation:

# perl -e 'print "\x0" x 14' | ./bpftool prog run \
pinned /sys/fs/bpf/sample_ret0 \
data_in - data_out - repeat 5
000         |  ..
Return value: 0, duration (average): 260ns

When one of data_in or ctx_in is "-", bpftool reads from standard input,
in binary format. Other formats (JSON, hexdump) might be supported (via
an optional command line keyword like "data_fmt_in") in the future if
relevant, but this would require doing more parsing in bpftool.

v2:
- Fix argument names for function check_single_stdin(). (Yonghong)

Signed-off-by: Quentin Monnet 
Reviewed-by: Jakub Kicinski 
---
 .../bpftool/Documentation/bpftool-prog.rst|  34 ++
 tools/bpf/bpftool/bash-completion/bpftool |  28 +-
 tools/bpf/bpftool/main.c  |  29 ++
 tools/bpf/bpftool/main.h  |   1 +
 tools/bpf/bpftool/prog.c  | 348 +-
 tools/include/linux/sizes.h   |  48 +++
 6 files changed, 485 insertions(+), 3 deletions(-)
 create mode 100644 tools/include/linux/sizes.h

diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst 
b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
index 1df637f85f94..7a374b3c851d 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
@@ -29,6 +29,7 @@ PROG COMMANDS
 |  **bpftool** **prog attach** *PROG* *ATTACH_TYPE* [*MAP*]
 |  **bpftool** **prog detach** *PROG* *ATTACH_TYPE* [*MAP*]
 |  **bpftool** **prog tracelog**
+|  **bpftool** **prog run** *PROG* **data_in** *FILE* [**data_out** *FILE* 
[**data_size_out** *L*]] [**ctx_in** *FILE* [**ctx_out** *FILE* 
[**ctx_size_out** *M*]]] [**repeat** *N*]
 |  **bpftool** **prog help**
 |
 |  *MAP* := { **id** *MAP_ID* | **pinned** *FILE* }
@@ -146,6 +147,39 @@ DESCRIPTION
  streaming data from BPF programs to user space, one can use
  perf events (see also **bpftool-map**\ (8)).
 
+   **bpftool prog run** *PROG* **data_in** *FILE* [**data_out** *FILE* 
[**data_size_out** *L*]] [**ctx_in** *FILE* [**ctx_out** *FILE* 
[**ctx_size_out** *M*]]] [**repeat** *N*]
+ Run BPF program *PROG* in the kernel testing infrastructure
+ for BPF, meaning that the program works on the data and
+ context provided by the user, and not on actual packets or
+ monitored functions etc. Return value and duration for the
+ test run are printed out to the console.
+
+ Input data is read from the *FILE* passed with **data_in**.
+ If this *FILE* is "**-**", input data is read from standard
+ input. Input context, if any, is read from *FILE* passed with
+ **ctx_in**. Again, "**-**" can be used to read from standard
+ input, but only if standard input is not already in use for
+ input data. If a *FILE* is passed with **data_out**, output
+ data is written to that file. Similarly, output context is
+ written to the *FILE* passed with **ctx_out**. For both
+ output flows, "**-**" can be used to print to the standard
+ output (as plain text, or JSON if relevant option was
+ passed). If output keywords are omitted, output data and
+ context are discarded. Keywords **data_size_out** and
+ **ctx_size_out** are used to pass the size (in bytes) for the
+ output buffers to the kernel, although the default of 32 kB
+ should be more than enough for most cases.
+
+ Keyword **repeat** is used to indicate the number of
+ consecutive runs to perform. Note that output data and
+ context printed to files correspond to the last of those
+ runs. The duration printed out at the end of the runs is an
+ average over all runs performed by the command.
+
+ Not all program types support test run. Among those which do,
+ not all of them can take the **ctx_in**/**ctx_out**
+ arguments. bpftool does not perform checks on program types.
+
**bpftool prog help**
  Print sh

[PATCH bpf-next 1/3] include/bpf.h: Remove map_insert_ctx() stubs

2019-07-05 Thread Toke Høiland-Jørgensen

From: Toke Høiland-Jørgensen 

When we changed the device and CPU maps to use linked lists instead of
bitmaps, we also removed the need for the map_insert_ctx() helpers to keep
track of the bitmaps inside each map. However, it seems I forgot to remove
the function definition stubs, so remove those here.

Signed-off-by: Toke Høiland-Jørgensen 
---
 include/linux/bpf.h |   10 --
 1 file changed, 10 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 18f4cc2c6acd..bfdb54dd2ad1 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -713,7 +713,6 @@ struct xdp_buff;
 struct sk_buff;
 
 struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key);
-void __dev_map_insert_ctx(struct bpf_map *map, u32 index);
 void __dev_map_flush(struct bpf_map *map);
 int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
struct net_device *dev_rx);
@@ -721,7 +720,6 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, 
struct sk_buff *skb,
 struct bpf_prog *xdp_prog);
 
 struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key);
-void __cpu_map_insert_ctx(struct bpf_map *map, u32 index);
 void __cpu_map_flush(struct bpf_map *map);
 int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
struct net_device *dev_rx);
@@ -801,10 +799,6 @@ static inline struct net_device  
*__dev_map_lookup_elem(struct bpf_map *map,
return NULL;
 }
 
-static inline void __dev_map_insert_ctx(struct bpf_map *map, u32 index)
-{
-}
-
 static inline void __dev_map_flush(struct bpf_map *map)
 {
 }
@@ -834,10 +828,6 @@ struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct 
bpf_map *map, u32 key)
return NULL;
 }
 
-static inline void __cpu_map_insert_ctx(struct bpf_map *map, u32 index)
-{
-}
-
 static inline void __cpu_map_flush(struct bpf_map *map)
 {
 }

[PATCH bpf-next 2/3] xdp: Refactor devmap allocation code for reuse

2019-07-05 Thread Toke Høiland-Jørgensen

From: Toke Høiland-Jørgensen 

The subsequent patch to add a new devmap sub-type can re-use much of the
initialisation and allocation code, so refactor it into separate functions.

Signed-off-by: Toke Høiland-Jørgensen 
---
 kernel/bpf/devmap.c |  137 +++
 1 file changed, 84 insertions(+), 53 deletions(-)

diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index d83cf8ccc872..a2fe16362129 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -60,7 +60,7 @@ struct xdp_bulk_queue {
 struct bpf_dtab_netdev {
struct net_device *dev; /* must be first member, due to tracepoint */
struct bpf_dtab *dtab;
-   unsigned int bit;
+   unsigned int idx; /* keep track of map index for tracepoint */
struct xdp_bulk_queue __percpu *bulkq;
struct rcu_head rcu;
 };
@@ -75,28 +75,22 @@ struct bpf_dtab {
 static DEFINE_SPINLOCK(dev_map_lock);
 static LIST_HEAD(dev_map_list);
 
-static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
+static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr,
+   bool check_memlock)
 {
-   struct bpf_dtab *dtab;
int err, cpu;
u64 cost;
 
-   if (!capable(CAP_NET_ADMIN))
-   return ERR_PTR(-EPERM);
-
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 4 ||
attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
-   return ERR_PTR(-EINVAL);
+   return -EINVAL;
 
/* Lookup returns a pointer straight to dev->ifindex, so make sure the
 * verifier prevents writes from the BPF side
 */
attr->map_flags |= BPF_F_RDONLY_PROG;
 
-   dtab = kzalloc(sizeof(*dtab), GFP_USER);
-   if (!dtab)
-   return ERR_PTR(-ENOMEM);
 
bpf_map_init_from_attr(&dtab->map, attr);
 
@@ -107,9 +101,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
/* if map size is larger than memlock limit, reject it */
err = bpf_map_charge_init(&dtab->map.memory, cost);
if (err)
-   goto free_dtab;
-
-   err = -ENOMEM;
+   return -EINVAL;
 
dtab->flush_list = alloc_percpu(struct list_head);
if (!dtab->flush_list)
@@ -124,19 +116,38 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
if (!dtab->netdev_map)
goto free_percpu;
 
-   spin_lock(&dev_map_lock);
-   list_add_tail_rcu(&dtab->list, &dev_map_list);
-   spin_unlock(&dev_map_lock);
-
-   return &dtab->map;
+   return 0;
 
 free_percpu:
free_percpu(dtab->flush_list);
 free_charge:
bpf_map_charge_finish(&dtab->map.memory);
-free_dtab:
-   kfree(dtab);
-   return ERR_PTR(err);
+   return -ENOMEM;
+}
+
+static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
+{
+   struct bpf_dtab *dtab;
+   int err;
+
+   if (!capable(CAP_NET_ADMIN))
+   return ERR_PTR(-EPERM);
+
+   dtab = kzalloc(sizeof(*dtab), GFP_USER);
+   if (!dtab)
+   return ERR_PTR(-ENOMEM);
+
+   err = dev_map_init_map(dtab, attr, true);
+   if (err) {
+   kfree(dtab);
+   return ERR_PTR(err);
+   }
+
+   spin_lock(&dev_map_lock);
+   list_add_tail_rcu(&dtab->list, &dev_map_list);
+   spin_unlock(&dev_map_lock);
+
+   return &dtab->map;
 }
 
 static void dev_map_free(struct bpf_map *map)
@@ -235,7 +246,7 @@ static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags,
 out:
bq->count = 0;
 
-   trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit,
+   trace_xdp_devmap_xmit(&obj->dtab->map, obj->idx,
  sent, drops, bq->dev_rx, dev, err);
bq->dev_rx = NULL;
__list_del_clearprev(&bq->flush_node);
@@ -412,17 +423,52 @@ static int dev_map_delete_elem(struct bpf_map *map, void 
*key)
return 0;
 }
 
-static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
-   u64 map_flags)
+static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
+   struct bpf_dtab *dtab,
+   u32 ifindex,
+   unsigned int idx)
 {
-   struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
-   struct net *net = current->nsproxy->net_ns;
gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
+   struct bpf_dtab_netdev *dev;
+   struct xdp_bulk_queue *bq;
+   int cpu;
+
+   dev = kmalloc_node(sizeof(*dev), gfp, dtab->map.numa_node);
+   if (!dev)
+   return ERR_PTR(-ENOMEM);
+
+   dev->bulkq = __alloc_percpu_gfp(sizeof(*dev->bulkq),
+   sizeof(void *), gfp);
+   if (!dev->bulkq) {
+   kfree(dev);
+   return ERR_PTR(-ENOMEM);

[PATCH bpf-next 0/3] xdp: Add devmap_hash map type

2019-07-05 Thread Toke Høiland-Jørgensen

This series adds a new map type, devmap_hash, that works like the existing
devmap type, but using a hash-based indexing scheme. This is useful for the use
case where a devmap is indexed by ifindex (for instance for use with the routing
table lookup helper). For this use case, the regular devmap needs to be sized
after the maximum ifindex number, not the number of devices in it. A hash-based
indexing scheme makes it possible to size the map after the number of devices it
should contain instead.

This was previously part of my patch series that also turned the regular
bpf_redirect() helper into a map-based one; for this series I just pulled out
the patches that introduced the new map type.

Changelog:

Changes to these patches since the previous series:

- Rebase on top of the other devmap changes (makes this one simpler!)
- Don't enforce key==val, but allow arbitrary indexes.
- Rename the type to devmap_hash to reflect the fact that it's just a hashmap 
now.

---

Toke Høiland-Jørgensen (3):
  include/bpf.h: Remove map_insert_ctx() stubs
  xdp: Refactor devmap allocation code for reuse
  xdp: Add devmap_hash map type for looking up devices by hashed index


 include/linux/bpf.h |   11 -
 include/linux/bpf_types.h   |1 
 include/trace/events/xdp.h  |3 
 include/uapi/linux/bpf.h|7 -
 kernel/bpf/devmap.c |  325 ++-
 kernel/bpf/verifier.c   |2 
 net/core/filter.c   |9 +
 tools/bpf/bpftool/map.c |1 
 tools/include/uapi/linux/bpf.h  |7 -
 tools/lib/bpf/libbpf_probes.c   |1 
 tools/testing/selftests/bpf/test_maps.c |   16 ++
 11 files changed, 316 insertions(+), 67 deletions(-)

[PATCH bpf-next 3/3] xdp: Add devmap_hash map type for looking up devices by hashed index

2019-07-05 Thread Toke Høiland-Jørgensen

From: Toke Høiland-Jørgensen 

A common pattern when using xdp_redirect_map() is to create a device map
where the lookup key is simply ifindex. Because device maps are arrays,
this leaves holes in the map, and the map has to be sized to fit the
largest ifindex, regardless of how many devices are actually
needed in the map.

This patch adds a second type of device map where the key is looked up
using a hashmap, instead of being used as an array index. This allows maps
to be densely packed, so they can be smaller.

Signed-off-by: Toke Høiland-Jørgensen 
---
 include/linux/bpf.h |7 +
 include/linux/bpf_types.h   |1 
 include/trace/events/xdp.h  |3 
 include/uapi/linux/bpf.h|7 +
 kernel/bpf/devmap.c |  192 +++
 kernel/bpf/verifier.c   |2 
 net/core/filter.c   |9 +
 tools/bpf/bpftool/map.c |1 
 tools/include/uapi/linux/bpf.h  |7 +
 tools/lib/bpf/libbpf_probes.c   |1 
 tools/testing/selftests/bpf/test_maps.c |   16 +++
 11 files changed, 237 insertions(+), 9 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index bfdb54dd2ad1..f9a506147c8a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -713,6 +713,7 @@ struct xdp_buff;
 struct sk_buff;
 
 struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key);
+struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 
key);
 void __dev_map_flush(struct bpf_map *map);
 int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
struct net_device *dev_rx);
@@ -799,6 +800,12 @@ static inline struct net_device  
*__dev_map_lookup_elem(struct bpf_map *map,
return NULL;
 }
 
+static inline struct net_device  *__dev_map_hash_lookup_elem(struct bpf_map 
*map,
+u32 key)
+{
+   return NULL;
+}
+
 static inline void __dev_map_flush(struct bpf_map *map)
 {
 }
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index eec5aeeeaf92..36a9c2325176 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -62,6 +62,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY_OF_MAPS, 
array_of_maps_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops)
 #ifdef CONFIG_NET
 BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops)
+BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP_HASH, dev_map_hash_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_SK_STORAGE, sk_storage_map_ops)
 #if defined(CONFIG_BPF_STREAM_PARSER)
 BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops)
diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
index 68899fdc985b..8c8420230a10 100644
--- a/include/trace/events/xdp.h
+++ b/include/trace/events/xdp.h
@@ -175,7 +175,8 @@ struct _bpf_dtab_netdev {
 #endif /* __DEVMAP_OBJ_TYPE */
 
 #define devmap_ifindex(fwd, map)   \
-   ((map->map_type == BPF_MAP_TYPE_DEVMAP) ?   \
+   ((map->map_type == BPF_MAP_TYPE_DEVMAP ||   \
+ map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) ?  \
  ((struct _bpf_dtab_netdev *)fwd)->dev->ifindex : 0)
 
 #define _trace_xdp_redirect_map(dev, xdp, fwd, map, idx)   \
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index ead27aebf491..05ce55dd366a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -134,6 +134,7 @@ enum bpf_map_type {
BPF_MAP_TYPE_QUEUE,
BPF_MAP_TYPE_STACK,
BPF_MAP_TYPE_SK_STORAGE,
+   BPF_MAP_TYPE_DEVMAP_HASH,
 };
 
 /* Note that tracing related programs such as
@@ -879,14 +880,14 @@ union bpf_attr {
  *
  * int ret;
  * struct bpf_tunnel_key key = {};
- * 
+ *
  * ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0);
  * if (ret < 0)
  * return TC_ACT_SHOT; // drop packet
- * 
+ *
  * if (key.remote_ipv4 != 0x0a01)
  * return TC_ACT_SHOT; // drop packet
- * 
+ *
  * return TC_ACT_OK;   // accept packet
  *
  * This interface can also be used with all encapsulation devices
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index a2fe16362129..341af02f049d 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -37,6 +37,12 @@
  * notifier hook walks the map we know that new dev references can not be
  * added by the user because core infrastructure ensures dev_get_by_index()
  * calls will fail at this point.
+ *
+ * The devmap_hash type is a map type which interprets keys as ifindexes and
+ * indexes these using a hashmap. This allows maps that use ifindex as key to 
be
+ * densely packed instead of having holes in the looku

Re: [PATCH bpf-next v2] tools: bpftool: add "prog run" subcommand to test-run programs

2019-07-05 Thread Y Song

On Fri, Jul 5, 2019 at 10:54 AM Quentin Monnet
 wrote:
>
> Add a new "bpftool prog run" subcommand to run a loaded program on input
> data (and possibly with input context) passed by the user.
>
> Print output data (and output context if relevant) into a file or into
> the console. Print return value and duration for the test run into the
> console.
>
> A "repeat" argument can be passed to run the program several times in a
> row.
>
> The command does not perform any kind of verification based on program
> type (Is this program type allowed to use an input context?) or on data
> consistency (Can I work with empty input data?), this is left to the
> kernel.
>
> Example invocation:
>
> # perl -e 'print "\x0" x 14' | ./bpftool prog run \
> pinned /sys/fs/bpf/sample_ret0 \
> data_in - data_out - repeat 5
> 000         |  ..
> Return value: 0, duration (average): 260ns
>
> When one of data_in or ctx_in is "-", bpftool reads from standard input,
> in binary format. Other formats (JSON, hexdump) might be supported (via
> an optional command line keyword like "data_fmt_in") in the future if
> relevant, but this would require doing more parsing in bpftool.
>
> v2:
> - Fix argument names for function check_single_stdin(). (Yonghong)
>
> Signed-off-by: Quentin Monnet 
> Reviewed-by: Jakub Kicinski 

Acked-by: Yonghong Song

[net-next] net: fib_rules: do not flow dissect local packets

2019-07-05 Thread Petar Penkov

Rules matching on loopback iif do not need early flow dissection as the
packet originates from the host. Stop counting such rules in
fib_rule_requires_fldissect().

Signed-off-by: Petar Penkov 
---
 include/net/fib_rules.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h
index eba8465e1d86..20dcadd8eed9 100644
--- a/include/net/fib_rules.h
+++ b/include/net/fib_rules.h
@@ -180,9 +180,9 @@ static inline bool fib_rule_port_range_compare(struct 
fib_rule_port_range *a,
 
 static inline bool fib_rule_requires_fldissect(struct fib_rule *rule)
 {
-   return rule->ip_proto ||
+   return rule->iifindex != LOOPBACK_IFINDEX && (rule->ip_proto ||
fib_rule_port_range_set(&rule->sport_range) ||
-   fib_rule_port_range_set(&rule->dport_range);
+   fib_rule_port_range_set(&rule->dport_range));
 }
 
 struct fib_rules_ops *fib_rules_register(const struct fib_rules_ops *,
-- 
2.22.0.410.gd8fdbe21b5-goog

[PATCH] coallocate socket_wq with socket itself

2019-07-05 Thread Al Viro

socket->wq is assign-once, set when we are initializing both
struct socket it's in and struct socket_wq it points to.  As a
matter of fact, the only reason for separate allocation was the
ability to RCU-delay freeing of socket_wq.  RCU-delaying the
freeing of socket itself gets rid of that need, so we can just
fold struct socket_wq into the end of struct socket and simplify
the life both for sock_alloc_inode() (one allocation instead of
two) and for tun/tap oddballs, where we used to embed struct socket
and struct socket_wq into the same structure (now - embedding just
the struct socket).

Note that reference to struct socket_wq in struct sock does remain
a reference - that's unchanged.

Signed-off-by: Al Viro 
---
 drivers/net/tap.c  |  5 ++---
 drivers/net/tun.c  |  8 +++-
 include/linux/if_tap.h |  1 -
 include/linux/net.h|  4 ++--
 include/net/sock.h |  4 ++--
 net/core/sock.c|  2 +-
 net/socket.c   | 19 +--
 7 files changed, 15 insertions(+), 28 deletions(-)

diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index 8e01390c738e..dd614c2cd994 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -520,8 +520,7 @@ static int tap_open(struct inode *inode, struct file *file)
goto err;
}
 
-   RCU_INIT_POINTER(q->sock.wq, &q->wq);
-   init_waitqueue_head(&q->wq.wait);
+   init_waitqueue_head(&q->sock.wq.wait);
q->sock.type = SOCK_RAW;
q->sock.state = SS_CONNECTED;
q->sock.file = file;
@@ -579,7 +578,7 @@ static __poll_t tap_poll(struct file *file, poll_table 
*wait)
goto out;
 
mask = 0;
-   poll_wait(file, &q->wq.wait, wait);
+   poll_wait(file, &q->sock.wq.wait, wait);
 
if (!ptr_ring_empty(&q->ring))
mask |= EPOLLIN | EPOLLRDNORM;
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index d7c55e0fa8f4..3d443597bd04 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -160,7 +160,6 @@ struct tun_pcpu_stats {
 struct tun_file {
struct sock sk;
struct socket socket;
-   struct socket_wq wq;
struct tun_struct __rcu *tun;
struct fasync_struct *fasync;
/* only used for fasnyc */
@@ -2165,7 +2164,7 @@ static void *tun_ring_recv(struct tun_file *tfile, int 
noblock, int *err)
goto out;
}
 
-   add_wait_queue(&tfile->wq.wait, &wait);
+   add_wait_queue(&tfile->socket.wq.wait, &wait);
 
while (1) {
set_current_state(TASK_INTERRUPTIBLE);
@@ -2185,7 +2184,7 @@ static void *tun_ring_recv(struct tun_file *tfile, int 
noblock, int *err)
}
 
__set_current_state(TASK_RUNNING);
-   remove_wait_queue(&tfile->wq.wait, &wait);
+   remove_wait_queue(&tfile->socket.wq.wait, &wait);
 
 out:
*err = error;
@@ -3415,8 +3414,7 @@ static int tun_chr_open(struct inode *inode, struct file 
* file)
tfile->flags = 0;
tfile->ifindex = 0;
 
-   init_waitqueue_head(&tfile->wq.wait);
-   RCU_INIT_POINTER(tfile->socket.wq, &tfile->wq);
+   init_waitqueue_head(&tfile->socket.wq.wait);
 
tfile->socket.file = file;
tfile->socket.ops = &tun_socket_ops;
diff --git a/include/linux/if_tap.h b/include/linux/if_tap.h
index 8e66866c11be..915a187cfabd 100644
--- a/include/linux/if_tap.h
+++ b/include/linux/if_tap.h
@@ -62,7 +62,6 @@ struct tap_dev {
 struct tap_queue {
struct sock sk;
struct socket sock;
-   struct socket_wq wq;
int vnet_hdr_sz;
struct tap_dev __rcu *tap;
struct file *file;
diff --git a/include/linux/net.h b/include/linux/net.h
index f7d672cf25b5..9cafb5f353a9 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -116,11 +116,11 @@ struct socket {
 
unsigned long   flags;
 
-   struct socket_wq*wq;
-
struct file *file;
struct sock *sk;
const struct proto_ops  *ops;
+
+   struct socket_wqwq;
 };
 
 struct vm_area_struct;
diff --git a/include/net/sock.h b/include/net/sock.h
index 6cbc16136357..228db3998e46 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1822,7 +1822,7 @@ static inline void sock_graft(struct sock *sk, struct 
socket *parent)
 {
WARN_ON(parent->sk);
write_lock_bh(&sk->sk_callback_lock);
-   rcu_assign_pointer(sk->sk_wq, parent->wq);
+   rcu_assign_pointer(sk->sk_wq, &parent->wq);
parent->sk = sk;
sk_set_socket(sk, parent);
sk->sk_uid = SOCK_INODE(parent)->i_uid;
@@ -2100,7 +2100,7 @@ static inline void sock_poll_wait(struct file *filp, 
struct socket *sock,
  poll_table *p)
 {
if (!poll_does_not_wait(p)) {
-   poll_wait(filp, &sock->wq->wait, p);
+   poll_wait(filp, &sock->wq.wait, p);
/* We need to be sure we are in sync with the
 * socket flags modification.
 *
diff -

[PATCH] sockfs: switch to ->free_inode()

2019-07-05 Thread Al Viro

we do have an RCU-delayed part there already (freeing the wq),
so it's not like the pipe situation; moreover, it might be
worth considering coallocating wq with the rest of struct socket_alloc.
->sk_wq in struct sock would remain a pointer as it is, but
the object it normally points to would be coallocated with
struct socket...

Signed-off-by: Al Viro 
---
 net/socket.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/socket.c b/net/socket.c
index d97b74f762e8..541719a2443d 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -258,12 +258,12 @@ static struct inode *sock_alloc_inode(struct super_block 
*sb)
return &ei->vfs_inode;
 }
 
-static void sock_destroy_inode(struct inode *inode)
+static void sock_free_inode(struct inode *inode)
 {
struct socket_alloc *ei;
 
ei = container_of(inode, struct socket_alloc, vfs_inode);
-   kfree_rcu(ei->socket.wq, rcu);
+   kfree(ei->socket.wq);
kmem_cache_free(sock_inode_cachep, ei);
 }
 
@@ -288,7 +288,7 @@ static void init_inodecache(void)
 
 static const struct super_operations sockfs_ops = {
.alloc_inode= sock_alloc_inode,
-   .destroy_inode  = sock_destroy_inode,
+   .free_inode = sock_free_inode,
.statfs = simple_statfs,
 };
 
-- 
2.11.0

Re: [PATCH net-next v2 1/3] devlink: Introduce PCI PF port flavour and port attribute

2019-07-05 Thread Jakub Kicinski

On Fri,  5 Jul 2019 02:37:09 -0500, Parav Pandit wrote:
> @@ -38,14 +38,24 @@ struct devlink {
>   char priv[0] __aligned(NETDEV_ALIGN);
>  };
>  
> +struct devlink_port_pci_pf_attrs {
> + u16 pf; /* Associated PCI PF for this port. */
> +};
> +
>  struct devlink_port_attrs {
>   u8 set:1,
>  split:1,
>  switch_port:1;
>   enum devlink_port_flavour flavour;
> - u32 port_number; /* same value as "split group" */
> + u32 port_number; /* same value as "split group".
> +   * Valid only when a port is physical and visible
> +   * to the user for a given port flavour.
> +   */

port_number can be in the per-flavour union below.

>   u32 split_subport_number;

As can split_subport_number.

>   struct netdev_phys_item_id switch_id;
> + union {
> + struct devlink_port_pci_pf_attrs pci_pf;
> + };
>  };
>  
>  struct devlink_port {

> @@ -515,8 +523,14 @@ static int devlink_nl_port_attrs_put(struct sk_buff *msg,
>   return 0;
>   if (nla_put_u16(msg, DEVLINK_ATTR_PORT_FLAVOUR, attrs->flavour))
>   return -EMSGSIZE;
> - if (nla_put_u32(msg, DEVLINK_ATTR_PORT_NUMBER, attrs->port_number))
> + if (is_devlink_phy_port_num_supported(devlink_port) &&
> + nla_put_u32(msg, DEVLINK_ATTR_PORT_NUMBER, attrs->port_number))
>   return -EMSGSIZE;
> + if (devlink_port->attrs.flavour == DEVLINK_PORT_FLAVOUR_PCI_PF) {
> + if (nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER,
> + attrs->pci_pf.pf))
> + return -EMSGSIZE;
> + }
>   if (!attrs->split)
>   return 0;
>   if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_GROUP, attrs->port_number))

Split attributes as well, please:

On Tue, 2 Jul 2019 16:42:52 -0700, Jakub Kicinski wrote:
> port_number, and split attributes should not be exposed for PCI ports.

We have no clear semantics for those, yet, and the phys_port_name
implementation in this patch doesn't handle split PCI, so let's leave
them out for now.

Re: [EXT] Re: [PATCH net-next v2 4/4] qed*: Add devlink support for configuration attributes.

2019-07-05 Thread Jakub Kicinski

On Fri, 5 Jul 2019 08:22:41 +, Sudarsana Reddy Kalluru wrote:
> > On Thu, 4 Jul 2019 06:20:11 -0700, Sudarsana Reddy Kalluru wrote:  
> > > This patch adds implementation for devlink callbacks for reading and
> > > configuring the device attributes.
> > >
> > > Signed-off-by: Sudarsana Reddy Kalluru 
> > > Signed-off-by: Ariel Elior 

> > > diff --git a/Documentation/networking/devlink-params-qede.txt
> > > b/Documentation/networking/devlink-params-qede.txt
> > > new file mode 100644
> > > index 000..f78a993
> > > --- /dev/null
> > > +++ b/Documentation/networking/devlink-params-qede.txt
> > > @@ -0,0 +1,72 @@
> > > +enable_sriov [DEVICE, GENERIC]
> > > + Configuration mode: Permanent
> > > +
> > > +iwarp_cmt[DEVICE, DRIVER-SPECIFIC]
> > > + Enable iWARP support over 100G device (CMT mode).  
> > > + Type: Boolean
> > > + Configuration mode: runtime
> > > +
> > > +entity_id[DEVICE, DRIVER-SPECIFIC]
> > > + Set the entity ID value to be used for this device
> > > + while reading/configuring the devlink attributes.
> > > + Type: u8
> > > + Configuration mode: runtime  
> > 
> > Can you explain what this is?  
>
> Hardware/mfw provides the option to modify/read the config of other
> PFs. A non-zero entity id represents a partition number (or simply a
> PF-id) for which the config needs to be read/updated.

Having a parameter which changes the interpretation of other parameters
makes me quite uncomfortable :(  Could it be a better idea, perhaps, to
use PCI ports?  We have been discussing PCI ports for a while now, and
they will probably become a reality soon.  You could then hang the
per-PF parameters off of the PF ports rather than the device instance? 

> > > +device_capabilities  [DEVICE, DRIVER-SPECIFIC]
> > > + Set the entity ID value to be used for this device
> > > + while reading/configuring the devlink attributes.
> > > + Type: u8
> > > + Configuration mode: runtime  
> > 
> > Looks like you copied the previous text here.  
> Will update it, thanks.
> 
> >   
> > > +mf_mode  [DEVICE, DRIVER-SPECIFIC]
> > > + Configure Multi Function mode for the device.
> > > + Supported MF modes and the associated values are,
> > > + MF allowed(0), Default(1), SPIO4(2), NPAR1.0(3),
> > > + NPAR1.5(4), NPAR2.0(5), BD(6) and UFP(7)  
> > 
> > NPAR should have a proper API in devlink port, what are the other modes?
> >   
> These are the different modes supported by the Marvell NIC. In our
> case the mf_mode is per adapter basis, e.g., it's not possible to
> configure one port in NPAR mode and the other in Default mode.

Jiri, what are your thoughts on the NPAR support?  It is effectively a
PCI split.  If we are going to support mdev split, should we perhaps
have a "depth" or "type" of split and allow for users to configure it
using the same API?

> > > + Type: u8
> > > + Configuration mode: Permanent
> > > +
> > > +dcbx_mode[PORT, DRIVER-SPECIFIC]
> > > + Configure DCBX mode for the device.
> > > + Supported dcbx modes are,
> > > + Disabled(0), IEEE(1), CEE(2) and
> > > Dynamic(3)
> > > + Type: u8
> > > + Configuration mode: Permanent  
> > 
> > Why is this a permanent parameter?
> >   
> This specifies the dcbx_mode to be configured in non-volatile memory.
> The value is persistent and is used in the next load of OS or the mfw.

And it can't be changed at runtime?

1 2 >

1 - 100 of 168 matches

Mail list logo