date:20210108

Re: [PATCH 05/10] dma: tx49 removal

2021-01-08 Thread Vinod Koul

On 07-01-21, 17:40, Thomas Bogendoerfer wrote:
> On Wed, Jan 06, 2021 at 11:10:38AM -0800, Joe Perches wrote:
> > On Tue, 2021-01-05 at 15:02 +0100, Thomas Bogendoerfer wrote:
> > > Signed-off-by: Thomas Bogendoerfer 
> > []
> > > diff --git a/drivers/dma/txx9dmac.h b/drivers/dma/txx9dmac.h
> > []
> > > @@ -26,11 +26,6 @@
> > >   * DMA channel.
> > >   */
> > >  
> > > 
> > > -#ifdef CONFIG_MACH_TX49XX
> > > -static inline bool txx9_dma_have_SMPCHN(void)
> > > -{
> > > - return true;
> > > -}
> > >  #define TXX9_DMA_USE_SIMPLE_CHAIN
> > >  #else
> > >  static inline bool txx9_dma_have_SMPCHN(void)
> > 
> > This doesn't look like it compiles as there's now an #else
> > without an #if
> 
> you are right, no idea what I had in mind while doing that.
> 
> Vinod,
> 
> as this patch series found a still active user of the platform,
> could you drop the patch from your tree, or do you want a revert
> from me ?

Dropped now

-- 
~Vinod

Re: [PATCH v1] vdpa/mlx5: Fix memory key MTT population

2021-01-08 Thread Jason Wang




On 2021/1/7 下午3:18, Eli Cohen wrote:

map_direct_mr() assumed that the number of scatter/gather entries
returned by dma_map_sg_attrs() was equal to the number of segments in
the sgl list. This led to wrong population of the mkey object. Fix this
by properly referring to the returned value.

The hardware expects each MTT entry to contain the DMA address of a
contiguous block of memory of size (1 << mr->log_size) bytes.
dma_map_sg_attrs() can coalesce several sg entries into a single
scatter/gather entry of contiguous DMA range so we need to scan the list
and refer to the size of each s/g entry.

In addition, get rid of fill_sg() which effect is overwritten by
populate_mtts().

Fixes: 94abbccdf291 ("vdpa/mlx5: Add shared memory registration code")
Signed-off-by: Eli Cohen 
---
V0->V1:
1. Fix typos
2. Improve changelog



Acked-by: Jason Wang 




  drivers/vdpa/mlx5/core/mlx5_vdpa.h |  1 +
  drivers/vdpa/mlx5/core/mr.c| 28 
  2 files changed, 13 insertions(+), 16 deletions(-)

diff --git a/drivers/vdpa/mlx5/core/mlx5_vdpa.h 
b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
index 5c92a576edae..08f742fd2409 100644
--- a/drivers/vdpa/mlx5/core/mlx5_vdpa.h
+++ b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
@@ -15,6 +15,7 @@ struct mlx5_vdpa_direct_mr {
struct sg_table sg_head;
int log_size;
int nsg;
+   int nent;
struct list_head list;
u64 offset;
  };
diff --git a/drivers/vdpa/mlx5/core/mr.c b/drivers/vdpa/mlx5/core/mr.c
index 4b6195666c58..d300f799efcd 100644
--- a/drivers/vdpa/mlx5/core/mr.c
+++ b/drivers/vdpa/mlx5/core/mr.c
@@ -25,17 +25,6 @@ static int get_octo_len(u64 len, int page_shift)
return (npages + 1) / 2;
  }
  
-static void fill_sg(struct mlx5_vdpa_direct_mr *mr, void *in)

-{
-   struct scatterlist *sg;
-   __be64 *pas;
-   int i;
-
-   pas = MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
-   for_each_sg(mr->sg_head.sgl, sg, mr->nsg, i)
-   (*pas) = cpu_to_be64(sg_dma_address(sg));
-}
-
  static void mlx5_set_access_mode(void *mkc, int mode)
  {
MLX5_SET(mkc, mkc, access_mode_1_0, mode & 0x3);
@@ -45,10 +34,18 @@ static void mlx5_set_access_mode(void *mkc, int mode)
  static void populate_mtts(struct mlx5_vdpa_direct_mr *mr, __be64 *mtt)
  {
struct scatterlist *sg;
+   int nsg = mr->nsg;
+   u64 dma_addr;
+   u64 dma_len;
+   int j = 0;
int i;
  
-	for_each_sg(mr->sg_head.sgl, sg, mr->nsg, i)

-   mtt[i] = cpu_to_be64(sg_dma_address(sg));
+   for_each_sg(mr->sg_head.sgl, sg, mr->nent, i) {
+   for (dma_addr = sg_dma_address(sg), dma_len = sg_dma_len(sg);
+nsg && dma_len;
+nsg--, dma_addr += BIT(mr->log_size), dma_len -= 
BIT(mr->log_size))
+   mtt[j++] = cpu_to_be64(dma_addr);
+   }
  }
  
  static int create_direct_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_direct_mr *mr)

@@ -64,7 +61,6 @@ static int create_direct_mr(struct mlx5_vdpa_dev *mvdev, 
struct mlx5_vdpa_direct
return -ENOMEM;
  
  	MLX5_SET(create_mkey_in, in, uid, mvdev->res.uid);

-   fill_sg(mr, in);
mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
MLX5_SET(mkc, mkc, lw, !!(mr->perm & VHOST_MAP_WO));
MLX5_SET(mkc, mkc, lr, !!(mr->perm & VHOST_MAP_RO));
@@ -276,8 +272,8 @@ static int map_direct_mr(struct mlx5_vdpa_dev *mvdev, 
struct mlx5_vdpa_direct_mr
  done:
mr->log_size = log_entity_size;
mr->nsg = nsg;
-   err = dma_map_sg_attrs(dma, mr->sg_head.sgl, mr->nsg, 
DMA_BIDIRECTIONAL, 0);
-   if (!err)
+   mr->nent = dma_map_sg_attrs(dma, mr->sg_head.sgl, mr->nsg, 
DMA_BIDIRECTIONAL, 0);
+   if (!mr->nent)
goto err_map;
  
  	err = create_direct_mr(mvdev, mr);

Re: question about i2c_transfer() function (regarding mdio-i2c on RollBall SFPs)

2021-01-08 Thread Wolfram Sang


> I thought as much, but maybe there is some driver which can offload
> whole i2c_transfer to HW, and has to pass the addresses of the buffers
> to the HW, and the HW can have problems if the buffers overlap
> somewhere...

Well, sure, you can never know what crazy HW is out there :) But that
shouldn't prevent us from doing legit I2C transfers. The likeliness of
such HW is low enough; They must process the whole transfer in one go
(rare) AND have the limitiation with the buffer pointers (at least I
don't know one) AND have no possibility of a fallback to a simpler mode
where they can handle the transfer per message. If such a controller
exists, it would need a new quirk flag, I'd say, and reject the
transfer. But that shouldn't stop you from fixing your issue.

Thanks for thinking thoroughly about drawbacks! Much appreciated.



signature.asc
Description: PGP signature

Re: [PATCH v3 net-next 10/12] net: bonding: ensure .ndo_get_stats64 can sleep

2021-01-08 Thread Vladimir Oltean

On Thu, Jan 07, 2021 at 07:59:37PM -0800, Saeed Mahameed wrote:
> On Thu, 2021-01-07 at 13:58 +0100, Eric Dumazet wrote:
> > On Thu, Jan 7, 2021 at 12:33 PM Vladimir Oltean 
> > wrote:
> > > On Thu, Jan 07, 2021 at 12:18:28PM +0100, Eric Dumazet wrote:
> > > > What a mess really.
> > >
> > > Thanks, that's at least _some_ feedback :)
> >
> > Yeah, I was on PTO for the last two weeks.
> >
> > > > You chose to keep the assumption that ndo_get_stats() would not
> > > > fail,
> > > > since we were providing the needed storage from callers.
> > > >
> > > > If ndo_get_stats() are now allowed to sleep, and presumably
> > > > allocate
> > > > memory, we need to make sure
> > > > we report potential errors back to the user.
> > > >
> > > > I think your patch series is mostly good, but I would prefer not
> > > > hiding errors and always report them to user space.
> > > > And no, netdev_err() is not appropriate, we do not want tools to
> > > > look
> > > > at syslog to guess something went wrong.
> > >
> > > Well, there are only 22 dev_get_stats callers in the kernel, so I
> > > assume
> > > that after the conversion to return void, I can do another
> > > conversion to
> > > return int, and then I can convert the ndo_get_stats64 method to
> > > return
> > > int too. I will keep the plain ndo_get_stats still void (no reason
> > > not
> > > to).
> > >
> > > > Last point about drivers having to go to slow path, talking to
> > > > firmware : Make sure that malicious/innocent users
> > > > reading /proc/net/dev from many threads in parallel wont brick
> > > > these devices.
> > > >
> > > > Maybe they implicitly _relied_ on the fact that firmware was
> > > > gently
> > > > read every second and results were cached from a work queue or
> > > > something.
> > >
> > > How? I don't understand how I can make sure of that.
> >
> > Your patches do not attempt to change these drivers, but I guess your
> > cover letter might send to driver maintainers incentive to get rid of
> > their
> > logic, that is all.
> >
> > We might simply warn maintainers and ask them to test their future
> > changes
> > with tests using 1000 concurrent theads reading /proc/net/dev
> >
> > > There is an effort initiated by Jakub to standardize the ethtool
> > > statistics. My objection was that you can't expect that to happen
> > > unless
> > > dev_get_stats is sleepable just like ethtool -S is. So I think the
> > > same
> > > reasoning should apply to ethtool -S too, really.
> >
> > I think we all agree on the principles, once we make sure to not
> > add more pressure on RTNL. It seems you addressed our feedback, all
> > is fine.
> >
>
> Eric, about two years ago you were totally against sleeping in
> ndo_get_stats, what happened ? :)
> https://lore.kernel.org/netdev/4cc44e85-cb5e-502c-30f3-c6ea564fe...@gmail.com/

I believe that what is different this time is that DSA switches are
typically connected over a slow and bottlenecked bus (so periodic
driver-level readouts would only make things worse for phc2sys and
such other latency-sensitive programs), plus they are offloading
interfaces for forwarding (so software-based counters could never be
accurate). Support those, and supporting firmware-based high-speed
devices will come as a nice side-effect.
FWIW that discussion took place here:
https://patchwork.ozlabs.org/project/netdev/patch/20201125193740.36825-3-george.mccollis...@gmail.com/

> My approach to solve this was much simpler and didn't require  a new
> mutex nor RTNL lock, all i did is to reduce the rcu critical section to
> not include the call to the driver by simply holding the netdev via
> dev_hold()

I feel this is a call for the bonding maintainers to make. If they're
willing to replace rtnl_dereference with bond_dereference throughout the
whole driver, and reduce other guys' amount of work when other NDOs
start losing the rtnl_mutex too, then I can't see what's wrong with my
approach (despite not being "as simple"). If they think that update-side
protection of the slaves array is just fine the way it is, then I
suppose that RCU protection + dev_hold is indeed all that I can do.

[PATCH net-next v2 0/6] dpaa2-mac: various updates

2021-01-08 Thread Ioana Ciornei

The first two patches of this series extends the MAC statistics support
to also work for network interfaces which have their link status handled
by firmware (TYPE_FIXED).

The next two patches are fixing a sporadic problem which happens when
the connected DPMAC object is not yet discovered by the fsl-mc bus, thus
the dpaa2-eth is not able to get a reference to it. A referred probe
will be requested in this case.

Finally, the last two patches make some cosmetic changes, mostly
removing comments and unnecessary checks.

Changes in v2:
 - replaced IS_ERR_OR_NULL() by IS_ERR() in patch 4/6
 - reworded the commit message of patch 6/6

Ioana Ciornei (6):
  dpaa2-mac: split up initializing the MAC object from connecting to it
  dpaa2-mac: export MAC counters even when in TYPE_FIXED
  bus: fsl-mc: return -EPROBE_DEFER when a device is not yet discovered
  dpaa2-eth: retry the probe when the MAC is not yet discovered on the
bus
  dpaa2-mac: remove an unnecessary check
  dpaa2-mac: remove a comment regarding pause settings

 drivers/bus/fsl-mc/fsl-mc-bus.c   |   9 ++
 .../net/ethernet/freescale/dpaa2/dpaa2-eth.c  |  53 ---
 .../net/ethernet/freescale/dpaa2/dpaa2-eth.h  |  13 ++
 .../ethernet/freescale/dpaa2/dpaa2-ethtool.c  |  16 +--
 .../net/ethernet/freescale/dpaa2/dpaa2-mac.c  | 135 --
 .../net/ethernet/freescale/dpaa2/dpaa2-mac.h  |   5 +
 6 files changed, 126 insertions(+), 105 deletions(-)

-- 
2.29.2

[PATCH net-next v2 1/6] dpaa2-mac: split up initializing the MAC object from connecting to it

2021-01-08 Thread Ioana Ciornei

From: Ioana Ciornei 

Split up the initialization phase of the dpmac object from actually
configuring the phylink instance, connecting to it and configuring the
MAC. This is done so that even though the dpni object is connected to a
dpmac which has link management handled by the firmware we are still
able to export the MAC counters.

Signed-off-by: Ioana Ciornei 
---
Changes in v2:
 - none

 .../net/ethernet/freescale/dpaa2/dpaa2-eth.c  | 14 +++-
 .../net/ethernet/freescale/dpaa2/dpaa2-mac.c  | 69 +++
 .../net/ethernet/freescale/dpaa2/dpaa2-mac.h  |  5 ++
 3 files changed, 59 insertions(+), 29 deletions(-)

diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c 
b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
index fb0bcd18ec0c..61385894e8c7 100644
--- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
+++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
@@ -4056,15 +4056,24 @@ static int dpaa2_eth_connect_mac(struct dpaa2_eth_priv 
*priv)
mac->mc_io = priv->mc_io;
mac->net_dev = priv->net_dev;
 
+   err = dpaa2_mac_open(mac);
+   if (err)
+   goto err_free_mac;
+
err = dpaa2_mac_connect(mac);
if (err) {
netdev_err(priv->net_dev, "Error connecting to the MAC 
endpoint\n");
-   kfree(mac);
-   return err;
+   goto err_close_mac;
}
priv->mac = mac;
 
return 0;
+
+err_close_mac:
+   dpaa2_mac_close(mac);
+err_free_mac:
+   kfree(mac);
+   return err;
 }
 
 static void dpaa2_eth_disconnect_mac(struct dpaa2_eth_priv *priv)
@@ -4073,6 +4082,7 @@ static void dpaa2_eth_disconnect_mac(struct 
dpaa2_eth_priv *priv)
return;
 
dpaa2_mac_disconnect(priv->mac);
+   dpaa2_mac_close(priv->mac);
kfree(priv->mac);
priv->mac = NULL;
 }
diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-mac.c 
b/drivers/net/ethernet/freescale/dpaa2/dpaa2-mac.c
index 828c177df03d..50dd302abcf4 100644
--- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-mac.c
+++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-mac.c
@@ -302,36 +302,20 @@ static void dpaa2_pcs_destroy(struct dpaa2_mac *mac)
 
 int dpaa2_mac_connect(struct dpaa2_mac *mac)
 {
-   struct fsl_mc_device *dpmac_dev = mac->mc_dev;
struct net_device *net_dev = mac->net_dev;
struct device_node *dpmac_node;
struct phylink *phylink;
-   struct dpmac_attr attr;
int err;
 
-   err = dpmac_open(mac->mc_io, 0, dpmac_dev->obj_desc.id,
-&dpmac_dev->mc_handle);
-   if (err || !dpmac_dev->mc_handle) {
-   netdev_err(net_dev, "dpmac_open() = %d\n", err);
-   return -ENODEV;
-   }
-
-   err = dpmac_get_attributes(mac->mc_io, 0, dpmac_dev->mc_handle, &attr);
-   if (err) {
-   netdev_err(net_dev, "dpmac_get_attributes() = %d\n", err);
-   goto err_close_dpmac;
-   }
-
-   mac->if_link_type = attr.link_type;
+   mac->if_link_type = mac->attr.link_type;
 
-   dpmac_node = dpaa2_mac_get_node(attr.id);
+   dpmac_node = dpaa2_mac_get_node(mac->attr.id);
if (!dpmac_node) {
-   netdev_err(net_dev, "No dpmac@%d node found.\n", attr.id);
-   err = -ENODEV;
-   goto err_close_dpmac;
+   netdev_err(net_dev, "No dpmac@%d node found.\n", mac->attr.id);
+   return -ENODEV;
}
 
-   err = dpaa2_mac_get_if_mode(dpmac_node, attr);
+   err = dpaa2_mac_get_if_mode(dpmac_node, mac->attr);
if (err < 0) {
err = -EINVAL;
goto err_put_node;
@@ -351,9 +335,9 @@ int dpaa2_mac_connect(struct dpaa2_mac *mac)
goto err_put_node;
}
 
-   if (attr.link_type == DPMAC_LINK_TYPE_PHY &&
-   attr.eth_if != DPMAC_ETH_IF_RGMII) {
-   err = dpaa2_pcs_create(mac, dpmac_node, attr.id);
+   if (mac->attr.link_type == DPMAC_LINK_TYPE_PHY &&
+   mac->attr.eth_if != DPMAC_ETH_IF_RGMII) {
+   err = dpaa2_pcs_create(mac, dpmac_node, mac->attr.id);
if (err)
goto err_put_node;
}
@@ -389,8 +373,7 @@ int dpaa2_mac_connect(struct dpaa2_mac *mac)
dpaa2_pcs_destroy(mac);
 err_put_node:
of_node_put(dpmac_node);
-err_close_dpmac:
-   dpmac_close(mac->mc_io, 0, dpmac_dev->mc_handle);
+
return err;
 }
 
@@ -402,8 +385,40 @@ void dpaa2_mac_disconnect(struct dpaa2_mac *mac)
phylink_disconnect_phy(mac->phylink);
phylink_destroy(mac->phylink);
dpaa2_pcs_destroy(mac);
+}
+
+int dpaa2_mac_open(struct dpaa2_mac *mac)
+{
+   struct fsl_mc_device *dpmac_dev = mac->mc_dev;
+   struct net_device *net_dev = mac->net_dev;
+   int err;
+
+   err = dpmac_open(mac->mc_io, 0, dpmac_dev->obj_desc.id,
+&dpmac_dev->mc_handle);
+   if (err || !dpmac_dev->mc_handle) {
+   netdev_e

[PATCH net-next v2 2/6] dpaa2-mac: export MAC counters even when in TYPE_FIXED

2021-01-08 Thread Ioana Ciornei

From: Ioana Ciornei 

If the network interface object is connected to a MAC of TYPE_FIXED, the
link status management is handled exclusively by the firmware. This does
not mean that the driver cannot access the MAC counters and export them
in ethtool.

For this to happen, we open the attached dpmac device and keep a pointer
to it in priv->mac. Because of this, all the checks in the driver of the
following form 'if (priv->mac)' have to be updated to actually check
the dpmac attribute and not rely on the presence of a non-NULL value.

Signed-off-by: Ioana Ciornei 
---
Changes in v2:
 - none

 .../net/ethernet/freescale/dpaa2/dpaa2-eth.c  | 37 +--
 .../net/ethernet/freescale/dpaa2/dpaa2-eth.h  | 13 +++
 .../ethernet/freescale/dpaa2/dpaa2-ethtool.c  | 16 
 .../net/ethernet/freescale/dpaa2/dpaa2-mac.c  | 26 -
 4 files changed, 39 insertions(+), 53 deletions(-)

diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c 
b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
index 61385894e8c7..f3f53e36aa00 100644
--- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
+++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
@@ -1691,7 +1691,7 @@ static int dpaa2_eth_link_state_update(struct 
dpaa2_eth_priv *priv)
/* When we manage the MAC/PHY using phylink there is no need
 * to manually update the netif_carrier.
 */
-   if (priv->mac)
+   if (dpaa2_eth_is_type_phy(priv))
goto out;
 
/* Chech link state; speed / duplex changes are not treated yet */
@@ -1730,7 +1730,7 @@ static int dpaa2_eth_open(struct net_device *net_dev)
   priv->dpbp_dev->obj_desc.id, priv->bpid);
}
 
-   if (!priv->mac) {
+   if (!dpaa2_eth_is_type_phy(priv)) {
/* We'll only start the txqs when the link is actually ready;
 * make sure we don't race against the link up notification,
 * which may come immediately after dpni_enable();
@@ -1752,7 +1752,7 @@ static int dpaa2_eth_open(struct net_device *net_dev)
goto enable_err;
}
 
-   if (priv->mac)
+   if (dpaa2_eth_is_type_phy(priv))
phylink_start(priv->mac->phylink);
 
return 0;
@@ -1826,11 +1826,11 @@ static int dpaa2_eth_stop(struct net_device *net_dev)
int dpni_enabled = 0;
int retries = 10;
 
-   if (!priv->mac) {
+   if (dpaa2_eth_is_type_phy(priv)) {
+   phylink_stop(priv->mac->phylink);
+   } else {
netif_tx_stop_all_queues(net_dev);
netif_carrier_off(net_dev);
-   } else {
-   phylink_stop(priv->mac->phylink);
}
 
/* On dpni_disable(), the MC firmware will:
@@ -2115,7 +2115,7 @@ static int dpaa2_eth_ioctl(struct net_device *dev, struct 
ifreq *rq, int cmd)
if (cmd == SIOCSHWTSTAMP)
return dpaa2_eth_ts_ioctl(dev, rq, cmd);
 
-   if (priv->mac)
+   if (dpaa2_eth_is_type_phy(priv))
return phylink_mii_ioctl(priv->mac->phylink, rq, cmd);
 
return -EOPNOTSUPP;
@@ -4045,9 +4045,6 @@ static int dpaa2_eth_connect_mac(struct dpaa2_eth_priv 
*priv)
if (IS_ERR_OR_NULL(dpmac_dev) || dpmac_dev->dev.type != 
&fsl_mc_bus_dpmac_type)
return 0;
 
-   if (dpaa2_mac_is_type_fixed(dpmac_dev, priv->mc_io))
-   return 0;
-
mac = kzalloc(sizeof(struct dpaa2_mac), GFP_KERNEL);
if (!mac)
return -ENOMEM;
@@ -4059,18 +4056,21 @@ static int dpaa2_eth_connect_mac(struct dpaa2_eth_priv 
*priv)
err = dpaa2_mac_open(mac);
if (err)
goto err_free_mac;
+   priv->mac = mac;
 
-   err = dpaa2_mac_connect(mac);
-   if (err) {
-   netdev_err(priv->net_dev, "Error connecting to the MAC 
endpoint\n");
-   goto err_close_mac;
+   if (dpaa2_eth_is_type_phy(priv)) {
+   err = dpaa2_mac_connect(mac);
+   if (err) {
+   netdev_err(priv->net_dev, "Error connecting to the MAC 
endpoint\n");
+   goto err_close_mac;
+   }
}
-   priv->mac = mac;
 
return 0;
 
 err_close_mac:
dpaa2_mac_close(mac);
+   priv->mac = NULL;
 err_free_mac:
kfree(mac);
return err;
@@ -4078,10 +4078,9 @@ static int dpaa2_eth_connect_mac(struct dpaa2_eth_priv 
*priv)
 
 static void dpaa2_eth_disconnect_mac(struct dpaa2_eth_priv *priv)
 {
-   if (!priv->mac)
-   return;
+   if (dpaa2_eth_is_type_phy(priv))
+   dpaa2_mac_disconnect(priv->mac);
 
-   dpaa2_mac_disconnect(priv->mac);
dpaa2_mac_close(priv->mac);
kfree(priv->mac);
priv->mac = NULL;
@@ -4111,7 +4110,7 @@ static irqreturn_t dpni_irq0_handler_thread(int irq_num, 
void *arg)
dpaa2_eth_update_tx_fqids(priv);
 
rtnl_lock();
-   if (priv->mac)
+

[PATCH net-next v2 4/6] dpaa2-eth: retry the probe when the MAC is not yet discovered on the bus

2021-01-08 Thread Ioana Ciornei

From: Ioana Ciornei 

The fsl_mc_get_endpoint() function now returns -EPROBE_DEFER when the
dpmac device was not yet discovered by the fsl-mc bus. When this
happens, pass the error code up so that we can retry the probe at a
later time.

Signed-off-by: Ioana Ciornei 
---
Changes in v2:
 - replaced IS_ERR_OR_NULL() by IS_ERR()

 drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c 
b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
index f3f53e36aa00..a8c98869e484 100644
--- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
+++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
@@ -4042,7 +4042,11 @@ static int dpaa2_eth_connect_mac(struct dpaa2_eth_priv 
*priv)
 
dpni_dev = to_fsl_mc_device(priv->net_dev->dev.parent);
dpmac_dev = fsl_mc_get_endpoint(dpni_dev);
-   if (IS_ERR_OR_NULL(dpmac_dev) || dpmac_dev->dev.type != 
&fsl_mc_bus_dpmac_type)
+
+   if (PTR_ERR(dpmac_dev) == -EPROBE_DEFER)
+   return PTR_ERR(dpmac_dev);
+
+   if (IS_ERR(dpmac_dev) || dpmac_dev->dev.type != &fsl_mc_bus_dpmac_type)
return 0;
 
mac = kzalloc(sizeof(struct dpaa2_mac), GFP_KERNEL);
-- 
2.29.2

[PATCH net-next v2 5/6] dpaa2-mac: remove an unnecessary check

2021-01-08 Thread Ioana Ciornei

From: Ioana Ciornei 

The dpaa2-eth driver has phylink integration only if the connected dpmac
object is in TYPE_PHY (aka the PCS/PHY etc link status is managed by
Linux instead of the firmware). The check is thus unnecessary because
the code path that reaches the .mac_link_up() callback is only with
TYPE_PHY dpmac objects.

Signed-off-by: Ioana Ciornei 
---
Changes in v2:
 - none

 .../net/ethernet/freescale/dpaa2/dpaa2-mac.c  | 43 ---
 1 file changed, 19 insertions(+), 24 deletions(-)

diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-mac.c 
b/drivers/net/ethernet/freescale/dpaa2/dpaa2-mac.c
index 81b2822a7dc9..3869c38f3979 100644
--- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-mac.c
+++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-mac.c
@@ -174,30 +174,25 @@ static void dpaa2_mac_link_up(struct phylink_config 
*config,
 
dpmac_state->up = 1;
 
-   if (mac->if_link_type == DPMAC_LINK_TYPE_PHY) {
-   /* If the DPMAC is configured for PHY mode, we need
-* to pass the link parameters to the MC firmware.
-*/
-   dpmac_state->rate = speed;
-
-   if (duplex == DUPLEX_HALF)
-   dpmac_state->options |= DPMAC_LINK_OPT_HALF_DUPLEX;
-   else if (duplex == DUPLEX_FULL)
-   dpmac_state->options &= ~DPMAC_LINK_OPT_HALF_DUPLEX;
-
-   /* This is lossy; the firmware really should take the pause
-* enablement status rather than pause/asym pause status.
-*/
-   if (rx_pause)
-   dpmac_state->options |= DPMAC_LINK_OPT_PAUSE;
-   else
-   dpmac_state->options &= ~DPMAC_LINK_OPT_PAUSE;
-
-   if (rx_pause ^ tx_pause)
-   dpmac_state->options |= DPMAC_LINK_OPT_ASYM_PAUSE;
-   else
-   dpmac_state->options &= ~DPMAC_LINK_OPT_ASYM_PAUSE;
-   }
+   dpmac_state->rate = speed;
+
+   if (duplex == DUPLEX_HALF)
+   dpmac_state->options |= DPMAC_LINK_OPT_HALF_DUPLEX;
+   else if (duplex == DUPLEX_FULL)
+   dpmac_state->options &= ~DPMAC_LINK_OPT_HALF_DUPLEX;
+
+   /* This is lossy; the firmware really should take the pause
+* enablement status rather than pause/asym pause status.
+*/
+   if (rx_pause)
+   dpmac_state->options |= DPMAC_LINK_OPT_PAUSE;
+   else
+   dpmac_state->options &= ~DPMAC_LINK_OPT_PAUSE;
+
+   if (rx_pause ^ tx_pause)
+   dpmac_state->options |= DPMAC_LINK_OPT_ASYM_PAUSE;
+   else
+   dpmac_state->options &= ~DPMAC_LINK_OPT_ASYM_PAUSE;
 
err = dpmac_set_link_state(mac->mc_io, 0,
   mac->mc_dev->mc_handle, dpmac_state);
-- 
2.29.2

Re: [PATCH net 3/3] net-sysfs: move the xps cpus/rxqs retrieval in a common function

2021-01-08 Thread Antoine Tenart

Quoting Alexander Duyck (2021-01-07 17:38:35)
> On Thu, Jan 7, 2021 at 12:54 AM Antoine Tenart  wrote:
> >
> > Quoting Alexander Duyck (2021-01-06 20:54:11)
> > > On Wed, Jan 6, 2021 at 10:04 AM Antoine Tenart  wrote:
> >
> > That would require to hold rcu_read_lock in the caller and I'd like to
> > keep it in that function.
> 
> Actually you could probably make it work if you were to pass a pointer
> to the RCU pointer.

That should work but IMHO that could be easily breakable by future
changes as it's a bit tricky.

> > > > if (dev->num_tc) {
> > > > /* Do not allow XPS on subordinate device directly */
> > > > num_tc = dev->num_tc;
> > > > -   if (num_tc < 0) {
> > > > -   ret = -EINVAL;
> > > > -   goto err_rtnl_unlock;
> > > > -   }
> > > > +   if (num_tc < 0)
> > > > +   return -EINVAL;
> > > >
> > > > /* If queue belongs to subordinate dev use its map */
> > > > dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
> > > >
> > > > tc = netdev_txq_to_tc(dev, index);
> > > > -   if (tc < 0) {
> > > > -   ret = -EINVAL;
> > > > -   goto err_rtnl_unlock;
> > > > -   }
> > > > +   if (tc < 0)
> > > > +   return -EINVAL;
> > > > }
> > > >
> > >
> > > So if we store the num_tc and nr_ids in the dev_maps structure then we
> > > could simplify this a bit by pulling the num_tc info out of the
> > > dev_map and only asking the Tx queue for the tc in that case and
> > > validating it against (tc <0 || num_tc <= tc) and returning an error
> > > if either are true.
> > >
> > > This would also allow us to address the fact that the rxqs feature
> > > doesn't support the subordinate devices as you could pull out the bit
> > > above related to the sb_dev and instead call that prior to calling
> > > xps_queue_show so that you are operating on the correct device map.
> 
> It probably would be necessary to pass dev and index if we pull the
> netdev_get_tx_queue()->sb_dev bit out and performed that before we
> called the xps_queue_show function. Specifically as the subordinate
> device wouldn't match up with the queue device so we would be better
> off pulling it out first.

While I agree moving the netdev_get_tx_queue()->sb_dev bit out of
xps_queue_show seems like a good idea for consistency, I'm not sure
it'll work: dev->num_tc is not only used to retrieve the number of tc
but also as a condition on not being 0. We have things like:

  // Always done with the original dev.
  if (dev->num_tc) {

  [...]

  // Can be a subordinate dev.
  tc = netdev_txq_to_tc(dev, index);
  }

And after moving num_tc in the map, we'll have checks like:

  if (dev_maps->num_tc != dev->num_tc)
  return -EINVAL;

I don't think the subordinate dev holds the same num_tc value as dev.
What's your take on this?

> > > > -   mask = bitmap_zalloc(nr_cpu_ids, GFP_KERNEL);
> > > > -   if (!mask) {
> > > > -   ret = -ENOMEM;
> > > > -   goto err_rtnl_unlock;
> > > > +   rcu_read_lock();
> > > > +
> > > > +   if (is_rxqs_map) {
> > > > +   dev_maps = rcu_dereference(dev->xps_rxqs_map);
> > > > +   nr_ids = dev->num_rx_queues;
> > > > +   } else {
> > > > +   dev_maps = rcu_dereference(dev->xps_cpus_map);
> > > > +   nr_ids = nr_cpu_ids;
> > > > +   if (num_possible_cpus() > 1)
> > > > +   possible_mask = cpumask_bits(cpu_possible_mask);
> > > > }
> > >
> 
> I don't think we need the possible_mask check. That is mostly just an
> optimization that was making use of an existing "for_each" loop macro.
> If we are going to go through 0 through nr_ids then there is no need
> for the possible_mask as we can just brute force our way through and
> will not find CPU that aren't there since we couldn't have added them
> to the map anyway.

I'll remove it then. __netif_set_xps_queue could also benefit from it.

> > > I think Jakub had mentioned earlier the idea of possibly moving some
> > > fields into the xps_cpus_map and xps_rxqs_map in order to reduce the
> > > complexity of this so that certain values would be protected by the
> > > RCU lock.
> > >
> > > This might be a good time to look at encoding things like the number
> > > of IDs and the number of TCs there in order to avoid a bunch of this
> > > duplication. Then you could just pass a pointer to the map you want to
> > > display and the code should be able to just dump the values.:
> >
> > 100% agree to all the above. That would also prevent from making out of
> > bound accesses when dev->num_tc is increased after dev_maps is
> > allocated. I do have a series ready to be send storing num_tc into the
> > maps, and reworking code to use it instead of dev->num_tc. The series
>

[PATCH net-next v2 3/6] bus: fsl-mc: return -EPROBE_DEFER when a device is not yet discovered

2021-01-08 Thread Ioana Ciornei

From: Ioana Ciornei 

The fsl_mc_get_endpoint() should return a pointer to the connected
fsl_mc device, if there is one. By interrogating the MC firmware, we
know if there is an endpoint or not so when the endpoint device is
actually searched on the fsl-mc bus and not found we are hitting the
case in which the device has not been yet discovered by the bus.

Return -EPROBE_DEFER so that callers can differentiate this case.

Signed-off-by: Ioana Ciornei 
Acked-by: Laurentiu Tudor 
---
Changes in v2:
 - none

 drivers/bus/fsl-mc/fsl-mc-bus.c | 9 +
 1 file changed, 9 insertions(+)

diff --git a/drivers/bus/fsl-mc/fsl-mc-bus.c b/drivers/bus/fsl-mc/fsl-mc-bus.c
index 34811db074b7..28d5da1c011c 100644
--- a/drivers/bus/fsl-mc/fsl-mc-bus.c
+++ b/drivers/bus/fsl-mc/fsl-mc-bus.c
@@ -936,6 +936,15 @@ struct fsl_mc_device *fsl_mc_get_endpoint(struct 
fsl_mc_device *mc_dev)
endpoint_desc.id = endpoint2.id;
endpoint = fsl_mc_device_lookup(&endpoint_desc, mc_bus_dev);
 
+   /*
+* We know that the device has an endpoint because we verified by
+* interrogating the firmware. This is the case when the device was not
+* yet discovered by the fsl-mc bus, thus the lookup returned NULL.
+* Differentiate this case by returning EPROBE_DEFER.
+*/
+   if (!endpoint)
+   return ERR_PTR(-EPROBE_DEFER);
+
return endpoint;
 }
 EXPORT_SYMBOL_GPL(fsl_mc_get_endpoint);
-- 
2.29.2

[PATCH net-next v2 6/6] dpaa2-mac: remove a comment regarding pause settings

2021-01-08 Thread Ioana Ciornei

From: Ioana Ciornei 

The MC firmware takes these PAUSE/ASYM_PAUSE flags provided by the
driver, transforms them back into rx/tx pause enablement status and
applies them to hardware. We are not losing information by this
transformation, thus remove the comment.

Signed-off-by: Ioana Ciornei 
---
Changes in v2:
 - reworded the commit message

 drivers/net/ethernet/freescale/dpaa2/dpaa2-mac.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-mac.c 
b/drivers/net/ethernet/freescale/dpaa2/dpaa2-mac.c
index 3869c38f3979..69ad869446cf 100644
--- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-mac.c
+++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-mac.c
@@ -181,9 +181,6 @@ static void dpaa2_mac_link_up(struct phylink_config *config,
else if (duplex == DUPLEX_FULL)
dpmac_state->options &= ~DPMAC_LINK_OPT_HALF_DUPLEX;
 
-   /* This is lossy; the firmware really should take the pause
-* enablement status rather than pause/asym pause status.
-*/
if (rx_pause)
dpmac_state->options |= DPMAC_LINK_OPT_PAUSE;
else
-- 
2.29.2

Re: [PATCH v3 net-next 10/12] net: bonding: ensure .ndo_get_stats64 can sleep

2021-01-08 Thread Vladimir Oltean

On Fri, Jan 08, 2021 at 10:14:01AM +0100, Eric Dumazet wrote:
> If you disagree, repost a rebased patch series so that we can
> test/compare and choose the best solution.

I would rather use Saeed's time as a reviewer to my existing and current
patch set.

[net-next PATCH v13 0/4] Add support for mv88e6393x family of Marvell

2021-01-08 Thread Pavana Sharma

Updated patchset with fixes for enbling IRQ for 5GBASER and 10BASER.

Pavana Sharma (4):
  dt-bindings: net: Add 5GBASER phy interface
  net: phy: Add 5GBASER interface mode
  net: dsa: mv88e6xxx: Change serdes lane parameter type from u8 type to
int
  net: dsa: mv88e6xxx: Add support for mv88e6393x family of Marvell

 .../bindings/net/ethernet-controller.yaml |   1 +
 drivers/net/dsa/mv88e6xxx/chip.c  | 164 ++-
 drivers/net/dsa/mv88e6xxx/chip.h  |  20 +-
 drivers/net/dsa/mv88e6xxx/global1.h   |   2 +
 drivers/net/dsa/mv88e6xxx/global2.h   |   8 +
 drivers/net/dsa/mv88e6xxx/port.c  | 238 ++-
 drivers/net/dsa/mv88e6xxx/port.h  |  43 +-
 drivers/net/dsa/mv88e6xxx/serdes.c| 401 --
 drivers/net/dsa/mv88e6xxx/serdes.h| 108 +++--
 include/linux/phy.h   |   4 +
 10 files changed, 888 insertions(+), 101 deletions(-)

-- 
2.17.1

[net-next PATCH v13 1/4] dt-bindings: net: Add 5GBASER phy interface

2021-01-08 Thread Pavana Sharma

Add 5gbase-r PHY interface mode.

Signed-off-by: Pavana Sharma 
Acked-by: Rob Herring 
---
 Documentation/devicetree/bindings/net/ethernet-controller.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Documentation/devicetree/bindings/net/ethernet-controller.yaml 
b/Documentation/devicetree/bindings/net/ethernet-controller.yaml
index 0965f6515f9e..5507ae3c478d 100644
--- a/Documentation/devicetree/bindings/net/ethernet-controller.yaml
+++ b/Documentation/devicetree/bindings/net/ethernet-controller.yaml
@@ -89,6 +89,7 @@ properties:
   - trgmii
   - 1000base-x
   - 2500base-x
+  - 5gbase-r
   - rxaui
   - xaui
 
-- 
2.17.1

[net-next PATCH v13 2/4] net: phy: Add 5GBASER interface mode

2021-01-08 Thread Pavana Sharma

Add 5GBASE-R phy interface mode

Signed-off-by: Pavana Sharma 
---
 include/linux/phy.h | 4 
 1 file changed, 4 insertions(+)

diff --git a/include/linux/phy.h b/include/linux/phy.h
index 9effb511acde..548372eb253a 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -106,6 +106,7 @@ extern const int phy_10gbit_features_array[1];
  * @PHY_INTERFACE_MODE_TRGMII: Turbo RGMII
  * @PHY_INTERFACE_MODE_1000BASEX: 1000 BaseX
  * @PHY_INTERFACE_MODE_2500BASEX: 2500 BaseX
+ * @PHY_INTERFACE_MODE_5GBASER: 5G BaseR
  * @PHY_INTERFACE_MODE_RXAUI: Reduced XAUI
  * @PHY_INTERFACE_MODE_XAUI: 10 Gigabit Attachment Unit Interface
  * @PHY_INTERFACE_MODE_10GBASER: 10G BaseR
@@ -137,6 +138,7 @@ typedef enum {
PHY_INTERFACE_MODE_TRGMII,
PHY_INTERFACE_MODE_1000BASEX,
PHY_INTERFACE_MODE_2500BASEX,
+   PHY_INTERFACE_MODE_5GBASER,
PHY_INTERFACE_MODE_RXAUI,
PHY_INTERFACE_MODE_XAUI,
/* 10GBASE-R, XFI, SFI - single lane 10G Serdes */
@@ -207,6 +209,8 @@ static inline const char *phy_modes(phy_interface_t 
interface)
return "1000base-x";
case PHY_INTERFACE_MODE_2500BASEX:
return "2500base-x";
+   case PHY_INTERFACE_MODE_5GBASER:
+   return "5gbase-r";
case PHY_INTERFACE_MODE_RXAUI:
return "rxaui";
case PHY_INTERFACE_MODE_XAUI:
-- 
2.17.1

[net-next PATCH v13 3/4] net: dsa: mv88e6xxx: Change serdes lane parameter type from u8 type to int

2021-01-08 Thread Pavana Sharma

Returning 0 is no more an error case with MV88E6393 family
which has serdes lane numbers 0, 9 or 10.
So with this change .serdes_get_lane will return lane number
or -errno (-ENODEV or -EOPNOTSUPP).

Signed-off-by: Pavana Sharma 
Reviewed-by: Andrew Lunn 
---
 drivers/net/dsa/mv88e6xxx/chip.c   | 28 +-
 drivers/net/dsa/mv88e6xxx/chip.h   | 16 +++---
 drivers/net/dsa/mv88e6xxx/port.c   |  8 +--
 drivers/net/dsa/mv88e6xxx/serdes.c | 82 +++---
 drivers/net/dsa/mv88e6xxx/serdes.h | 64 +++
 5 files changed, 99 insertions(+), 99 deletions(-)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index eafe6bedc692..9bddd70449c6 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -485,12 +485,12 @@ static int mv88e6xxx_serdes_pcs_get_state(struct 
dsa_switch *ds, int port,
  struct phylink_link_state *state)
 {
struct mv88e6xxx_chip *chip = ds->priv;
-   u8 lane;
+   int lane;
int err;
 
mv88e6xxx_reg_lock(chip);
lane = mv88e6xxx_serdes_get_lane(chip, port);
-   if (lane && chip->info->ops->serdes_pcs_get_state)
+   if (lane >= 0 && chip->info->ops->serdes_pcs_get_state)
err = chip->info->ops->serdes_pcs_get_state(chip, port, lane,
state);
else
@@ -506,11 +506,11 @@ static int mv88e6xxx_serdes_pcs_config(struct 
mv88e6xxx_chip *chip, int port,
   const unsigned long *advertise)
 {
const struct mv88e6xxx_ops *ops = chip->info->ops;
-   u8 lane;
+   int lane;
 
if (ops->serdes_pcs_config) {
lane = mv88e6xxx_serdes_get_lane(chip, port);
-   if (lane)
+   if (lane >= 0)
return ops->serdes_pcs_config(chip, port, lane, mode,
  interface, advertise);
}
@@ -523,14 +523,14 @@ static void mv88e6xxx_serdes_pcs_an_restart(struct 
dsa_switch *ds, int port)
struct mv88e6xxx_chip *chip = ds->priv;
const struct mv88e6xxx_ops *ops;
int err = 0;
-   u8 lane;
+   int lane;
 
ops = chip->info->ops;
 
if (ops->serdes_pcs_an_restart) {
mv88e6xxx_reg_lock(chip);
lane = mv88e6xxx_serdes_get_lane(chip, port);
-   if (lane)
+   if (lane >= 0)
err = ops->serdes_pcs_an_restart(chip, port, lane);
mv88e6xxx_reg_unlock(chip);
 
@@ -544,11 +544,11 @@ static int mv88e6xxx_serdes_pcs_link_up(struct 
mv88e6xxx_chip *chip, int port,
int speed, int duplex)
 {
const struct mv88e6xxx_ops *ops = chip->info->ops;
-   u8 lane;
+   int lane;
 
if (!phylink_autoneg_inband(mode) && ops->serdes_pcs_link_up) {
lane = mv88e6xxx_serdes_get_lane(chip, port);
-   if (lane)
+   if (lane >= 0)
return ops->serdes_pcs_link_up(chip, port, lane,
   speed, duplex);
}
@@ -2431,11 +2431,11 @@ static irqreturn_t mv88e6xxx_serdes_irq_thread_fn(int 
irq, void *dev_id)
struct mv88e6xxx_chip *chip = mvp->chip;
irqreturn_t ret = IRQ_NONE;
int port = mvp->port;
-   u8 lane;
+   int lane;
 
mv88e6xxx_reg_lock(chip);
lane = mv88e6xxx_serdes_get_lane(chip, port);
-   if (lane)
+   if (lane >= 0)
ret = mv88e6xxx_serdes_irq_status(chip, port, lane);
mv88e6xxx_reg_unlock(chip);
 
@@ -2443,7 +2443,7 @@ static irqreturn_t mv88e6xxx_serdes_irq_thread_fn(int 
irq, void *dev_id)
 }
 
 static int mv88e6xxx_serdes_irq_request(struct mv88e6xxx_chip *chip, int port,
-   u8 lane)
+   int lane)
 {
struct mv88e6xxx_port *dev_id = &chip->ports[port];
unsigned int irq;
@@ -2472,7 +2472,7 @@ static int mv88e6xxx_serdes_irq_request(struct 
mv88e6xxx_chip *chip, int port,
 }
 
 static int mv88e6xxx_serdes_irq_free(struct mv88e6xxx_chip *chip, int port,
-u8 lane)
+int lane)
 {
struct mv88e6xxx_port *dev_id = &chip->ports[port];
unsigned int irq = dev_id->serdes_irq;
@@ -2497,11 +2497,11 @@ static int mv88e6xxx_serdes_irq_free(struct 
mv88e6xxx_chip *chip, int port,
 static int mv88e6xxx_serdes_power(struct mv88e6xxx_chip *chip, int port,
  bool on)
 {
-   u8 lane;
+   int lane;
int err;
 
lane = mv88e6xxx_serdes_get_lane(chip, port);
-   if (!lane)
+   if (lane < 0)
return 0;
 
if (on) {
diff --git a/drivers/net/dsa/mv88e6xxx/chip.h b/drivers/net/dsa/mv88e6xxx/chip.h
index 35430

Re: [PATCH v4 net-next 16/18] net: bonding: ensure .ndo_get_stats64 can sleep

2021-01-08 Thread Nikolay Aleksandrov

On 08/01/2021 02:20, Vladimir Oltean wrote:
> From: Vladimir Oltean 
> 
> There is an effort to convert .ndo_get_stats64 to sleepable context, and
> for that to work, we need to prevent callers of dev_get_stats from using
> atomic locking.
> 
> The bonding driver retrieves its statistics recursively from its lower
> interfaces, with additional care to only count packets sent/received
> while those lowers were actually enslaved to the bond - see commit
> 5f0c5f73e5ef ("bonding: make global bonding stats more reliable").
> 
> Since commit 87163ef9cda7 ("bonding: remove last users of bond->lock and
> bond->lock itself"), the bonding driver uses the following protection
> for its array of slaves: RCU for readers and rtnl_mutex for updaters.
> This is not great because there is another movement [ somehow
> simultaneous with the one to make .ndo_get_stats64 sleepable ] to reduce
> driver usage of rtnl_mutex. This makes sense, because the rtnl_mutex has
> become a very contended resource.
> 
> The aforementioned commit removed an interesting comment:
> 
>   /* [...] we can't hold bond->lock [...] because we'll
>* deadlock. The only solution is to rely on the fact
>* that we're under rtnl_lock here, and the slaves
>* list won't change. This doesn't solve the problem
>* of setting the slave's MTU while it is
>* transmitting, but the assumption is that the base
>* driver can handle that.
>*
>* TODO: figure out a way to safely iterate the slaves
>* list, but without holding a lock around the actual
>* call to the base driver.
>*/
> 
> The above summarizes pretty well the challenges we have with nested
> bonding interfaces (bond over bond over bond over...), which need to be
> addressed by a better locking scheme that also not relies on the bloated
> rtnl_mutex.
> 
> Instead of using something as broad as the rtnl_mutex to ensure
> serialization of updates to the slave array, we can reintroduce a
> private mutex in the bonding driver, called slaves_lock.
> This mutex circles the only updater, bond_update_slave_arr, and ensures
> that whatever other readers want to see a consistent slave array, they
> don't need to hold the rtnl_mutex for that.
> 
> Now _of_course_ I did not convert the entire driver to use
> bond_for_each_slave protected by the bond->slaves_lock, and
> rtnl_dereference to bond_dereference. I just started that process by
> converting the one reader I needed: ndo_get_stats64. Not only is it nice
> to not hold rtnl_mutex in .ndo_get_stats64, but it is also in fact
> forbidden to do so (since top-level callers may hold netif_lists_lock,
> which is a sub-lock of the rtnl_mutex, and therefore this would cause a
> lock inversion and a deadlock).
> 
> To solve the nesting problem, the simple way is to not hold any locks
> when recursing into the slave netdev operation, which is exactly the
> approach that we take. We can "cheat" and use dev_hold to take a
> reference on the slave net_device, which is enough to ensure that
> netdev_wait_allrefs() waits until we finish, and the kernel won't fault.
> However, the slave structure might no longer be valid, just its
> associated net_device. That isn't a biggie. We just need to do some more
> work to ensure that the slave exists after we took the statistics, and
> if it still does, reapply the logic from Andy's commit 5f0c5f73e5ef.
> 
> Signed-off-by: Vladimir Oltean 
> ---
> Changes in v4:
> Now there is code to propagate errors.
> 
> Changes in v3:
> - Added kfree(dev_stats);
> - Removed the now useless stats_lock (bond->bond_stats and
>   slave->slave_stats are now protected by bond->slaves_lock)
> 
> Changes in v2:
> Switched to the new scheme of holding just a refcnt to the slave
> interfaces while recursing with dev_get_stats.
> 
>  drivers/net/bonding/bond_main.c | 113 ++--
>  include/net/bonding.h   |  49 +-
>  2 files changed, 99 insertions(+), 63 deletions(-)
> 
[snip]
> +static inline int bond_get_slave_arr(struct bonding *bond,
> +  struct net_device ***slaves,
> +  int *num_slaves)
> +{
> + struct net *net = dev_net(bond->dev);
> + struct list_head *iter;
> + struct slave *slave;
> + int i = 0;
> +
> + mutex_lock(&bond->slaves_lock);
> +
> + *slaves = kcalloc(bond->slave_cnt, sizeof(*slaves), GFP_KERNEL);
> + if (!(*slaves)) {
> + netif_lists_unlock(net);
> + return -ENOMEM;
> + }

The error path looks wrong, you unlock netif_lists and return with slaves_lock 
held.

Cheers,
 Nik

> +
> + bond_for_each_slave(bond, slave, iter) {
> + dev_hold(slave->dev);
> + *slaves[i++] = slave->dev;
> + }
> +
> + *num_slaves = bond->slave_cnt;
> +
> + mutex_unlock(&bond->slaves_lock);
> +
> + return 0;
> +}
> +
> +static inline void bond_put_slave_arr(struct net_device

[PATCH] rndis_host: set proper input size for OID_GEN_PHYSICAL_MEDIUM request

2021-01-08 Thread Andrey Zhizhikin

MSFT ActiveSync implementation requires that the size of the response for
incoming query is to be provided in the request input length. Failure to
set the input size proper results in failed request transfer, where the
ActiveSync counterpart reports the NDIS_STATUS_INVALID_LENGTH (0xC0010014L)
error.

Set the input size for OID_GEN_PHYSICAL_MEDIUM query to the expected size
of the response in order for the ActiveSync to properly respond to the
request.

Fixes: 039ee17d1baa ("rndis_host: Add RNDIS physical medium checking into 
generic_rndis_bind()")
Signed-off-by: Andrey Zhizhikin 
---
 drivers/net/usb/rndis_host.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/usb/rndis_host.c b/drivers/net/usb/rndis_host.c
index 6609d21ef894..f813ca9dec53 100644
--- a/drivers/net/usb/rndis_host.c
+++ b/drivers/net/usb/rndis_host.c
@@ -387,7 +387,7 @@ generic_rndis_bind(struct usbnet *dev, struct usb_interface 
*intf, int flags)
reply_len = sizeof *phym;
retval = rndis_query(dev, intf, u.buf,
 RNDIS_OID_GEN_PHYSICAL_MEDIUM,
-0, (void **) &phym, &reply_len);
+reply_len, (void **)&phym, &reply_len);
if (retval != 0 || !phym) {
/* OID is optional so don't fail here. */
phym_unspec = cpu_to_le32(RNDIS_PHYSICAL_MEDIUM_UNSPECIFIED);
-- 
2.25.1

Re: [net-next 15/19] can: tcan4x5x: rework SPI access

2021-01-08 Thread Ahmad Fatoum

On 07.01.21 23:38, Jakub Kicinski wrote:
> On Thu, 7 Jan 2021 22:17:15 +0100 Marc Kleine-Budde wrote:
>>> +struct __packed tcan4x5x_buf_cmd { 
>>> +   u8 cmd; 
>>> +   __be16 addr; 
>>> +   u8 len; 
>>> +};   
>>
>> This has to be packed, as I assume the compiler would add some space after 
>> the
>> "u8 cmd" to align the __be16 naturally.
> 
> Ack, packing this one makes sense.
> 
>>> +struct __packed tcan4x5x_map_buf { 
>>> +   struct tcan4x5x_buf_cmd cmd; 
>>> +   u8 data[256 * sizeof(u32)]; 
>>> +} cacheline_aligned;   
>>
>> Due to the packing of the struct tcan4x5x_buf_cmd it should have a length of 
>> 4
>> bytes. Without __packed, will the "u8 data" come directly after the cmd?
> 
> Yup, u8 with no alignment attribute will follow the previous
> field with no holes.

__packed has a documentation benefit though. It documents that the author
considers the current layout to be the only correct one. (and thus extra
care should be taken when modifying it).

> 
> 

-- 
Pengutronix e.K.   | |
Steuerwalder Str. 21   | http://www.pengutronix.de/  |
31137 Hildesheim, Germany  | Phone: +49-5121-206917-0|
Amtsgericht Hildesheim, HRA 2686   | Fax:   +49-5121-206917- |

Re: [PATCH net v3] net: fix use-after-free when UDP GRO with shared fraglist

2021-01-08 Thread Daniel Borkmann


On 1/8/21 3:28 AM, Dongseok Yi wrote:

skbs in fraglist could be shared by a BPF filter loaded at TC. If TC
writes, it will call skb_ensure_writable -> pskb_expand_head to create
a private linear section for the head_skb. And then call
skb_clone_fraglist -> skb_get on each skb in the fraglist.

skb_segment_list overwrites part of the skb linear section of each
fragment itself. Even after skb_clone, the frag_skbs share their
linear section with their clone in PF_PACKET.

Both sk_receive_queue of PF_PACKET and PF_INET (or PF_INET6) can have
a link for the same frag_skbs chain. If a new skb (not frags) is
queued to one of the sk_receive_queue, multiple ptypes can see and
release this. It causes use-after-free.

[ 4443.426215] [ cut here ]
[ 4443.426222] refcount_t: underflow; use-after-free.
[ 4443.426291] WARNING: CPU: 7 PID: 28161 at lib/refcount.c:190
refcount_dec_and_test_checked+0xa4/0xc8
[ 4443.426726] pstate: 6045 (nZCv daif +PAN -UAO)
[ 4443.426732] pc : refcount_dec_and_test_checked+0xa4/0xc8
[ 4443.426737] lr : refcount_dec_and_test_checked+0xa0/0xc8
[ 4443.426808] Call trace:
[ 4443.426813]  refcount_dec_and_test_checked+0xa4/0xc8
[ 4443.426823]  skb_release_data+0x144/0x264
[ 4443.426828]  kfree_skb+0x58/0xc4
[ 4443.426832]  skb_queue_purge+0x64/0x9c
[ 4443.426844]  packet_set_ring+0x5f0/0x820
[ 4443.426849]  packet_setsockopt+0x5a4/0xcd0
[ 4443.426853]  __sys_setsockopt+0x188/0x278
[ 4443.426858]  __arm64_sys_setsockopt+0x28/0x38
[ 4443.426869]  el0_svc_common+0xf0/0x1d0
[ 4443.426873]  el0_svc_handler+0x74/0x98
[ 4443.426880]  el0_svc+0x8/0xc

Fixes: 3a1296a38d0c (net: Support GRO/GSO fraglist chaining.)
Signed-off-by: Dongseok Yi 
Acked-by: Willem de Bruijn 


Acked-by: Daniel Borkmann

Re: [PATCH 1/5] vsock/virtio: support for SOCK_SEQPACKET socket.

2021-01-08 Thread Stefano Garzarella


On Sun, Jan 03, 2021 at 10:57:50PM +0300, Arseny Krasnov wrote:

This extends rx loop for SOCK_SEQPACKET packets and implements
callback which user calls to copy data to its buffer.


Please write a better commit message explaining the changes, e.g. that 
you are using 'flags' to transport lengths when 'op' ==

VIRTIO_VSOCK_OP_SEQ_BEGIN.



Signed-off-by: Arseny Krasnov 
---
include/linux/virtio_vsock.h|   7 +
include/net/af_vsock.h  |   4 +
include/uapi/linux/virtio_vsock.h   |   9 +
net/vmw_vsock/virtio_transport.c|   3 +
net/vmw_vsock/virtio_transport_common.c | 323 +---
5 files changed, 305 insertions(+), 41 deletions(-)

diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index dc636b727179..4902d71b3252 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -36,6 +36,10 @@ struct virtio_vsock_sock {
u32 rx_bytes;
u32 buf_alloc;
struct list_head rx_queue;
+
+   /* For SOCK_SEQPACKET */
+   u32 user_read_seq_len;
+   u32 user_read_copied;
};

struct virtio_vsock_pkt {
@@ -80,6 +84,9 @@ virtio_transport_dgram_dequeue(struct vsock_sock *vsk,
   struct msghdr *msg,
   size_t len, int flags);

+bool virtio_transport_seqpacket_seq_send_len(struct vsock_sock *vsk, size_t 
len);
+size_t virtio_transport_seqpacket_seq_get_len(struct vsock_sock *vsk);
+
s64 virtio_transport_stream_has_data(struct vsock_sock *vsk);
s64 virtio_transport_stream_has_space(struct vsock_sock *vsk);

diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
index b1c717286993..792ea7b66574 100644
--- a/include/net/af_vsock.h
+++ b/include/net/af_vsock.h
@@ -135,6 +135,10 @@ struct vsock_transport {
bool (*stream_is_active)(struct vsock_sock *);
bool (*stream_allow)(u32 cid, u32 port);

+   /* SEQ_PACKET. */
+   bool (*seqpacket_seq_send_len)(struct vsock_sock *, size_t len);
+   size_t (*seqpacket_seq_get_len)(struct vsock_sock *);
+


These changes are related to the vsock core, so please move to another 
patch.



/* Notification. */
int (*notify_poll_in)(struct vsock_sock *, size_t, bool *);
int (*notify_poll_out)(struct vsock_sock *, size_t, bool *);
diff --git a/include/uapi/linux/virtio_vsock.h 
b/include/uapi/linux/virtio_vsock.h
index 1d57ed3d84d2..058908bc19fc 100644
--- a/include/uapi/linux/virtio_vsock.h
+++ b/include/uapi/linux/virtio_vsock.h
@@ -65,6 +65,7 @@ struct virtio_vsock_hdr {

enum virtio_vsock_type {
VIRTIO_VSOCK_TYPE_STREAM = 1,
+   VIRTIO_VSOCK_TYPE_SEQPACKET = 2,
};

enum virtio_vsock_op {
@@ -83,6 +84,9 @@ enum virtio_vsock_op {
VIRTIO_VSOCK_OP_CREDIT_UPDATE = 6,
/* Request the peer to send the credit info to us */
VIRTIO_VSOCK_OP_CREDIT_REQUEST = 7,
+
+   /* Record begin for SOCK_SEQPACKET */
+   VIRTIO_VSOCK_OP_SEQ_BEGIN = 8,
};

/* VIRTIO_VSOCK_OP_SHUTDOWN flags values */
@@ -91,4 +95,9 @@ enum virtio_vsock_shutdown {
VIRTIO_VSOCK_SHUTDOWN_SEND = 2,
};

+/* VIRTIO_VSOCK_OP_RW flags values for SOCK_SEQPACKET type */
+enum virtio_vsock_rw_seqpacket {
+   VIRTIO_VSOCK_RW_EOR = 1,
+};
+
#endif /* _UAPI_LINUX_VIRTIO_VSOCK_H */
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index 2700a63ab095..2bd3f7cbffcb 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -469,6 +469,9 @@ static struct virtio_transport virtio_transport = {
.stream_is_active = virtio_transport_stream_is_active,
.stream_allow = virtio_transport_stream_allow,

+   .seqpacket_seq_send_len   = 
virtio_transport_seqpacket_seq_send_len,
+   .seqpacket_seq_get_len= 
virtio_transport_seqpacket_seq_get_len,
+
.notify_poll_in   = virtio_transport_notify_poll_in,
.notify_poll_out  = virtio_transport_notify_poll_out,
.notify_recv_init = virtio_transport_notify_recv_init,
diff --git a/net/vmw_vsock/virtio_transport_common.c 
b/net/vmw_vsock/virtio_transport_common.c
index 5956939eebb7..77c42004e422 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -139,6 +139,7 @@ static struct sk_buff *virtio_transport_build_skb(void 
*opaque)
break;
case VIRTIO_VSOCK_OP_CREDIT_UPDATE:
case VIRTIO_VSOCK_OP_CREDIT_REQUEST:
+   case VIRTIO_VSOCK_OP_SEQ_BEGIN:
hdr->op = cpu_to_le16(AF_VSOCK_OP_CONTROL);
break;
default:
@@ -157,6 +158,10 @@ static struct sk_buff *virtio_transport_build_skb(void 
*opaque)

void virtio_transport_deliver_tap_pkt(struct virtio_vsock_pkt *pkt)
{
+   /* TODO: implement tap support for SOCK_SEQPACKET. */
+   if (le32_to_cpu(pkt->hdr.type) == VIRTIO_VSOCK_TYPE_SEQPACKET)
+

Re: [PATCH v4 net-next 08/18] net: make dev_get_stats return void

2021-01-08 Thread Vladimir Oltean

On Fri, Jan 08, 2021 at 11:14:50AM +0100, Eric Dumazet wrote:
> On Fri, Jan 8, 2021 at 1:20 AM Vladimir Oltean  wrote:
> >
> > From: Vladimir Oltean 
> >
> > After commit 28172739f0a2 ("net: fix 64 bit counters on 32 bit arches"),
> > dev_get_stats got an additional argument for storage of statistics. At
> > this point, dev_get_stats could return either the passed "storage"
> > argument, or the output of .ndo_get_stats64.
> >
> > Then commit caf586e5f23c ("net: add a core netdev->rx_dropped counter")
> > came, and the output of .ndo_get_stats64 (still returning a pointer to
> > struct rtnl_link_stats64) started being ignored.
> >
> > Then came commit bc1f44709cf2 ("net: make ndo_get_stats64 a void
> > function") which made .ndo_get_stats64 stop returning anything.
> >
> > So now, dev_get_stats always reports the "storage" pointer received as
> > argument. This is useless. Some drivers are dealing with unnecessary
> > complexity due to this, so refactor them to ignore the return value
> > completely.
> >
> > Signed-off-by: Vladimir Oltean 
> > ---
> >
> 
> This seems like a lot of code churn.

Not sure there's something I can do to avoid that.

> Ultimately we need this function to return an error code, so why keep
> this patch with a void return ?
> 
> Please squash your patches a bit, to avoid having 18 patches to review.

Because the "make dev_get_stats return void" patch changes the callers
to poke through their stack-supplied struct rtnl_link_stats64 instead of
through the returned pointer. So all changes within this patch are of
the same type: replace a pointer dereference with a plain structure
field access. Whereas the "allow ndo_get_stats64 to return an int error
code" touches a completely unrelated portion: the ndo_get_stats64
callback. Again, that patch does one thing and one thing only. Then
there's the error checking, which is split in 3 patches:
- Special cases with non-trivial work to do: FCoE, OVS
- Propagation of errors from dev_get_stats
- Termination of errors from dev_get_stats

So you would like me to squash what exactly? I know there's a lot of
patches, but if I go ahead and combine them, it'll be even more
difficult to review, due to the mix of types of changes being applied.

> Additionally I would suggest a __must_check attribute on
> dev_get_stats() to make sure we converted all callers.

Ok, but that will mean even more patches (since the error checking is
done in 3 stages, the __must_check must be put at the end). And remember,
the inflation of this series from 12 to 18 patches came from your
suggestion to propagate the errors now.

> I can not convince myself that after your patches, bonding does the
> right thing...

Why?

Re: [PATCH net v2] net: fix use-after-free when UDP GRO with shared fraglist

2021-01-08 Thread Daniel Borkmann


On 1/7/21 3:44 PM, Willem de Bruijn wrote:

On Thu, Jan 7, 2021 at 8:33 AM Daniel Borkmann  wrote:

On 1/7/21 2:05 PM, Willem de Bruijn wrote:

On Thu, Jan 7, 2021 at 7:52 AM Daniel Borkmann  wrote:

On 1/7/21 12:40 PM, Dongseok Yi wrote:

On 2021-01-07 20:05, Daniel Borkmann wrote:

On 1/7/21 1:39 AM, Dongseok Yi wrote:

[...]

PF_PACKET handles untouched fraglist. To modify the payload only
for udp_rcv_segment, skb_unclone is necessary.


I don't parse this last sentence here, please elaborate in more detail on why
it is necessary.

For example, if tc layer would modify mark on the skb, then __copy_skb_header()
in skb_segment_list() will propagate it. If tc layer would modify payload, the
skb_ensure_writable() will take care of that internally and if needed pull in
parts from fraglist into linear section to make it private. The purpose of the
skb_clone() above iff shared is to make the struct itself private (to safely
modify its struct members). What am I missing?


If tc writes, it will call skb_make_writable and thus pskb_expand_head
to create a private linear section for the head_skb.

skb_segment_list overwrites part of the skb linear section of each
fragment itself. Even after skb_clone, the frag_skbs share their
linear section with their clone in pf_packet, so we need a
pskb_expand_head here, too.


Ok, got it, thanks for the explanation. Would be great to have it in the v3 
commit
log as well as that was more clear than the above. Too bad in that case (otoh
the pf_packet situation could be considered corner case ...); ether way, fix 
makes
sense then.


Thanks for double checking the tricky logic. Pf_packet + BPF is indeed
a peculiar corner case. But there are perhaps more, like raw sockets,
and other BPF hooks that can trigger an skb_make_writable.

Come to think of it, the no touching shared data rule is also violated
without a BPF program? Then there is nothing in the frag_skbs
themselves signifying that they are shared, but the head_skb is
cloned, so its data may still not be modified.


The skb_ensure_writable() is used in plenty of places from bpf, ovs, netfilter
to core stack (e.g. vlan, mpls, icmp), but either way there should be nothing
wrong with that as it's making sure to pull the data into its linear section
before modification. Uncareful use of skb_store_bits() with offset into 
skb_frags
for example could defeat that, but I don't see any in-tree occurrence that would
be problematic at this point..


This modification happens to be safe in practice, as the pf_packet
clones don't access the bytes before skb->data where this path inserts
headers. But still.

Re: [PATCH 0/5] virtio/vsock: introduce SOCK_SEQPACKET support.

2021-01-08 Thread Stefano Garzarella


Hi Arseny,

On Sun, Jan 03, 2021 at 10:54:52PM +0300, Arseny Krasnov wrote:

As SOCK_SEQPACKET guarantees to save record boundaries, so to
do it, new packet operation was added: it marks start of record (with
record length in header). To send record, packet with start marker is
sent first, then all data is transmitted as 'RW' packets. On receiver's
side, length of record is known from packet with start record marker.
Now as  packets of one socket are not reordered neither on vsock nor on
vhost transport layers, these marker allows to restore original record
on receiver's side. When each 'RW' packet is inserted to rx queue of
receiver, user is woken up, data is copied to user's buffer and credit
update message is sent. If there is no user waiting for data, credit
won't be updated and sender will wait. Also,  if user's buffer is full,
and record is bigger, all unneeded data will be dropped (with sending of
credit update message).
'MSG_EOR' flag is implemented with special value of 'flags' field
in packet header. When record is received with such flags, 'MSG_EOR' is
set in 'recvmsg()' flags. 'MSG_TRUNC' flag is also supported.
In this implementation maximum length of datagram is not limited
as in stream socket.


I did a a quick review. I like the idea of adding SOCK_SEQPACKET, but 
the series needs more work.

Some patches miss the SoB, the commit messages are very minimal.
Anyway I like that you shared your patches, but please use RFC tag if 
they are not ready to be merged.


Another suggestion is to move the patches that modify the core 
(af_vsock.c) before the transport modifications to make the review 
easier.


I'd also like to see new tests in tools/testing/vsock/vsock_test.c

Thanks,
Stefano

RE: [PATCH 5/9 next] scsi: Use iovec_import() instead of import_iovec().

2021-01-08 Thread David Laight

From: Christoph Hellwig
> Sent: 21 September 2020 15:22
> 
> So looking at the various callers I'm not sure this API is the
> best.  If we want to do something fancy I'd hide the struct iovec
> instances entirely with something like:
> 
> struct iov_storage {
>   struct iovec stack[UIO_FASTIOV], *vec;
> }
> 
> int iov_iter_import_iovec(struct iov_iter *iter, struct iov_storage *s,
>   const struct iovec __user *vec, unsigned long nr_segs,
>   int type);
> 
> and then add a new helper to free the thing if needed:
> 
> void iov_iter_release_iovec(struct iov_storage *s)
> {
>   if (s->vec != s->stack)
>   kfree(s->vec);
> }

I've been looking at this code again now most of the pending changes
are in Linus's tree.

I was actually looking at going one stage further.
The 'iov_iter' is always allocated with the 'iov_storage' *above).
Usually both are on the callers stack - possibly in different functions.

So add:
struct iovec_iter {
struct iov_iter iter;
struct iovec to_free;
struct iovec stack[UIO_FASTIOV];
};

int __iovec_import(struct iovec_iter *, const struct iovec __user *vec,
unsigned long nr_segs, int type, bool compat);

And a 'clean' function to do kfree(iovec->to_free);

This reduces the complexity of most of the callers.

I started doing the changes, but got in a mess in io_uring.c (as usual).
I think I've got a patch pending (in my brain) to simplify the io_uring code.

The plan is to add:
if (iter->iov != xxx->to_free)
iter->iov = xxx->stack;
Prior to every use of the iter.
This fixes up anything that got broken by a memcpy() of the fields.
The tidyup code is then always kfree(xxx->to_free).

David



-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, 
UK
Registration No: 1397386 (Wales)

[PATCH net-next 2/2] net-gro: remove GRO_DROP

2021-01-08 Thread Eric Dumazet

From: Eric Dumazet 

GRO_DROP can only be returned from napi_gro_frags()
if the skb has not been allocated by a prior napi_get_frags()

Since drivers must use napi_get_frags() and test its result
before populating the skb with metadata, we can safely remove
GRO_DROP since it offers no practical use.

Signed-off-by: Eric Dumazet 
Cc: Jesse Brandeburg 
---
 include/linux/netdevice.h |  1 -
 net/core/dev.c| 11 ---
 2 files changed, 12 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 
1ec3ac5d5bbffe6062216fbd4009e88d8c909fa9..5b949076ed2319fc676a7172350480efea5807d9
 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -376,7 +376,6 @@ enum gro_result {
GRO_MERGED_FREE,
GRO_HELD,
GRO_NORMAL,
-   GRO_DROP,
GRO_CONSUMED,
 };
 typedef enum gro_result gro_result_t;
diff --git a/net/core/dev.c b/net/core/dev.c
index 
7afbb642e203ad1556e96e2fc7595b6289152201..e4d77c8abe761408caf3a0d1880727f33b5134b6
 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6070,10 +6070,6 @@ static gro_result_t napi_skb_finish(struct napi_struct 
*napi,
gro_normal_one(napi, skb);
break;
 
-   case GRO_DROP:
-   kfree_skb(skb);
-   break;
-
case GRO_MERGED_FREE:
if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
napi_skb_free_stolen_head(skb);
@@ -6158,10 +6154,6 @@ static gro_result_t napi_frags_finish(struct napi_struct 
*napi,
gro_normal_one(napi, skb);
break;
 
-   case GRO_DROP:
-   napi_reuse_skb(napi, skb);
-   break;
-
case GRO_MERGED_FREE:
if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
napi_skb_free_stolen_head(skb);
@@ -6223,9 +6215,6 @@ gro_result_t napi_gro_frags(struct napi_struct *napi)
gro_result_t ret;
struct sk_buff *skb = napi_frags_skb(napi);
 
-   if (!skb)
-   return GRO_DROP;
-
trace_napi_gro_frags_entry(skb);
 
ret = napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
-- 
2.29.2.729.g45daf8777d-goog

[PATCH net-next 0/2] net-gro: GRO_DROP deprecation

2021-01-08 Thread Eric Dumazet

From: Eric Dumazet 

GRO_DROP has no practical use and can be removed,
once ice driver is cleaned up.

This removes one useless conditionel test in napi_gro_frags()

Eric Dumazet (2):
  ice: drop dead code in ice_receive_skb()
  net-gro: remove GRO_DROP

 drivers/net/ethernet/intel/ice/ice_txrx_lib.c |  7 +--
 include/linux/netdevice.h |  1 -
 net/core/dev.c| 11 ---
 3 files changed, 1 insertion(+), 18 deletions(-)

-- 
2.29.2.729.g45daf8777d-goog

[PATCH net-next 1/2] ice: drop dead code in ice_receive_skb()

2021-01-08 Thread Eric Dumazet

From: Eric Dumazet 

napi_gro_receive() can never return GRO_DROP

GRO_DROP can only be returned from napi_gro_frags()
which is the other NAPI GRO entry point.

Followup patch will remove GRO_DROP, because drivers
are not supposed to call napi_gro_frags() if prior
napi_get_frags() has failed.

Note that I have left the gro_dropped variable. I leave to ice
maintainers the decision to further remove it from ethtool -S results.

Signed-off-by: Eric Dumazet 
Cc: Jesse Brandeburg 
---
 drivers/net/ethernet/intel/ice/ice_txrx_lib.c | 7 +--
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c 
b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
index 
bc2f4390b51dc67b2ef8cf929a6451a0259e3d51..02b12736ea80d4551d3d3460339eb0706941d22f
 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
@@ -191,12 +191,7 @@ ice_receive_skb(struct ice_ring *rx_ring, struct sk_buff 
*skb, u16 vlan_tag)
if ((rx_ring->netdev->features & NETIF_F_HW_VLAN_CTAG_RX) &&
(vlan_tag & VLAN_VID_MASK))
__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlan_tag);
-   if (napi_gro_receive(&rx_ring->q_vector->napi, skb) == GRO_DROP) {
-   /* this is tracked separately to help us debug stack drops */
-   rx_ring->rx_stats.gro_dropped++;
-   netdev_dbg(rx_ring->netdev, "Receive Queue %d: Dropped packet 
from GRO\n",
-  rx_ring->q_index);
-   }
+   napi_gro_receive(&rx_ring->q_vector->napi, skb);
 }
 
 /**
-- 
2.29.2.729.g45daf8777d-goog

[PATCH net] netfilter: conntrack: fix reading nf_conntrack_buckets

2021-01-08 Thread Jesper Dangaard Brouer

The old way of changing the conntrack hashsize runtime was through changing
the module param via file /sys/module/nf_conntrack/parameters/hashsize. This
was extended to sysctl change in commit 3183ab8997a4 ("netfilter: conntrack:
allow increasing bucket size via sysctl too").

The commit introduced second "user" variable nf_conntrack_htable_size_user
which shadow actual variable nf_conntrack_htable_size. When hashsize is
changed via module param this "user" variable isn't updated. This results in
sysctl net/netfilter/nf_conntrack_buckets shows the wrong value when users
update via the old way.

This patch fix the issue by always updating "user" variable when reading the
proc file. This will take care of changes to the actual variable without
sysctl need to be aware.

Fixes: 3183ab8997a4 ("netfilter: conntrack: allow increasing bucket size via 
sysctl too")
Reported-by: Yoel Caspersen 
Signed-off-by: Jesper Dangaard Brouer 
---
 net/netfilter/nf_conntrack_standalone.c |3 +++
 1 file changed, 3 insertions(+)

diff --git a/net/netfilter/nf_conntrack_standalone.c 
b/net/netfilter/nf_conntrack_standalone.c
index 46c5557c1fec..0ee702d374b0 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -523,6 +523,9 @@ nf_conntrack_hash_sysctl(struct ctl_table *table, int write,
 {
int ret;
 
+   /* module_param hashsize could have changed value */
+   nf_conntrack_htable_size_user = nf_conntrack_htable_size;
+
ret = proc_dointvec(table, write, buffer, lenp, ppos);
if (ret < 0 || !write)
return ret;

RE: [PATCH 05/11] iov_iter: merge the compat case into rw_copy_check_uvector

2021-01-08 Thread David Laight

From: Christoph Hellwig 
> Sent: 21 September 2020 15:34
> 
> Stop duplicating the iovec verify code, and instead add add a
> __import_iovec helper that does the whole verify and import, but takes
> a bool compat to decided on the native or compat layout.  This also
> ends up massively simplifying the calling conventions.
> 
> Signed-off-by: Christoph Hellwig 
> ---
>  lib/iov_iter.c | 195 ++---
>  1 file changed, 70 insertions(+), 125 deletions(-)
> 
> diff --git a/lib/iov_iter.c b/lib/iov_iter.c
> index a64867501a7483..8bfa47b63d39aa 100644
> --- a/lib/iov_iter.c
> +++ b/lib/iov_iter.c
> @@ -10,6 +10,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
> 
>  #define PIPE_PARANOIA /* for now */
> 
> @@ -1650,43 +1651,76 @@ const void *dup_iter(struct iov_iter *new, struct 
> iov_iter *old, gfp_t flags)
>  }
>  EXPORT_SYMBOL(dup_iter);
> 
> -static ssize_t rw_copy_check_uvector(int type,
> - const struct iovec __user *uvector, unsigned long nr_segs,
> - unsigned long fast_segs, struct iovec *fast_pointer,
> - struct iovec **ret_pointer)
> +static int compat_copy_iovecs_from_user(struct iovec *iov,
> + const struct iovec __user *uvector, unsigned long nr_segs)
> +{
> + const struct compat_iovec __user *uiov =
> + (const struct compat_iovec __user *)uvector;
> + unsigned long i;
> + int ret = -EFAULT;
> +
> + if (!user_access_begin(uvector, nr_segs * sizeof(*uvector)))
> + return -EFAULT;

I little bit late, but the above isn't quite right.
It should be sizeof(*iouv) - the length is double what it should be.

Not that access_ok() can fail for compat addresses
and the extra length won't matter for architectures that
need the address/length to open an address hole into userspace.

David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, 
UK
Registration No: 1397386 (Wales)

Re: [PATCH net] netfilter: conntrack: fix reading nf_conntrack_buckets

2021-01-08 Thread Florian Westphal

Jesper Dangaard Brouer  wrote:
> The old way of changing the conntrack hashsize runtime was through changing
> the module param via file /sys/module/nf_conntrack/parameters/hashsize. This
> was extended to sysctl change in commit 3183ab8997a4 ("netfilter: conntrack:
> allow increasing bucket size via sysctl too").
> 
> The commit introduced second "user" variable nf_conntrack_htable_size_user
> which shadow actual variable nf_conntrack_htable_size. When hashsize is
> changed via module param this "user" variable isn't updated. This results in
> sysctl net/netfilter/nf_conntrack_buckets shows the wrong value when users
> update via the old way.

Oh, right!

Acked-by: Florian Westphal

[PATCH net-next v2 0/3] r8169: small improvements

2021-01-08 Thread Heiner Kallweit

This series includes a number of smaller improvements.

v2:
- return on WARN in patch 1

Heiner Kallweit (3):
  r8169: replace BUG_ON with WARN in _rtl_eri_write
  r8169: improve rtl_ocp_reg_failure
  r8169: don't wakeup-enable device on shutdown if WOL is disabled

 drivers/net/ethernet/realtek/r8169_main.c | 23 ++-
 1 file changed, 10 insertions(+), 13 deletions(-)

-- 
2.30.0

[PATCH net-next v2 1/3] r8169: replace BUG_ON with WARN in _rtl_eri_write

2021-01-08 Thread Heiner Kallweit

Use WARN here to avoid stopping the system. In addition print the addr
and mask values that triggered the warning.

v2:
- return on WARN to avoid an invalid register write

Signed-off-by: Heiner Kallweit 
---
 drivers/net/ethernet/realtek/r8169_main.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/realtek/r8169_main.c 
b/drivers/net/ethernet/realtek/r8169_main.c
index c9abc7ccb..317b34723 100644
--- a/drivers/net/ethernet/realtek/r8169_main.c
+++ b/drivers/net/ethernet/realtek/r8169_main.c
@@ -763,7 +763,9 @@ static void _rtl_eri_write(struct rtl8169_private *tp, int 
addr, u32 mask,
 {
u32 cmd = ERIAR_WRITE_CMD | type | mask | addr;
 
-   BUG_ON((addr & 3) || (mask == 0));
+   if (WARN(addr & 3 || !mask, "addr: 0x%x, mask: 0x%08x\n", addr, mask))
+   return;
+
RTL_W32(tp, ERIDR, val);
r8168fp_adjust_ocp_cmd(tp, &cmd, type);
RTL_W32(tp, ERIAR, cmd);
-- 
2.30.0

[PATCH net-next v2 3/3] r8169: don't wakeup-enable device on shutdown if WOL is disabled

2021-01-08 Thread Heiner Kallweit

If WOL isn't enabled, then there's no need to enable wakeup from D3
on system shutdown.

Signed-off-by: Heiner Kallweit 
---
 drivers/net/ethernet/realtek/r8169_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/realtek/r8169_main.c 
b/drivers/net/ethernet/realtek/r8169_main.c
index 6005d37b6..982e6b2f0 100644
--- a/drivers/net/ethernet/realtek/r8169_main.c
+++ b/drivers/net/ethernet/realtek/r8169_main.c
@@ -4876,7 +4876,7 @@ static void rtl_shutdown(struct pci_dev *pdev)
rtl_wol_shutdown_quirk(tp);
}
 
-   pci_wake_from_d3(pdev, true);
+   pci_wake_from_d3(pdev, tp->saved_wolopts);
pci_set_power_state(pdev, PCI_D3hot);
}
 }
-- 
2.30.0

[PATCH net-next v2 2/3] r8169: improve rtl_ocp_reg_failure

2021-01-08 Thread Heiner Kallweit

Use WARN_ONCE here to get a call trace in case of a problem.
This facilitates finding the offending code part.

Signed-off-by: Heiner Kallweit 
---
 drivers/net/ethernet/realtek/r8169_main.c | 17 ++---
 1 file changed, 6 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/realtek/r8169_main.c 
b/drivers/net/ethernet/realtek/r8169_main.c
index 9af048ad0..6005d37b6 100644
--- a/drivers/net/ethernet/realtek/r8169_main.c
+++ b/drivers/net/ethernet/realtek/r8169_main.c
@@ -810,14 +810,9 @@ static void rtl_eri_clear_bits(struct rtl8169_private *tp, 
int addr, u32 m)
rtl_w0w1_eri(tp, addr, 0, m);
 }
 
-static bool rtl_ocp_reg_failure(struct rtl8169_private *tp, u32 reg)
+static bool rtl_ocp_reg_failure(u32 reg)
 {
-   if (reg & 0x0001) {
-   if (net_ratelimit())
-   netdev_err(tp->dev, "Invalid ocp reg %x!\n", reg);
-   return true;
-   }
-   return false;
+   return WARN_ONCE(reg & 0x0001, "Invalid ocp reg %x!\n", reg);
 }
 
 DECLARE_RTL_COND(rtl_ocp_gphy_cond)
@@ -827,7 +822,7 @@ DECLARE_RTL_COND(rtl_ocp_gphy_cond)
 
 static void r8168_phy_ocp_write(struct rtl8169_private *tp, u32 reg, u32 data)
 {
-   if (rtl_ocp_reg_failure(tp, reg))
+   if (rtl_ocp_reg_failure(reg))
return;
 
RTL_W32(tp, GPHY_OCP, OCPAR_FLAG | (reg << 15) | data);
@@ -837,7 +832,7 @@ static void r8168_phy_ocp_write(struct rtl8169_private *tp, 
u32 reg, u32 data)
 
 static int r8168_phy_ocp_read(struct rtl8169_private *tp, u32 reg)
 {
-   if (rtl_ocp_reg_failure(tp, reg))
+   if (rtl_ocp_reg_failure(reg))
return 0;
 
RTL_W32(tp, GPHY_OCP, reg << 15);
@@ -848,7 +843,7 @@ static int r8168_phy_ocp_read(struct rtl8169_private *tp, 
u32 reg)
 
 static void r8168_mac_ocp_write(struct rtl8169_private *tp, u32 reg, u32 data)
 {
-   if (rtl_ocp_reg_failure(tp, reg))
+   if (rtl_ocp_reg_failure(reg))
return;
 
RTL_W32(tp, OCPDR, OCPAR_FLAG | (reg << 15) | data);
@@ -856,7 +851,7 @@ static void r8168_mac_ocp_write(struct rtl8169_private *tp, 
u32 reg, u32 data)
 
 static u16 r8168_mac_ocp_read(struct rtl8169_private *tp, u32 reg)
 {
-   if (rtl_ocp_reg_failure(tp, reg))
+   if (rtl_ocp_reg_failure(reg))
return 0;
 
RTL_W32(tp, OCPDR, reg << 15);
-- 
2.30.0

[PATCH net-next] r8169: deprecate support for RTL_GIGA_MAC_VER_27

2021-01-08 Thread Heiner Kallweit

RTL8168dp is ancient anyway, and I haven't seen any trace of its early
version 27 yet. This chip versions needs quite some special handling,
therefore it would facilitate driver maintenance if support for it
could be dropped. For now just disable detection of this chip version.
If nobody complains we can remove support for it in the near future.

Signed-off-by: Heiner Kallweit 
---
 drivers/net/ethernet/realtek/r8169_main.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/realtek/r8169_main.c 
b/drivers/net/ethernet/realtek/r8169_main.c
index e72d8f3ae..f94965615 100644
--- a/drivers/net/ethernet/realtek/r8169_main.c
+++ b/drivers/net/ethernet/realtek/r8169_main.c
@@ -1962,7 +1962,11 @@ static enum mac_version rtl8169_get_mac_version(u16 xid, 
bool gmii)
{ 0x7c8, 0x280, RTL_GIGA_MAC_VER_26 },
 
/* 8168DP family. */
-   { 0x7cf, 0x288, RTL_GIGA_MAC_VER_27 },
+   /* It seems this early RTL8168dp version never made it to
+* the wild. Let's see whether somebody complains, if not
+* we'll remove support for this chip version completely.
+* { 0x7cf, 0x288,  RTL_GIGA_MAC_VER_27 },
+*/
{ 0x7cf, 0x28a, RTL_GIGA_MAC_VER_28 },
{ 0x7cf, 0x28b, RTL_GIGA_MAC_VER_31 },
 
-- 
2.29.2

Re: [PATCH net-next 2/2] net-gro: remove GRO_DROP

2021-01-08 Thread Edward Cree

On 08/01/2021 11:39, Eric Dumazet wrote:
> From: Eric Dumazet 
> 
> GRO_DROP can only be returned from napi_gro_frags()
> if the skb has not been allocated by a prior napi_get_frags()
> 
> Since drivers must use napi_get_frags() and test its result
> before populating the skb with metadata, we can safely remove
> GRO_DROP since it offers no practical use.
> 
> Signed-off-by: Eric Dumazet 
> Cc: Jesse Brandeburg 

Fwiw,
Acked-by: Edward Cree

Здравствуйте,

2021-01-08 Thread camille

Приветствую тебя, мой друг, надеюсь, ты в порядке, пожалуйста, ответь мне
благодаря,

[RFC PATCH net] udp: check sk for UDP GRO fraglist

2021-01-08 Thread Dongseok Yi

It is a workaround patch.

UDP/IP header of UDP GROed frag_skbs are not updated even after NAT
forwarding. Only the header of head_skb from ip_finish_output_gso ->
skb_gso_segment is updated but following frag_skbs are not updated.

A call path skb_mac_gso_segment -> inet_gso_segment ->
udp4_ufo_fragment -> __udp_gso_segment -> __udp_gso_segment_list
does not try to update any UDP/IP header of the segment list.

It might make sense because each skb of frag_skbs is converted to a
list of regular packets. Header update with checksum calculation may
be not needed for UDP GROed frag_skbs.

But UDP GRO frag_list is started from udp_gro_receive, we don't know
whether the skb will be NAT forwarded at that time. For workaround,
try to get sock always when call udp4_gro_receive -> udp_gro_receive
to check if the skb is for local.

I'm still not sure if UDP GRO frag_list is really designed for local
session only. Can kernel support NAT forward for UDP GRO frag_list?
What am I missing?

Fixes: 9fd1ff5d2ac7 (udp: Support UDP fraglist GRO/GSO.)
Signed-off-by: Dongseok Yi 
---
 net/ipv4/udp_offload.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index ff39e94..d476216 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -457,7 +457,7 @@ struct sk_buff *udp_gro_receive(struct list_head *head, 
struct sk_buff *skb,
int flush = 1;
 
NAPI_GRO_CB(skb)->is_flist = 0;
-   if (skb->dev->features & NETIF_F_GRO_FRAGLIST)
+   if (sk && (skb->dev->features & NETIF_F_GRO_FRAGLIST))
NAPI_GRO_CB(skb)->is_flist = sk ? !udp_sk(sk)->gro_enabled: 1;
 
if ((sk && udp_sk(sk)->gro_enabled) || NAPI_GRO_CB(skb)->is_flist) {
@@ -537,8 +537,7 @@ struct sk_buff *udp4_gro_receive(struct list_head *head, 
struct sk_buff *skb)
NAPI_GRO_CB(skb)->is_ipv6 = 0;
rcu_read_lock();
 
-   if (static_branch_unlikely(&udp_encap_needed_key))
-   sk = udp4_gro_lookup_skb(skb, uh->source, uh->dest);
+   sk = udp4_gro_lookup_skb(skb, uh->source, uh->dest);
 
pp = udp_gro_receive(head, skb, uh, sk);
rcu_read_unlock();
-- 
2.7.4

Re: 5.10.4+ hang with 'rmmod nf_conntrack'

2021-01-08 Thread Ben Greear


On 1/7/21 10:16 PM, Florian Westphal wrote:

Ben Greear  wrote:

I noticed my system has a hung process trying to 'rmmod nf_conntrack'.

I've generally been doing the script that calls rmmod forever,
but only extensively tested on 5.4 kernel and earlier.

If anyone has any ideas, please let me know.  This is from 'sysrq t'.  I don't 
see
any hung-task splats in dmesg.


rmmod on conntrack loops forever until the active conntrack object count 
reaches 0.
(plus a walk of the conntrack table to evict/put all entries).


I'll see if it is reproducible and if so will try
with lockdep enabled...


No idea, there was a regression in 5.6, but that was fixed by the time
5.7 was released.

Can't reproduce hangs with a script that injects a few dummy entries
and then removes the module:

added=0

add_and_rmmod()
{
 while [ $added -lt 1000 ]; do
 conntrack -I -s 
$(($RANDOM%256)).$(($RANDOM%256)).$(($RANDOM%256)).$(($RANDOM%255+1)) \
 -d 
$(($RANDOM%256)).$(($RANDOM%256)).$(($RANDOM%256)).$(($RANDOM%255+1)) \
  --protonum 6 --timeout $(((RANDOM%120) + 240)) --state 
ESTABLISHED --sport $RANDOM --dport $RANDOM 2> /dev/null || break

 added=$((added + 1))
 if [ $((added % 1000)) -eq 0 ];then
 echo $added
 fi
 done

 echo rmmod after adding $added entries
 conntrack -C
 rmmod nf_conntrack_netlink
 rmmod nf_conntrack
}

add_and_rmmod

I don't see how it would make a difference, but do you have any special 
conntrack features enabled
at run time, e.g. reliable netlink events? (If you don't know what I mean the 
answer is no).


Not that I know of, but I am using lots of VRF devices, each with their own 
routing table, as well
as some wifi stations and AP netdevs.

I'll let you know if I can reproduce it again..

Thanks,
Ben


--
Ben Greear 
Candela Technologies Inc  http://www.candelatech.com

Re: Flaw in "random32: update the net random state on interrupt and activity"

2021-01-08 Thread Sedat Dilek

On Wed, Aug 12, 2020 at 6:25 PM Eric Dumazet  wrote:

> > Also, I tried the diff for tcp_conn_request...
> > With removing the call to prandom_u32() not useful for
> > prandom_u32/tracing via perf.
>
> I am planning to send the TCP patch once net-next is open. (probably next 
> week)

Ping.

What is the status of this?

- Sedat -

Re: [PATCH iproute2 v2 1/1] build: Fix link errors on some systems

2021-01-08 Thread Petr Machata



Stephen Hemminger  writes:

> On Thu,  7 Jan 2021 09:13:34 +0200
> Roi Dayan  wrote:
>
>> +#define _IS_JSON_CONTEXT(type) ((type & PRINT_JSON || type & PRINT_ANY) && 
>> is_json_context())
>> +#define _IS_FP_CONTEXT(type) (!is_json_context() && (type & PRINT_FP || 
>> type & PRINT_ANY))
>
> You could fold the comparisons? and why are the two options doing comparison 
> in different order?
>
> #define _IS_JSON_CONTEXT(type) (is_json_context() && (type & (PRINT_JSON | 
> PRINT_ANY))
> #define _IS_FP_CONTEXT(type)   (!is_json_context() && (type & (PRINT_FP || 
> PRINT_ANY))

(s/||/|/)

Re: [RFC PATCH net-next] bonding: add a vlan+srcmac tx hashing option

2021-01-08 Thread Jiri Pirko

Fri, Jan 08, 2021 at 12:58:13AM CET, ja...@redhat.com wrote:
>On Mon, Dec 28, 2020 at 11:11:45AM +0100, Jiri Pirko wrote:
>> Fri, Dec 18, 2020 at 08:30:33PM CET, ja...@redhat.com wrote:
>> >This comes from an end-user request, where they're running multiple VMs on
>> >hosts with bonded interfaces connected to some interest switch topologies,
>> >where 802.3ad isn't an option. They're currently running a proprietary
>> >solution that effectively achieves load-balancing of VMs and bandwidth
>> >utilization improvements with a similar form of transmission algorithm.
>> >
>> >Basically, each VM has it's own vlan, so it always sends its traffic out
>> >the same interface, unless that interface fails. Traffic gets split
>> >between the interfaces, maintaining a consistent path, with failover still
>> >available if an interface goes down.
>> >
>> >This has been rudimetarily tested to provide similar results, suitable for
>> >them to use to move off their current proprietary solution.
>> >
>> >Still on the TODO list, if these even looks sane to begin with, is
>> >fleshing out Documentation/networking/bonding.rst.
>> 
>> Jarod, did you consider using team driver instead ? :)
>
>That's actually one of the things that was suggested, since team I believe
>already has support for this, but the user really wants to use bonding.
>We're finding that a lot of users really still prefer bonding over team.

Do you know the reason, other than "nostalgia"?

[PATCH net-next] devlink: fix return of uninitialized variable err

2021-01-08 Thread wangyunjian

From: Yunjian Wang 

There is a potential execution path in which variable err is
returned without being properly initialized previously. Fix
this by initializing variable err to 0.

Addresses-Coverity: ("Uninitialized scalar variable")
Fixes: 1db64e8733f6 ("devlink: Add devlink formatted message (fmsg) API")
Signed-off-by: Yunjian Wang 
---
 net/core/devlink.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/core/devlink.c b/net/core/devlink.c
index ee828e4b1007..470215cd60b5 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -5699,7 +5699,7 @@ devlink_fmsg_prepare_skb(struct devlink_fmsg *fmsg, 
struct sk_buff *skb,
struct devlink_fmsg_item *item;
struct nlattr *fmsg_nlattr;
int i = 0;
-   int err;
+   int err = 0;
 
fmsg_nlattr = nla_nest_start_noflag(skb, DEVLINK_ATTR_FMSG);
if (!fmsg_nlattr)
-- 
2.23.0

Re: [RFC v2 06/13] vduse: Introduce VDUSE - vDPA Device in Userspace

2021-01-08 Thread Bob Liu

On 12/22/20 10:52 PM, Xie Yongji wrote:
> This VDUSE driver enables implementing vDPA devices in userspace.
> Both control path and data path of vDPA devices will be able to
> be handled in userspace.
> 
> In the control path, the VDUSE driver will make use of message
> mechnism to forward the config operation from vdpa bus driver
> to userspace. Userspace can use read()/write() to receive/reply
> those control messages.
> 
> In the data path, the VDUSE driver implements a MMU-based on-chip
> IOMMU driver which supports mapping the kernel dma buffer to a
> userspace iova region dynamically. Userspace can access those
> iova region via mmap(). Besides, the eventfd mechanism is used to
> trigger interrupt callbacks and receive virtqueue kicks in userspace
> 
> Now we only support virtio-vdpa bus driver with this patch applied.
> 
> Signed-off-by: Xie Yongji 
> ---
>  Documentation/driver-api/vduse.rst |   74 ++
>  Documentation/userspace-api/ioctl/ioctl-number.rst |1 +
>  drivers/vdpa/Kconfig   |8 +
>  drivers/vdpa/Makefile  |1 +
>  drivers/vdpa/vdpa_user/Makefile|5 +
>  drivers/vdpa/vdpa_user/eventfd.c   |  221 
>  drivers/vdpa/vdpa_user/eventfd.h   |   48 +
>  drivers/vdpa/vdpa_user/iova_domain.c   |  442 
>  drivers/vdpa/vdpa_user/iova_domain.h   |   93 ++
>  drivers/vdpa/vdpa_user/vduse.h |   59 ++
>  drivers/vdpa/vdpa_user/vduse_dev.c | 1121 
> 
>  include/uapi/linux/vdpa.h  |1 +
>  include/uapi/linux/vduse.h |   99 ++
>  13 files changed, 2173 insertions(+)
>  create mode 100644 Documentation/driver-api/vduse.rst
>  create mode 100644 drivers/vdpa/vdpa_user/Makefile
>  create mode 100644 drivers/vdpa/vdpa_user/eventfd.c
>  create mode 100644 drivers/vdpa/vdpa_user/eventfd.h
>  create mode 100644 drivers/vdpa/vdpa_user/iova_domain.c
>  create mode 100644 drivers/vdpa/vdpa_user/iova_domain.h
>  create mode 100644 drivers/vdpa/vdpa_user/vduse.h
>  create mode 100644 drivers/vdpa/vdpa_user/vduse_dev.c
>  create mode 100644 include/uapi/linux/vduse.h
> 
> diff --git a/Documentation/driver-api/vduse.rst 
> b/Documentation/driver-api/vduse.rst
> new file mode 100644
> index ..da9b3040f20a
> --- /dev/null
> +++ b/Documentation/driver-api/vduse.rst
> @@ -0,0 +1,74 @@
> +==
> +VDUSE - "vDPA Device in Userspace"
> +==
> +
> +vDPA (virtio data path acceleration) device is a device that uses a
> +datapath which complies with the virtio specifications with vendor
> +specific control path. vDPA devices can be both physically located on
> +the hardware or emulated by software. VDUSE is a framework that makes it
> +possible to implement software-emulated vDPA devices in userspace.
> +

Could you explain a bit more why need a VDUSE framework?
Software emulated vDPA devices is more likely used by debugging only when
don't have real hardware.
Do you think do the emulation in kernel space is not enough?

Thanks,
Bob

> +How VDUSE works
> +
> +Each userspace vDPA device is created by the VDUSE_CREATE_DEV ioctl on
> +the VDUSE character device (/dev/vduse). Then a file descriptor pointing
> +to the new resources will be returned, which can be used to implement the
> +userspace vDPA device's control path and data path.
> +
> +To implement control path, the read/write operations to the file descriptor
> +will be used to receive/reply the control messages from/to VDUSE driver.
> +Those control messages are based on the vdpa_config_ops which defines a
> +unified interface to control different types of vDPA device.
> +

Re: [RFC PATCH net] udp: check sk for UDP GRO fraglist

2021-01-08 Thread Steffen Klassert

On Fri, Jan 08, 2021 at 09:52:28PM +0900, Dongseok Yi wrote:
> It is a workaround patch.
> 
> UDP/IP header of UDP GROed frag_skbs are not updated even after NAT
> forwarding. Only the header of head_skb from ip_finish_output_gso ->
> skb_gso_segment is updated but following frag_skbs are not updated.
> 
> A call path skb_mac_gso_segment -> inet_gso_segment ->
> udp4_ufo_fragment -> __udp_gso_segment -> __udp_gso_segment_list
> does not try to update any UDP/IP header of the segment list.
> 
> It might make sense because each skb of frag_skbs is converted to a
> list of regular packets. Header update with checksum calculation may
> be not needed for UDP GROed frag_skbs.
> 
> But UDP GRO frag_list is started from udp_gro_receive, we don't know
> whether the skb will be NAT forwarded at that time. For workaround,
> try to get sock always when call udp4_gro_receive -> udp_gro_receive
> to check if the skb is for local.
> 
> I'm still not sure if UDP GRO frag_list is really designed for local
> session only. Can kernel support NAT forward for UDP GRO frag_list?
> What am I missing?

The initial idea when I implemented this was to have a fast
forwarding path for UDP. So forwarding is a usecase, but NAT
is a problem, indeed. A quick fix could be to segment the
skb before it gets NAT forwarded. Alternatively we could
check for a header change in __udp_gso_segment_list and
update the header of the frag_skbs accordingly in that case.

Re: MDIO over I2C driver driver probe dependency issue

2021-01-08 Thread Andrew Lunn

On Thu, Jan 07, 2021 at 07:05:21PM -0800, Florian Fainelli wrote:
> 
> 
> On 1/7/2021 6:22 PM, Brian Silverman wrote:
> > I've written a very small generic MDIO driver that uses the existing
> > mdio-i2c.c library in drivers/net/phy.  The driver allows
> > communication to the PHY's MDIO interface as using I2C, as supported by
> > PHYs like the BCM54616S.  This is working on my hardware.  
> > 
> > The one issue I have is that I2C is not up and available (i.e. probed)
> > at the time that the MDIO interface comes up.  To fix this, I've changed
> > the device order in drivers/Makefile to put "obj-y += i2c/"
> > before "obj-y += net/".
> > 
> > While that works, I prefer not to have to keep that difference from
> > mainline Linux.  Also, I don't understand why i2c drivers occur
> > arbitrarily late in the Makefile - surely there are other devices
> > drivers that need i2c to be enabled when they are probed?
> > 
> > Is there a way to do this that doesn't change probe order?  Or is there
> > a way to change probe order without patching mainline Linux?
> 
> Linux supports probe deferral so when a consumer of a resource finds
> that said resource's provider is not available, it should return
> -EPROBE_DEFER which puts the driver's probe routine onto a list of
> driver's probe function to retry at a later time.
> 
> In your case the GEM Ethernet driver should get an -EPROBE_DEFER while
> the Ethernet PHY device tree node is looked up via
> phylink_of_phy_connect() because the mdio-i2c-gen i2c client has not had
> a chance to register the MDIO bus yet. Have you figured out the call
> path that does not work for you?

Just adding to this. The way the current mdio-i2c code is used, we
have already talked to the SFP to get its EEPROM contents. So we know
I2C works at the time the mdio-i2c bus is instantiated.

You are using the code slightly differently which might be why the
code is not correctly handling EPROBE_DEFER where it should. Patches
to add this are likely to be accepted.

   Andrew

Re: [PATCH] bus: mhi: Add inbound buffers allocation flag

2021-01-08 Thread Manivannan Sadhasivam

On Wed, Jan 06, 2021 at 02:43:43PM +0100, Loic Poulain wrote:
> Currently, the MHI controller driver defines which channels should
> have their inbound buffers allocated and queued. But ideally, this is
> something that should be decided by the MHI device driver instead,

We call them, "MHI client drivers"

> which actually deals with that buffers.
> 
> Add a flag parameter to mhi_prepare_for_transfer allowing to specify
> if buffers have to be allocated and queued by the MHI stack.
> 
> Keep auto_queue flag for now, but should be removed at some point.
> 
> Signed-off-by: Loic Poulain 
> ---
>  drivers/bus/mhi/core/internal.h |  2 +-
>  drivers/bus/mhi/core/main.c | 11 ---
>  drivers/net/mhi_net.c   |  2 +-
>  include/linux/mhi.h | 12 +++-
>  net/qrtr/mhi.c  |  2 +-
>  5 files changed, 22 insertions(+), 7 deletions(-)
> 

[...]

> diff --git a/drivers/net/mhi_net.c b/drivers/net/mhi_net.c
> index fa41d8c..b7f7f2e 100644
> --- a/drivers/net/mhi_net.c
> +++ b/drivers/net/mhi_net.c
> @@ -265,7 +265,7 @@ static int mhi_net_probe(struct mhi_device *mhi_dev,
>   u64_stats_init(&mhi_netdev->stats.tx_syncp);
>  
>   /* Start MHI channels */
> - err = mhi_prepare_for_transfer(mhi_dev);
> + err = mhi_prepare_for_transfer(mhi_dev, 0);

Eventhough I'd like Hemant to comment on this patch, AFAIU this looks to
me a controller dependent behaviour. The controller should have the
information whether a particular channel can auto queue or not then the
client driver can be agnostic.

>   if (err)
>   goto out_err;
>  
> diff --git a/include/linux/mhi.h b/include/linux/mhi.h
> index 209b335..6723339 100644
> --- a/include/linux/mhi.h
> +++ b/include/linux/mhi.h
> @@ -60,6 +60,14 @@ enum mhi_flags {
>  };
>  
>  /**
> + * enum mhi_chan_flags - MHI channel flags
> + * @MHI_CH_INBOUND_ALLOC_BUFS: Automatically allocate and queue inbound 
> buffers
> + */
> +enum mhi_chan_flags {
> + MHI_CH_INBOUND_ALLOC_BUFS = BIT(0),
> +};
> +
> +/**
>   * enum mhi_device_type - Device types
>   * @MHI_DEVICE_XFER: Handles data transfer
>   * @MHI_DEVICE_CONTROLLER: Control device
> @@ -705,8 +713,10 @@ void mhi_device_put(struct mhi_device *mhi_dev);
>  /**
>   * mhi_prepare_for_transfer - Setup channel for data transfer
>   * @mhi_dev: Device associated with the channels
> + * @flags: MHI channel flags
>   */
> -int mhi_prepare_for_transfer(struct mhi_device *mhi_dev);
> +int mhi_prepare_for_transfer(struct mhi_device *mhi_dev,
> +  enum mhi_chan_flags flags);
>  
>  /**
>   * mhi_unprepare_from_transfer - Unprepare the channels
> diff --git a/net/qrtr/mhi.c b/net/qrtr/mhi.c
> index 2bf2b19..47afded 100644
> --- a/net/qrtr/mhi.c
> +++ b/net/qrtr/mhi.c
> @@ -77,7 +77,7 @@ static int qcom_mhi_qrtr_probe(struct mhi_device *mhi_dev,
>   int rc;
>  
>   /* start channels */
> - rc = mhi_prepare_for_transfer(mhi_dev);
> + rc = mhi_prepare_for_transfer(mhi_dev, MHI_CH_INBOUND_ALLOC_BUFS);

Are you sure it requires auto queued channel?

Thanks,
Mani

>   if (rc)
>   return rc;
>  
> -- 
> 2.7.4
>

Re: [net-next PATCH v13 2/4] net: phy: Add 5GBASER interface mode

2021-01-08 Thread Andrew Lunn

On Fri, Jan 08, 2021 at 07:49:28PM +1000, Pavana Sharma wrote:
> Add 5GBASE-R phy interface mode
> 
> Signed-off-by: Pavana Sharma 

Reviewed-by: Andrew Lunn 

Andrew

Re: [net-next PATCH v13 4/4] net: dsa: mv88e6xxx: Add support for mv88e6393x family of Marvell

2021-01-08 Thread Marek Behún

Pavana, some last nitpicks, sorry I didn't check this before.

I will send another patch to you which shall fix all this things.

On Fri,  8 Jan 2021 19:50:56 +1000
Pavana Sharma  wrote:

> +/* Support 10, 100, 200, 1000, 2500, 5000, 1 Mbps (e.g. 88E6393X)
> + * This function adds new speed 5000 supported by Amethyst family.
> + * Function mv88e6xxx_port_set_speed_duplex() can't be used as the register
> + * values for speeds 2500 & 5000 conflict.
> + */
> +
> +int mv88e6393x_port_set_speed_duplex(struct mv88e6xxx_chip *chip, int port,
> + int speed, int duplex)

There are basically 2 empty lines between the comment and the function
signature, one containting only comment end token " */" and the other
truly empty. I think you can remove the empty line, since the comment
is already visibly separated from the function body.

I think you can also remove the second line of the comment:
  "This function adds new speed 5000 supported by Amethyst family."
The alignment of the speed argument is wrong. It should start at the
same column as the first argument.

So:

/* Support 10, 100, 200, 1000, 2500, 5000, 1 Mbps (e.g. 88E6393X)
 * Function mv88e6xxx_port_set_speed_duplex() can't be used as the register
 * values for speeds 2500 & 5000 conflict.
 */
int mv88e6393x_port_set_speed_duplex(struct mv88e6xxx_chip *chip, int port,
 int speed, int duplex)

> + reg &= ~(MV88E6XXX_PORT_MAC_CTL_SPEED_MASK |
> + MV88E6390_PORT_MAC_CTL_ALTSPEED |
> + MV88E6390_PORT_MAC_CTL_FORCE_SPEED);

alignment:

reg &= ~(MV88E6XXX_PORT_MAC_CTL_SPEED_MASK |
 MV88E6390_PORT_MAC_CTL_ALTSPEED |
 MV88E6390_PORT_MAC_CTL_FORCE_SPEED);

> +static int mv88e6393x_port_policy_write(struct mv88e6xxx_chip *chip, u16 
> pointer,
> + u8 data)
> +{
> +
> + int err = 0;

Unneeded empty line before int err = 0;
Also alignment of the data argument:

static int mv88e6393x_port_policy_write(struct mv88e6xxx_chip *chip, u16 
pointer,
u8 data)

(It seems to me that you only align with tabs. If you need to align for
 example to 20th column, use 2 tabs and 4 spaces (2*8 + 4 = 20).)

> +int mv88e6393x_port_set_ether_type(struct mv88e6xxx_chip *chip, int port,
> + u16 etype)

Alignment:
int mv88e6393x_port_set_ether_type(struct mv88e6xxx_chip *chip, int port,
   u16 etype)

> +#define MV88E6XXX_PORT_STS_CMODE_5GBASER 0x000c
> +#define MV88E6XXX_PORT_STS_CMODE_10GBASER0x000d
> +#define MV88E6XXX_PORT_STS_CMODE_USXGMII 0x000e

These macros should have prefix
  MV88E6393X_
instead of
  MV88E6XXX_
since these values apply for 6393X family and they conflict with the
values for other switches.

> +int mv88e6xxx_port_wait_bit(struct mv88e6xxx_chip *chip, int port, int reg,
> + int bit, int val);

Alignment.

> +int mv88e6393x_port_set_speed_duplex(struct mv88e6xxx_chip *chip, int port,
> + int speed, int duplex);

Alignment.

> +int mv88e6393x_port_set_ether_type(struct mv88e6xxx_chip *chip, int port,
> + u16 etype);

Alignment.

> +/* Only Ports 0, 9 and 10 have SERDES lanes. Return the SERDES lane address
> + * a port is using else Returns -ENODEV.
> + */
> +int mv88e6393x_serdes_get_lane(struct mv88e6xxx_chip *chip, int port)
> +{
> + u8 cmode = chip->ports[port].cmode;
> + int lane = -ENODEV;
> +
> + if (port != 0 && port != 9 && port != 10)
> + return -EOPNOTSUPP;
> +
> + if (cmode == MV88E6XXX_PORT_STS_CMODE_1000BASEX ||
> + cmode == MV88E6XXX_PORT_STS_CMODE_SGMII ||
> + cmode == MV88E6XXX_PORT_STS_CMODE_2500BASEX ||
> + cmode == MV88E6XXX_PORT_STS_CMODE_5GBASER ||
> + cmode == MV88E6XXX_PORT_STS_CMODE_10GBASER ||
> + cmode == MV88E6XXX_PORT_STS_CMODE_USXGMII)

Alignment.

> + lane = port;
> + return lane;
> +}

> +int mv88e6393x_serdes_irq_enable(struct mv88e6xxx_chip *chip, int port,
> + int lane, bool enable)

Alignment.

> +irqreturn_t mv88e6393x_serdes_irq_status(struct mv88e6xxx_chip *chip, int 
> port,
> + int lane)

Alignment.

> +static int mv88e6393x_serdes_port_config(struct mv88e6xxx_chip *chip, int 
> lane,
> + bool on)

Alignment.

> + mv88e6390_serdes_read(chip, lane, MDIO_MMD_PHYXS,
> + MV88E6393X_SERDES_POC, &config);
> + config &= ~(MV88E6393X_SERDES_POC_PCS_MODE_MASK |
> + MV88E6393X_SERDES_POC_PDOWN);
> + config |= pcs;
> + mv88e6390_serdes_write(chip, lane, MDIO_MMD_PHYXS,
> + MV88E6393X_SERDES_POC, config);
> + config |= MV88E6393X_SERDES_POC_

Re: [net-next PATCH v13 1/4] dt-bindings: net: Add 5GBASER phy interface

2021-01-08 Thread Andrew Lunn

On Fri, Jan 08, 2021 at 07:48:59PM +1000, Pavana Sharma wrote:
> Add 5gbase-r PHY interface mode.
> 
> Signed-off-by: Pavana Sharma 
> Acked-by: Rob Herring 

Reviewed-by: Andrew Lunn 

Andrew

[net-next PATCH v13 4/4] net: dsa: mv88e6xxx: Add support for mv88e6393x family of Marvell

2021-01-08 Thread Pavana Sharma

The Marvell 88E6393X device is a single-chip integration of a 11-port
Ethernet switch with eight integrated Gigabit Ethernet (GbE) transceivers
and three 10-Gigabit interfaces.

This patch adds functionalities specific to mv88e6393x family (88E6393X,
88E6193X and 88E6191X)

Co-developed-by: Marek Behún 
Signed-off-by: Marek Behún 
Co-developed-by: Ashkan Boldaji 
Signed-off-by: Ashkan Boldaji 
Signed-off-by: Pavana Sharma 
---
Changes in v2:
  - Fix a warning (Reported-by: kernel test robot )
Changes in v3:
  - Fix 'unused function' warning
Changes in v4-v9:
  - Incorporated feedback from maintainers.
Changes in v10:
  - Fix ISO C90 forbids mixing declarations and code warning
Changes in v11:
  - Add comment for clarity, regarding configuring speed 5000 (supported
by mv88e6393x family)
Changes in v12:
  - Rebase to net-next
  - Remove 5GBASE-R comments from patch 1 & 2 of this patchset
  - Make function name to the convention
Changes in v13:
  - Rebase to latest.
  - Pick-up Marek's patch to fix SERDES IRQ enablement and
status reading for 10G on 6393x.
---
 drivers/net/dsa/mv88e6xxx/chip.c| 136 
 drivers/net/dsa/mv88e6xxx/chip.h|   4 +
 drivers/net/dsa/mv88e6xxx/global1.h |   2 +
 drivers/net/dsa/mv88e6xxx/global2.h |   8 +
 drivers/net/dsa/mv88e6xxx/port.c| 230 
 drivers/net/dsa/mv88e6xxx/port.h|  43 +++-
 drivers/net/dsa/mv88e6xxx/serdes.c  | 319 
 drivers/net/dsa/mv88e6xxx/serdes.h  |  44 
 8 files changed, 784 insertions(+), 2 deletions(-)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index 9bddd70449c6..ab929d9d93f3 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -635,6 +635,24 @@ static void mv88e6390x_phylink_validate(struct 
mv88e6xxx_chip *chip, int port,
mv88e6390_phylink_validate(chip, port, mask, state);
 }
 
+static void mv88e6393x_phylink_validate(struct mv88e6xxx_chip *chip, int port,
+   unsigned long *mask,
+   struct phylink_link_state *state)
+{
+   if (port == 0 || port == 9 || port == 10) {
+   phylink_set(mask, 1baseT_Full);
+   phylink_set(mask, 1baseKR_Full);
+   phylink_set(mask, 5000baseT_Full);
+   phylink_set(mask, 2500baseX_Full);
+   phylink_set(mask, 2500baseT_Full);
+   }
+
+   phylink_set(mask, 1000baseT_Full);
+   phylink_set(mask, 1000baseX_Full);
+
+   mv88e6065_phylink_validate(chip, port, mask, state);
+}
+
 static void mv88e6xxx_validate(struct dsa_switch *ds, int port,
   unsigned long *supported,
   struct phylink_link_state *state)
@@ -3944,6 +3962,55 @@ static const struct mv88e6xxx_ops mv88e6191_ops = {
.phylink_validate = mv88e6390_phylink_validate,
 };
 
+static const struct mv88e6xxx_ops mv88e6393x_ops = {
+   /* MV88E6XXX_FAMILY_6393 */
+   .setup_errata = mv88e6393x_serdes_setup_errata,
+   .irl_init_all = mv88e6390_g2_irl_init_all,
+   .get_eeprom = mv88e6xxx_g2_get_eeprom8,
+   .set_eeprom = mv88e6xxx_g2_set_eeprom8,
+   .set_switch_mac = mv88e6xxx_g2_set_switch_mac,
+   .phy_read = mv88e6xxx_g2_smi_phy_read,
+   .phy_write = mv88e6xxx_g2_smi_phy_write,
+   .port_set_link = mv88e6xxx_port_set_link,
+   .port_set_speed_duplex = mv88e6393x_port_set_speed_duplex,
+   .port_set_rgmii_delay = mv88e6390_port_set_rgmii_delay,
+   .port_tag_remap = mv88e6390_port_tag_remap,
+   .port_set_frame_mode = mv88e6351_port_set_frame_mode,
+   .port_set_egress_floods = mv88e6352_port_set_egress_floods,
+   .port_set_ether_type = mv88e6393x_port_set_ether_type,
+   .port_set_jumbo_size = mv88e6165_port_set_jumbo_size,
+   .port_egress_rate_limiting = mv88e6097_port_egress_rate_limiting,
+   .port_pause_limit = mv88e6390_port_pause_limit,
+   .port_set_cmode = mv88e6393x_port_set_cmode,
+   .port_disable_learn_limit = mv88e6xxx_port_disable_learn_limit,
+   .port_disable_pri_override = mv88e6xxx_port_disable_pri_override,
+   .port_get_cmode = mv88e6352_port_get_cmode,
+   .stats_snapshot = mv88e6390_g1_stats_snapshot,
+   .stats_set_histogram = mv88e6390_g1_stats_set_histogram,
+   .stats_get_sset_count = mv88e6320_stats_get_sset_count,
+   .stats_get_strings = mv88e6320_stats_get_strings,
+   .stats_get_stats = mv88e6390_stats_get_stats,
+   .set_cpu_port = mv88e6393x_port_set_cpu_dest,
+   .set_egress_port = mv88e6393x_set_egress_port,
+   .watchdog_ops = &mv88e6390_watchdog_ops,
+   .mgmt_rsvd2cpu = mv88e6393x_port_mgmt_rsvd2cpu,
+   .pot_clear = mv88e6xxx_g2_pot_clear,
+   .reset = mv88e6352_g1_reset,
+   .rmu_disable = mv88e6390_g1_rmu_disable,
+   .vtu_getnext = mv88e6390_g1_vtu_getnext,
+   .vtu_loadpurge = mv88e6390_g1_vtu_loadpurg

Re: Flaw in "random32: update the net random state on interrupt and activity"

2021-01-08 Thread Sedat Dilek

On Fri, Jan 8, 2021 at 2:08 PM Sedat Dilek  wrote:
>
> On Wed, Aug 12, 2020 at 6:25 PM Eric Dumazet  wrote:
>
> > > Also, I tried the diff for tcp_conn_request...
> > > With removing the call to prandom_u32() not useful for
> > > prandom_u32/tracing via perf.
> >
> > I am planning to send the TCP patch once net-next is open. (probably next 
> > week)
>
> Ping.
>
> What is the status of this?
>

I am attaching the updated diff against latest Linus Git.

- Sedat -
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index c7e16b0ed791..95ed49de4635 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6852,10 +6852,12 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 		isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
 		if (!tmp_opt.tstamp_ok)
 			inet_rsk(req)->ecn_ok = 0;
+		tcp_rsk(req)->txhash = skb->hash ?: 1;
+	} else {
+		tcp_rsk(req)->txhash = net_tx_rndhash();
 	}
 
 	tcp_rsk(req)->snt_isn = isn;
-	tcp_rsk(req)->txhash = net_tx_rndhash();
 	tcp_rsk(req)->syn_tos = TCP_SKB_CB(skb)->ip_dsfield;
 	tcp_openreq_init_rwin(req, sk, dst);
 	sk_rx_queue_set(req_to_sk(req), skb);

Re: [net-next PATCH v13 4/4] net: dsa: mv88e6xxx: Add support for mv88e6393x family of Marvell

2021-01-08 Thread Marek Behún

On Fri,  8 Jan 2021 19:50:56 +1000
Pavana Sharma  wrote:

> +int mv88e6393x_serdes_power(struct mv88e6xxx_chip *chip, int port, int lane,
> + bool on)
> +{
> + u8 cmode;
> +
> + if (port != 0 && port != 9 && port != 10)
> + return -EOPNOTSUPP;
> +
> + cmode = chip->ports[port].cmode;
> +
> + mv88e6393x_serdes_port_config(chip, lane, on);
> +
> + switch (cmode) {
> + case MV88E6XXX_PORT_STS_CMODE_1000BASEX:
> + case MV88E6XXX_PORT_STS_CMODE_2500BASEX:
> + return mv88e6390_serdes_power_sgmii(chip, lane, on);
> + case MV88E6XXX_PORT_STS_CMODE_10GBASER:
> + return mv88e6390_serdes_power_10g(chip, lane, on);

Shoudln't mv88e6390_serdes_power_10g() be called even for 5GBASER ?
Have you tested 5GBASER at all?

Marek

Re: [PATCH 1/4] dt-bindings: net: renesas,etheravb: Add additional clocks

2021-01-08 Thread Geert Uytterhoeven

On Mon, Dec 28, 2020 at 10:32 PM Adam Ford  wrote:
> The AVB driver assumes there is an external clock, but it could
> be driven by an external clock.  In order to enable a programmable
> clock, it needs to be added to the clocks list and enabled in the
> driver.  Since there currently only one clock, there is no
> clock-names list either.
>
> Update bindings to add the additional optional clock, and explicitly
> name both of them.
>
> Signed-off-by: Adam Ford 

Reviewed-by: Geert Uytterhoeven 

Gr{oetje,eeting}s,

Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- ge...@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
-- Linus Torvalds

Re: [PATCH 2/4] ARM: dts: renesas: Add fck to etheravb-rcar-gen2 clock-names list

2021-01-08 Thread Geert Uytterhoeven

On Mon, Dec 28, 2020 at 10:32 PM Adam Ford  wrote:
> The bindings have been updated to support two clocks, but the
> original clock now requires the name fck.  Add a clock-names
> list in the device tree with fck in it.
>
> Signed-off-by: Adam Ford 

Reviewed-by: Geert Uytterhoeven 
i.e. will queue in renesas-devel once PATCH 1/4 has been accepted.

Gr{oetje,eeting}s,

Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- ge...@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
-- Linus Torvalds

Re: [PATCH 4/4] net: ethernet: ravb: Name the AVB functional clock fck

2021-01-08 Thread Geert Uytterhoeven

Hi Adam,

On Tue, Jan 5, 2021 at 1:53 PM Adam Ford  wrote:
> On Mon, Jan 4, 2021 at 4:41 AM Geert Uytterhoeven  
> wrote:
> > On Mon, Dec 28, 2020 at 10:32 PM Adam Ford  wrote:
> > > The bindings have been updated to support two clocks, but the
> > > original clock now requires the name fck to distinguish it
> > > from the other.
> > >
> > > Signed-off-by: Adam Ford 
> >
> > Thanks for your patch!
> >
> > > --- a/drivers/net/ethernet/renesas/ravb_main.c
> > > +++ b/drivers/net/ethernet/renesas/ravb_main.c
> > > @@ -2142,7 +2142,7 @@ static int ravb_probe(struct platform_device *pdev)
> > >
> > > priv->chip_id = chip_id;
> > >
> > > -   priv->clk = devm_clk_get(&pdev->dev, NULL);
> > > +   priv->clk = devm_clk_get(&pdev->dev, "fck");
> >
> > This change is not backwards compatible, as existing DTB files do not
> > have the "fck" clock.  So the driver has to keep on assuming the first
> > clock is the functional clock, and this patch is thus not needed nor
> > desired.
>
> Should I post a V2 with this removed, or can this patch just be excluded?

As far as I am concerned, it can just be excluded.
Patches 1 and 2+3 have to follow different maintainer paths anyway.

Gr{oetje,eeting}s,

Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- ge...@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
-- Linus Torvalds

Re: [PATCH 1/4] dt-bindings: net: renesas,etheravb: Add additional clocks

2021-01-08 Thread Geert Uytterhoeven

Hi Adam,

On Fri, Jan 8, 2021 at 3:11 PM Geert Uytterhoeven  wrote:
> On Mon, Dec 28, 2020 at 10:32 PM Adam Ford  wrote:
> > The AVB driver assumes there is an external clock, but it could
> > be driven by an external clock.  In order to enable a programmable
> > clock, it needs to be added to the clocks list and enabled in the
> > driver.  Since there currently only one clock, there is no
> > clock-names list either.
> >
> > Update bindings to add the additional optional clock, and explicitly
> > name both of them.
> >
> > Signed-off-by: Adam Ford 
>
> Reviewed-by: Geert Uytterhoeven 

Upon second look...

>> --- a/Documentation/devicetree/bindings/net/renesas,etheravb.yaml
>> +++ b/Documentation/devicetree/bindings/net/renesas,etheravb.yaml
>> @@ -49,7 +49,16 @@ properties:
>>interrupt-names: true
>>
>>clocks:
>> -maxItems: 1
>> +minItems: 1
>> +maxItems: 2
>> +items:
>> +  - description: AVB functional clock
>> +  - description: Optional TXC reference clock
>> +
>> +  clock-names:
>> +items:
>> +  - const: fck
>> +  - const: txc_refclk

On R-Car Gen3 and RZ/G2, RGMII is used, and this clock is called
"txcrefclk", i.e. without underscore.
On R-Car Gen2, GMII is used, and this clock is called "gtxrefclk".

Perhaps it should just be called "refclk"?

Gr{oetje,eeting}s,

Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- ge...@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
-- Linus Torvalds

Re: [PATCH 3/4] arm64: dts: renesas: Add fck to etheravb-rcar-gen3 clock-names list

2021-01-08 Thread Geert Uytterhoeven

On Mon, Dec 28, 2020 at 10:32 PM Adam Ford  wrote:
> The bindings have been updated to support two clocks, but the
> original clock now requires the name fck.  Add a clock-names
> list in the device tree with fck in it.
>
> Signed-off-by: Adam Ford 

Reviewed-by: Geert Uytterhoeven 
i.e. will queue in renesas-devel once PATCH 1/4 has been accepted.

Gr{oetje,eeting}s,

Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- ge...@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
-- Linus Torvalds

[PATCH] changes for Pavana

2021-01-08 Thread Marek Behún

- some nitpicks are fixed here, mostly alignemnts
- also mv88e6393x_serdes_power() now enables 10g PHY not only for
  10gbase-r, but also for 5gbase-r mode

Pavana, you can apply this patch as previously:
  cd linux
  patch -p1 
---
 drivers/net/dsa/mv88e6xxx/port.c   | 24 +++-
 drivers/net/dsa/mv88e6xxx/port.h   | 12 
 drivers/net/dsa/mv88e6xxx/serdes.c | 46 --
 drivers/net/dsa/mv88e6xxx/serdes.h | 18 ++--
 4 files changed, 50 insertions(+), 50 deletions(-)

diff --git a/drivers/net/dsa/mv88e6xxx/port.c b/drivers/net/dsa/mv88e6xxx/port.c
index c38fcb8163ce..7025c1e83beb 100644
--- a/drivers/net/dsa/mv88e6xxx/port.c
+++ b/drivers/net/dsa/mv88e6xxx/port.c
@@ -436,13 +436,11 @@ phy_interface_t mv88e6390x_port_max_speed_mode(int port)
 }
 
 /* Support 10, 100, 200, 1000, 2500, 5000, 1 Mbps (e.g. 88E6393X)
- * This function adds new speed 5000 supported by Amethyst family.
  * Function mv88e6xxx_port_set_speed_duplex() can't be used as the register
  * values for speeds 2500 & 5000 conflict.
  */
-
 int mv88e6393x_port_set_speed_duplex(struct mv88e6xxx_chip *chip, int port,
-   int speed, int duplex)
+int speed, int duplex)
 {
u16 reg, ctrl;
int err;
@@ -506,8 +504,8 @@ int mv88e6393x_port_set_speed_duplex(struct mv88e6xxx_chip 
*chip, int port,
return err;
 
reg &= ~(MV88E6XXX_PORT_MAC_CTL_SPEED_MASK |
-   MV88E6390_PORT_MAC_CTL_ALTSPEED |
-   MV88E6390_PORT_MAC_CTL_FORCE_SPEED);
+MV88E6390_PORT_MAC_CTL_ALTSPEED |
+MV88E6390_PORT_MAC_CTL_FORCE_SPEED);
 
if (speed != SPEED_UNFORCED)
reg |= MV88E6390_PORT_MAC_CTL_FORCE_SPEED;
@@ -543,7 +541,7 @@ static int mv88e6xxx_port_set_cmode(struct mv88e6xxx_chip 
*chip, int port,
cmode = MV88E6XXX_PORT_STS_CMODE_2500BASEX;
break;
case PHY_INTERFACE_MODE_5GBASER:
-   cmode = MV88E6XXX_PORT_STS_CMODE_5GBASER;
+   cmode = MV88E6393X_PORT_STS_CMODE_5GBASER;
break;
case PHY_INTERFACE_MODE_XGMII:
case PHY_INTERFACE_MODE_XAUI:
@@ -554,10 +552,10 @@ static int mv88e6xxx_port_set_cmode(struct mv88e6xxx_chip 
*chip, int port,
break;
case PHY_INTERFACE_MODE_10GBASER:
case PHY_INTERFACE_MODE_10GKR:
-   cmode = MV88E6XXX_PORT_STS_CMODE_10GBASER;
+   cmode = MV88E6393X_PORT_STS_CMODE_10GBASER;
break;
case PHY_INTERFACE_MODE_USXGMII:
-   cmode = MV88E6XXX_PORT_STS_CMODE_USXGMII;
+   cmode = MV88E6393X_PORT_STS_CMODE_USXGMII;
break;
default:
cmode = 0;
@@ -1278,9 +1276,8 @@ int mv88e6xxx_port_disable_pri_override(struct 
mv88e6xxx_chip *chip, int port)
 /* Offset 0x0E: Policy & MGMT Control Register for FAMILY 6191X 6193X 6393X */
 
 static int mv88e6393x_port_policy_write(struct mv88e6xxx_chip *chip, u16 
pointer,
-   u8 data)
+   u8 data)
 {
-
int err = 0;
int port;
u16 reg;
@@ -1299,8 +1296,8 @@ static int mv88e6393x_port_policy_write(struct 
mv88e6xxx_chip *chip, u16 pointer
 }
 
 int mv88e6393x_set_egress_port(struct mv88e6xxx_chip *chip,
-   enum mv88e6xxx_egress_direction direction,
-   int port)
+  enum mv88e6xxx_egress_direction direction,
+  int port)
 {
u16 ptr;
int err;
@@ -1319,6 +1316,7 @@ int mv88e6393x_set_egress_port(struct mv88e6xxx_chip 
*chip,
return err;
break;
}
+
return 0;
 }
 
@@ -1374,7 +1372,7 @@ static int mv88e6393x_port_epc_wait_ready(struct 
mv88e6xxx_chip *chip, int port)
 /* Port Ether type for 6393X family */
 
 int mv88e6393x_port_set_ether_type(struct mv88e6xxx_chip *chip, int port,
-   u16 etype)
+  u16 etype)
 {
u16 val;
int err;
diff --git a/drivers/net/dsa/mv88e6xxx/port.h b/drivers/net/dsa/mv88e6xxx/port.h
index 051665fa22d5..70a5b246b7e2 100644
--- a/drivers/net/dsa/mv88e6xxx/port.h
+++ b/drivers/net/dsa/mv88e6xxx/port.h
@@ -49,9 +49,9 @@
 #define MV88E6XXX_PORT_STS_CMODE_2500BASEX 0x000b
 #define MV88E6XXX_PORT_STS_CMODE_XAUI  0x000c
 #define MV88E6XXX_PORT_STS_CMODE_RXAUI 0x000d
-#define MV88E6XXX_PORT_STS_CMODE_5GBASER   0x000c
-#define MV88E6XXX_PORT_STS_CMODE_10GBASER  0x000d
-#define MV88E6XXX_PORT_STS_CMODE_USXGMII   0x000e
+#define MV88E6393X_PORT_STS_CMODE_5GBASER  0x000c
+#define MV88E6393X_PORT_STS_CMODE_10GBASER 0x000d
+#define MV88E6393X_PORT_STS_CMODE_USXGMII  0x000e
 #define MV88E6185_PORT_STS_CDUPLEX 0x0008
 #define MV88E6185_PORT_STS_CMODE_MASK

[PATCH net 0/2] mlxsw: core: Thermal control fixes

2021-01-08 Thread Ido Schimmel

From: Ido Schimmel 

This series includes two fixes for thermal control in mlxsw.

Patch #1 validates that the alarm temperature threshold read from a
transceiver is above the warning temperature threshold. If not, the
current thresholds are maintained. It was observed that some transceiver
might be unreliable and sometimes report a too low alarm temperature
threshold which would result in thermal shutdown of the system.

Patch #2 increases the temperature threshold above which thermal
shutdown is triggered for the ASIC thermal zone. It is currently too low
and might result in thermal shutdown under perfectly fine operational
conditions.

Please consider both patches for stable.

Vadim Pasternak (2):
  mlxsw: core: Add validation of transceiver temperature thresholds
  mlxsw: core: Increase critical threshold for ASIC thermal zone

 drivers/net/ethernet/mellanox/mlxsw/core_thermal.c | 13 -
 1 file changed, 8 insertions(+), 5 deletions(-)

-- 
2.29.2

[PATCH net 2/2] mlxsw: core: Increase critical threshold for ASIC thermal zone

2021-01-08 Thread Ido Schimmel

From: Vadim Pasternak 

Increase critical threshold for ASIC thermal zone from 110C to 140C
according to the system hardware requirements. All the supported ASICs
(Spectrum-1, Spectrum-2, Spectrum-3) could be still operational with ASIC
temperature below 140C. With the old critical threshold value system
can perform unjustified shutdown.

All the systems equipped with the above ASICs implement thermal
protection mechanism at firmware level and firmware could decide to
perform system thermal shutdown in case the temperature is below 140C.
So with the new threshold system will not meltdown, while thermal
operating range will be aligned with hardware abilities.

Fixes: 41e760841d26 ("mlxsw: core: Replace thermal temperature trips with 
defines")
Fixes: a50c1e35650b ("mlxsw: core: Implement thermal zone")
Signed-off-by: Vadim Pasternak 
Reviewed-by: Jiri Pirko 
Signed-off-by: Ido Schimmel 
---
 drivers/net/ethernet/mellanox/mlxsw/core_thermal.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c 
b/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c
index 250a85049697..bf85ce9835d7 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c
@@ -19,7 +19,7 @@
 #define MLXSW_THERMAL_ASIC_TEMP_NORM   75000   /* 75C */
 #define MLXSW_THERMAL_ASIC_TEMP_HIGH   85000   /* 85C */
 #define MLXSW_THERMAL_ASIC_TEMP_HOT105000  /* 105C */
-#define MLXSW_THERMAL_ASIC_TEMP_CRIT   11  /* 110C */
+#define MLXSW_THERMAL_ASIC_TEMP_CRIT   14  /* 140C */
 #define MLXSW_THERMAL_HYSTERESIS_TEMP  5000/* 5C */
 #define MLXSW_THERMAL_MODULE_TEMP_SHIFT(MLXSW_THERMAL_HYSTERESIS_TEMP 
* 2)
 #define MLXSW_THERMAL_ZONE_MAX_NAME16
-- 
2.29.2

[PATCH net 1/2] mlxsw: core: Add validation of transceiver temperature thresholds

2021-01-08 Thread Ido Schimmel

From: Vadim Pasternak 

Validate thresholds to avoid a single failure due to some transceiver
unreliability. Ignore the last readouts in case warning temperature is
above alarm temperature, since it can cause unexpected thermal
shutdown. Stay with the previous values and refresh threshold within
the next iteration.

This is a rare scenario, but it was observed at a customer site.

Fixes: 6a79507cfe94 ("mlxsw: core: Extend thermal module with per QSFP module 
thermal zones")
Signed-off-by: Vadim Pasternak 
Reviewed-by: Jiri Pirko 
Signed-off-by: Ido Schimmel 
---
 drivers/net/ethernet/mellanox/mlxsw/core_thermal.c | 11 +++
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c 
b/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c
index 8fa286ccdd6b..250a85049697 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c
@@ -176,6 +176,12 @@ mlxsw_thermal_module_trips_update(struct device *dev, 
struct mlxsw_core *core,
if (err)
return err;
 
+   if (crit_temp > emerg_temp) {
+   dev_warn(dev, "%s : Critical threshold %d is above emergency 
threshold %d\n",
+tz->tzdev->type, crit_temp, emerg_temp);
+   return 0;
+   }
+
/* According to the system thermal requirements, the thermal zones are
 * defined with four trip points. The critical and emergency
 * temperature thresholds, provided by QSFP module are set as "active"
@@ -190,11 +196,8 @@ mlxsw_thermal_module_trips_update(struct device *dev, 
struct mlxsw_core *core,
tz->trips[MLXSW_THERMAL_TEMP_TRIP_NORM].temp = crit_temp;
tz->trips[MLXSW_THERMAL_TEMP_TRIP_HIGH].temp = crit_temp;
tz->trips[MLXSW_THERMAL_TEMP_TRIP_HOT].temp = emerg_temp;
-   if (emerg_temp > crit_temp)
-   tz->trips[MLXSW_THERMAL_TEMP_TRIP_CRIT].temp = emerg_temp +
+   tz->trips[MLXSW_THERMAL_TEMP_TRIP_CRIT].temp = emerg_temp +
MLXSW_THERMAL_MODULE_TEMP_SHIFT;
-   else
-   tz->trips[MLXSW_THERMAL_TEMP_TRIP_CRIT].temp = emerg_temp;
 
return 0;
 }
-- 
2.29.2

Re: [RFC PATCH net-next] bonding: add a vlan+srcmac tx hashing option

2021-01-08 Thread Jarod Wilson

On Fri, Jan 08, 2021 at 02:12:56PM +0100, Jiri Pirko wrote:
> Fri, Jan 08, 2021 at 12:58:13AM CET, ja...@redhat.com wrote:
> >On Mon, Dec 28, 2020 at 11:11:45AM +0100, Jiri Pirko wrote:
> >> Fri, Dec 18, 2020 at 08:30:33PM CET, ja...@redhat.com wrote:
> >> >This comes from an end-user request, where they're running multiple VMs on
> >> >hosts with bonded interfaces connected to some interest switch topologies,
> >> >where 802.3ad isn't an option. They're currently running a proprietary
> >> >solution that effectively achieves load-balancing of VMs and bandwidth
> >> >utilization improvements with a similar form of transmission algorithm.
> >> >
> >> >Basically, each VM has it's own vlan, so it always sends its traffic out
> >> >the same interface, unless that interface fails. Traffic gets split
> >> >between the interfaces, maintaining a consistent path, with failover still
> >> >available if an interface goes down.
> >> >
> >> >This has been rudimetarily tested to provide similar results, suitable for
> >> >them to use to move off their current proprietary solution.
> >> >
> >> >Still on the TODO list, if these even looks sane to begin with, is
> >> >fleshing out Documentation/networking/bonding.rst.
> >> 
> >> Jarod, did you consider using team driver instead ? :)
> >
> >That's actually one of the things that was suggested, since team I believe
> >already has support for this, but the user really wants to use bonding.
> >We're finding that a lot of users really still prefer bonding over team.
> 
> Do you know the reason, other than "nostalgia"?

I've heard a few different reasons that come to mind:

1) nostalgia is definitely one -- "we know bonding here"
2) support -- "the things I'm running say I need bonding to properly
support failover in their environment". How accurate this is, I don't
actually know.
3) monitoring -- "my monitoring solution knows about bonding, but not
about team". This is probably easily fixed, but may or may not be in the
user's direct control.
4) footprint -- "bonding does the job w/o team's userspace footprint".
I think this one is kind of hard for team to do anything about, bonding
really does have a smaller userspace footprint, which is a plus for
embedded type applications and high-security environments looking to keep
things as minimal as possible.

I think I've heard a few "we tried team years ago and it didn't work" as
well, which of course is ridiculous as a reason not to try something again,
since a lot can change in a few years in this world.

-- 
Jarod Wilson
ja...@redhat.com

Re: [PATCH] bus: mhi: Add inbound buffers allocation flag

2021-01-08 Thread Manivannan Sadhasivam

On Fri, Jan 08, 2021 at 03:01:59PM +0100, Loic Poulain wrote:
> Hi Mani,
> 
> On Fri, 8 Jan 2021 at 14:44, Manivannan Sadhasivam <
> manivannan.sadhasi...@linaro.org> wrote:
> 
> > On Wed, Jan 06, 2021 at 02:43:43PM +0100, Loic Poulain wrote:
> > > Currently, the MHI controller driver defines which channels should
> > > have their inbound buffers allocated and queued. But ideally, this is
> > > something that should be decided by the MHI device driver instead,
> >
> > We call them, "MHI client drivers"
> >
> 
> I'll fix that.
> 
> 
> > > which actually deals with that buffers.
> > >
> > > Add a flag parameter to mhi_prepare_for_transfer allowing to specify
> > > if buffers have to be allocated and queued by the MHI stack.
> > >
> > > Keep auto_queue flag for now, but should be removed at some point.
> > >
> > > Signed-off-by: Loic Poulain 
> > > ---
> > >  drivers/bus/mhi/core/internal.h |  2 +-
> > >  drivers/bus/mhi/core/main.c | 11 ---
> > >  drivers/net/mhi_net.c   |  2 +-
> > >  include/linux/mhi.h | 12 +++-
> > >  net/qrtr/mhi.c  |  2 +-
> > >  5 files changed, 22 insertions(+), 7 deletions(-)
> > >
> >
> > [...]
> >
> > > diff --git a/drivers/net/mhi_net.c b/drivers/net/mhi_net.c
> > > index fa41d8c..b7f7f2e 100644
> > > --- a/drivers/net/mhi_net.c
> > > +++ b/drivers/net/mhi_net.c
> > > @@ -265,7 +265,7 @@ static int mhi_net_probe(struct mhi_device *mhi_dev,
> > >   u64_stats_init(&mhi_netdev->stats.tx_syncp);
> > >
> > >   /* Start MHI channels */
> > > - err = mhi_prepare_for_transfer(mhi_dev);
> > > + err = mhi_prepare_for_transfer(mhi_dev, 0);
> >
> > Eventhough I'd like Hemant to comment on this patch, AFAIU this looks to
> > me a controller dependent behaviour. The controller should have the
> > information whether a particular channel can auto queue or not then the
> > client driver can be agnostic.
> >
> 
> The client driver can not be agnostic if this information is defined on the
> controller side. In one case client driver needs to allocate (and queue)
> its own buffers and in the other case it uses the pre-allocated ones.
> Moreover, that will break compatibility if we have one controller (e.g. a
> Wifi MHI controller) that e.g. defines IPCR channels as pre-allocated and
> another one that defines IPCR channels as non-pre-allocated. Having
> pre-allocated channels is not something related to the MHI device but to
> how the host (client driver) wants to manage buffers. It would then make
> sense to let this choice to the client driver.
> 
> 
> >
> > >   if (err)
> > >   goto out_err;
> > >
> > > diff --git a/include/linux/mhi.h b/include/linux/mhi.h
> > > index 209b335..6723339 100644
> > > --- a/include/linux/mhi.h
> > > +++ b/include/linux/mhi.h
> > > @@ -60,6 +60,14 @@ enum mhi_flags {
> > >  };
> > >
> > >  /**
> > > + * enum mhi_chan_flags - MHI channel flags
> > > + * @MHI_CH_INBOUND_ALLOC_BUFS: Automatically allocate and queue inbound
> > buffers
> > > + */
> > > +enum mhi_chan_flags {
> > > + MHI_CH_INBOUND_ALLOC_BUFS = BIT(0),
> > > +};
> > > +
> > > +/**
> > >   * enum mhi_device_type - Device types
> > >   * @MHI_DEVICE_XFER: Handles data transfer
> > >   * @MHI_DEVICE_CONTROLLER: Control device
> > > @@ -705,8 +713,10 @@ void mhi_device_put(struct mhi_device *mhi_dev);
> > >  /**
> > >   * mhi_prepare_for_transfer - Setup channel for data transfer
> > >   * @mhi_dev: Device associated with the channels
> > > + * @flags: MHI channel flags
> > >   */
> > > -int mhi_prepare_for_transfer(struct mhi_device *mhi_dev);
> > > +int mhi_prepare_for_transfer(struct mhi_device *mhi_dev,
> > > +  enum mhi_chan_flags flags);
> > >
> > >  /**
> > >   * mhi_unprepare_from_transfer - Unprepare the channels
> > > diff --git a/net/qrtr/mhi.c b/net/qrtr/mhi.c
> > > index 2bf2b19..47afded 100644
> > > --- a/net/qrtr/mhi.c
> > > +++ b/net/qrtr/mhi.c
> > > @@ -77,7 +77,7 @@ static int qcom_mhi_qrtr_probe(struct mhi_device
> > *mhi_dev,
> > >   int rc;
> > >
> > >   /* start channels */
> > > - rc = mhi_prepare_for_transfer(mhi_dev);
> > > + rc = mhi_prepare_for_transfer(mhi_dev, MHI_CH_INBOUND_ALLOC_BUFS);
> >
> > Are you sure it requires auto queued channel?
> >
> 
> This is how mhi-qrtr has been implemented, yes.
> 

skb is allocated in qrtr_endpoint_post(). Then how the host can pre
allocate the buffer here? Am I missing something?

Thanks,
Mani

> Regards,
> Loic

[PATCH] drivers: net: wireless: rtlwifi: fix bool comparison in expressions

2021-01-08 Thread Aditya Srivastava

There are certain conditional expressions in rtlwifi, where a boolean
variable is compared with true/false, in forms such as (foo == true) or
(false != bar), which does not comply with checkpatch.pl (CHECK:
BOOL_COMPARISON), according to which boolean variables should be
themselves used in the condition, rather than comparing with true/false

E.g., in drivers/net/wireless/realtek/rtlwifi/ps.c,
"if (find_p2p_ie == true)" can be replaced with "if (find_p2p_ie)"

Replace all such expressions with the bool variables appropriately

Signed-off-by: Aditya Srivastava 
---
- The changes made are compile tested
- Applies perfecly on next-20210108

 drivers/net/wireless/realtek/rtlwifi/ps.c | 4 ++--
 drivers/net/wireless/realtek/rtlwifi/rtl8188ee/dm.c   | 8 
 drivers/net/wireless/realtek/rtlwifi/rtl8188ee/hw.c   | 4 ++--
 drivers/net/wireless/realtek/rtlwifi/rtl8192c/dm_common.c | 4 ++--
 drivers/net/wireless/realtek/rtlwifi/rtl8192se/hw.c   | 4 ++--
 drivers/net/wireless/realtek/rtlwifi/rtl8821ae/phy.c  | 8 
 6 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/drivers/net/wireless/realtek/rtlwifi/ps.c 
b/drivers/net/wireless/realtek/rtlwifi/ps.c
index f99882255d48..629c03271bde 100644
--- a/drivers/net/wireless/realtek/rtlwifi/ps.c
+++ b/drivers/net/wireless/realtek/rtlwifi/ps.c
@@ -798,9 +798,9 @@ static void rtl_p2p_noa_ie(struct ieee80211_hw *hw, void 
*data,
ie += 3 + noa_len;
}
 
-   if (find_p2p_ie == true) {
+   if (find_p2p_ie) {
if ((p2pinfo->p2p_ps_mode > P2P_PS_NONE) &&
-   (find_p2p_ps_ie == false))
+   (!find_p2p_ps_ie))
rtl_p2p_ps_cmd(hw, P2P_PS_DISABLE);
}
 }
diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8188ee/dm.c 
b/drivers/net/wireless/realtek/rtlwifi/rtl8188ee/dm.c
index d10c14c694da..6f61d6a10627 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8188ee/dm.c
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8188ee/dm.c
@@ -474,11 +474,11 @@ static void rtl88e_dm_dig(struct ieee80211_hw *hw)
u8 dm_dig_max, dm_dig_min;
u8 current_igi = dm_dig->cur_igvalue;
 
-   if (rtlpriv->dm.dm_initialgain_enable == false)
+   if (!rtlpriv->dm.dm_initialgain_enable)
return;
-   if (dm_dig->dig_enable_flag == false)
+   if (!dm_dig->dig_enable_flag)
return;
-   if (mac->act_scanning == true)
+   if (mac->act_scanning)
return;
 
if (mac->link_state >= MAC80211_LINKED)
@@ -1637,7 +1637,7 @@ static void rtl88e_dm_fast_ant_training(struct 
ieee80211_hw *hw)
}
}
 
-   if (bpkt_filter_match == false) {
+   if (!bpkt_filter_match) {
rtl_set_bbreg(hw, DM_REG_TXAGC_A_1_MCS32_11N,
  BIT(16), 0);
rtl_set_bbreg(hw, DM_REG_IGI_A_11N, BIT(7), 0);
diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8188ee/hw.c 
b/drivers/net/wireless/realtek/rtlwifi/rtl8188ee/hw.c
index bd9160b166c5..861cc663ca93 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8188ee/hw.c
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8188ee/hw.c
@@ -1269,12 +1269,12 @@ void rtl88ee_set_check_bssid(struct ieee80211_hw *hw, 
bool check_bssid)
if (rtlpriv->psc.rfpwr_state != ERFON)
return;
 
-   if (check_bssid == true) {
+   if (check_bssid) {
reg_rcr |= (RCR_CBSSID_DATA | RCR_CBSSID_BCN);
rtlpriv->cfg->ops->set_hw_reg(hw, HW_VAR_RCR,
  (u8 *)(®_rcr));
_rtl88ee_set_bcn_ctrl_reg(hw, 0, BIT(4));
-   } else if (check_bssid == false) {
+   } else if (!check_bssid) {
reg_rcr &= (~(RCR_CBSSID_DATA | RCR_CBSSID_BCN));
_rtl88ee_set_bcn_ctrl_reg(hw, BIT(4), 0);
rtlpriv->cfg->ops->set_hw_reg(hw,
diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8192c/dm_common.c 
b/drivers/net/wireless/realtek/rtlwifi/rtl8192c/dm_common.c
index 265a1a336304..0b6a15c2e5cc 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8192c/dm_common.c
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8192c/dm_common.c
@@ -380,7 +380,7 @@ static void rtl92c_dm_initial_gain_multi_sta(struct 
ieee80211_hw *hw)
initialized = false;
dm_digtable->dig_ext_port_stage = DIG_EXT_PORT_STAGE_MAX;
return;
-   } else if (initialized == false) {
+   } else if (!initialized) {
initialized = true;
dm_digtable->dig_ext_port_stage = DIG_EXT_PORT_STAGE_0;
dm_digtable->cur_igvalue = 0x20;
@@ -509,7 +509,7 @@ static void rtl92c_dm_dig(struct ieee80211_hw *hw)
 {
struct rtl_priv *rtlpriv = rtl_priv(hw);
 
-

Fw: [Bug 211085] New: No response from TCP connection in ESTALISHED state if sending data segment with unacceptable ACK

2021-01-08 Thread Stephen Hemminger

This looks like a case where some conformance test is testing a corner case in
the standard where the Linux TCP stack is not following the standard for valid 
reasons.
Linux behavior of silently dropping the packet would reduce DoS changes and 
information
leakage for MiTM attacks.

Begin forwarded message:

Date: Fri, 08 Jan 2021 08:17:40 +
From: bugzilla-dae...@bugzilla.kernel.org
To: step...@networkplumber.org
Subject: [Bug 211085] New: No response from TCP connection in ESTALISHED state 
if sending data segment with unacceptable ACK


https://bugzilla.kernel.org/show_bug.cgi?id=211085

Bug ID: 211085
   Summary: No response from TCP connection in ESTALISHED state if
sending data segment with unacceptable ACK
   Product: Networking
   Version: 2.5
Kernel Version: 5.4.24 and upstream
  Hardware: All
OS: Linux
  Tree: Mainline
Status: NEW
  Severity: normal
  Priority: P1
 Component: IPV4
  Assignee: step...@networkplumber.org
  Reporter: alex.thres...@googlemail.com
Regression: No

We're currently failing to pass the TCP_UNACCEPTABLE_04 test from TCP/IP tests
of TC8 ECU and Network Test specification (Open Alliance). The Test
specification is freely available on the Internet.
TCP_UNACCEPTABLE_04 validates the behavior of RFC793 'Chapter 3.4. 
Establishing a connection / Reset Generation (3)' (Page 36) and 'Chapter 9 -
Event Processing / SEGMENT ARRIVES / Otherwise / Check ACK field' (Page 71) 
which pointed out that sending an unacceptable ACK in ESTALISHED state for
example ACK acks a data segment not yet sent should be answered with an ACK
(from RFC793 - '...must elicit only an empty acknowledgment segment containing 
the current send-sequence number and an acknowledgment indicating the next
sequence number expected to be received...'). After that the segment needs to
be discarded. 
Unfortunately the segment is only discarded and not confirmed by ACK. I did
some research in the TCP/IP stack and found the code which is responsible for
discarding the ACK. 
It's in 'net/ipv4/tcp_input.c' in function tcp_ack currently on line 3727 of
upstream kernel.

>>/* If the ack includes data we haven't sent yet, discard
>> * this segment (RFC793 Section 3.9).
>> */
>>if (after(ack, tp->snd_nxt))
>>return -1;  

Validation was done on DUT target with Kernel 5.4.24.

Let me if you need further informations.

-- 
You may reply to this email to add a comment.

You are receiving this mail because:
You are the assignee for the bug.

Re: [PATCH] net: phy: micrel: reconfigure the phy on resume

2021-01-08 Thread Andrew Lunn

On Fri, Jan 08, 2021 at 05:45:54PM +0200, Claudiu Beznea wrote:
> KSZ9131 is used in setups with SAMA7G5. SAMA7G5 supports a special
> power saving mode (backup mode) that cuts the power for almost all
> parts of the SoC. The rail powering the ethernet PHY is also cut off.
> When resuming, in case the PHY has been configured on probe with
> slew rate or DLL settings these needs to be restored thus call
> driver's config_init() on resume.
> 
> Signed-off-by: Claudiu Beznea 

Reviewed-by: Andrew Lunn 

Andrew

Re: [PATCH iproute2 v2 1/1] build: Fix link errors on some systems

2021-01-08 Thread Stephen Hemminger

On Fri, 8 Jan 2021 14:08:57 +0100
Petr Machata  wrote:

> Stephen Hemminger  writes:
> 
> > On Thu,  7 Jan 2021 09:13:34 +0200
> > Roi Dayan  wrote:
> >  
> >> +#define _IS_JSON_CONTEXT(type) ((type & PRINT_JSON || type & PRINT_ANY) 
> >> && is_json_context())
> >> +#define _IS_FP_CONTEXT(type) (!is_json_context() && (type & PRINT_FP || 
> >> type & PRINT_ANY))  
> >
> > You could fold the comparisons? and why are the two options doing 
> > comparison in different order?
> >
> > #define _IS_JSON_CONTEXT(type) (is_json_context() && (type & (PRINT_JSON | 
> > PRINT_ANY))
> > #define _IS_FP_CONTEXT(type)   (!is_json_context() && (type & (PRINT_FP || 
> > PRINT_ANY))  
> 
> (s/||/|/)

Agree.
This was just an email edit, never tried

Re: [PATCH mlx5-next 1/4] PCI: Configure number of MSI-X vectors for SR-IOV VFs

2021-01-08 Thread Alex Williamson

On Fri, 8 Jan 2021 09:25:25 +0200
Leon Romanovsky  wrote:

> On Thu, Jan 07, 2021 at 10:54:38PM -0500, Don Dutile wrote:
> > On 1/7/21 7:57 PM, Bjorn Helgaas wrote:  
> > > [+cc Alex, Don]  
> 
> <...>
> 
> > > Help me connect the dots here.  Is this required because of something
> > > peculiar to mlx5, or is something like this required for all SR-IOV
> > > devices because of the way the PCIe spec is written?  
> > So, overall, I'm guessing the mlx5 device can have 1000's of MSIX -- say, 
> > one per send/receive/completion queue.
> > This device capability may exceed the max number MSIX a VM can have/support 
> > (depending on guestos).
> > So, a sysfs tunable is used to set the max MSIX available, and thus, the 
> > device puts >1 send/rcv/completion queue intr on a given MSIX.
> >
> > ok, time for Leon to better state what this patch does,
> > and why it's needed on mlx5 (and may be applicable to other/future 
> > high-MSIX devices assigned to VMs (NVME?)).
> > Hmmm, now that I said it, why is it SRIOV-centric and not pci-device 
> > centric (can pass a PF w/high number of MSIX to a VM).  
> 
> Thanks Don and Bjorn,
> 
> I will answer on all comments a little bit later when I will return
> to the office (Sunday).
> 
> However it is important for me to present the use case.
> 
> Our mlx5 SR-IOV devices were always capable to drive many MSI-X (upto 2K,
> don't catch me on exact number), however when user created VFs, the FW has
> no knowledge of how those VFs will be used. So FW had no choice but statically
> and equally assign same amount of MSI-X to all VFs.
> 
> After SR-IOV VF creation, user will bind those new VFs to the VMs, but
> the VMs have different number of CPUs and despite HW being able to deliver
> all needed number of vectors (in mlx5 netdev world, number of channels == 
> number
> of CPUs == number of vectors), we will be limited by already set low number
> of vectors.
> 
> So it is not for vector reduction, but more for vector re-partition.
> 
> As an example, imagine mlx5 with two VFs. One VF is bounded to VM with 200 
> CPUs
> and another is bounded to VM with 1 CPU. They need different amount of MSI-X 
> vectors.
> 
> Hope that I succeeded to explain :).

The idea is not unreasonable imo, but without knowing the size of the
vector pool, range available per vf, or ultimately whether the vf
supports this feature before we try to configure it, I don't see how
userspace is expected to make use of this in the general case.  If the
configuration requires such specific vf vector usage and pf driver
specific knowledge, I'm not sure it's fit as a generic pci-sysfs
interface.  Thanks,

Alex

Re: [PATCH] net: phy: micrel: reconfigure the phy on resume

2021-01-08 Thread Heiner Kallweit

On 08.01.2021 16:45, Claudiu Beznea wrote:
> KSZ9131 is used in setups with SAMA7G5. SAMA7G5 supports a special
> power saving mode (backup mode) that cuts the power for almost all
> parts of the SoC. The rail powering the ethernet PHY is also cut off.
> When resuming, in case the PHY has been configured on probe with
> slew rate or DLL settings these needs to be restored thus call
> driver's config_init() on resume.
> 
When would the SoC enter this backup mode? And would it suspend the
MDIO bus before cutting power to the PHY?
I'm asking because in mdio_bus_phy_restore() we call phy_init_hw()
already (that calls the driver's config_init).

> Signed-off-by: Claudiu Beznea 
> ---
>  drivers/net/phy/micrel.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c
> index 3fe552675dd2..52d3a0480158 100644
> --- a/drivers/net/phy/micrel.c
> +++ b/drivers/net/phy/micrel.c
> @@ -1077,7 +1077,7 @@ static int kszphy_resume(struct phy_device *phydev)
>*/
>   usleep_range(1000, 2000);
>  
> - ret = kszphy_config_reset(phydev);
> + ret = phydev->drv->config_init(phydev);
>   if (ret)
>   return ret;
>  
>

[PATCH v5 net-next 00/16] Make .ndo_get_stats64 sleepable

2021-01-08 Thread Vladimir Oltean

From: Vladimir Oltean 

There is a desire to standardize the counters that have previously been
reported through ethtool statistics into something that can be more
uniformly queried from user space. The ndo_get_stats64 are a good
candidate to keep standardized counters, as well as add new ones (like
RMON MIBs) but unfortunately, as things stand, migration will not be
smooth sailing if the ndo_get_stats64 does not offer the same calling
context as ethtool does. Namely, currently ndo_get_stats64 assumes
atomic context, and this series would like to change that.

The reason why sleeping while retrieving counters would be desirable in
the first place is that if we have hardware that needs to be accessed
through a slow bus like SPI, or through a firmware. Today we cannot do
that directly in .ndo_get_stats64, so we would have to poll counters
periodically and return a cached (not up to date) copy in the atomic NDO
callback. This is undesirable on both ends: more work than strictly
needed is being done, and the end result is also worse (not guaranteed
to be up to date). Also, retrieving counters from the hardware rather
than software counters incremented by the driver is more compatible with
the concept of interfaces with offload for L2 or L3 forwarding, where
the CPU otherwise never has a chance to keep accurate counters for most
of the traffic.

I could not test most of the drivers that were modified, so the only
guarantee is that make allyesconfig passes.

Changes in v5:
- Changed bonding and net_failover to use rcu_read_lock().
- Actually propagating errors from bond_get_stats now.

Changes in v4:
- Propagated errors from ndo_get_stats64.

Changes in v3:
- Resolved some memory leak issues in the bonding patch 10/12.

Changes in v2:
- Addressed the recursion issues in .ndo_get_stats64 from bonding and
  net_failover.
- Renamed netdev_lists_lock to netif_lists_lock
- Stopped taking netif_lists_lock from drivers as much as possible.

Vladimir Oltean (16):
  net: mark dev_base_lock for deprecation
  net: introduce a mutex for the netns interface lists
  net: procfs: hold netif_lists_lock when retrieving device statistics
  net: sysfs: don't hold dev_base_lock while retrieving device
statistics
  s390/appldata_net_sum: hold the netdev lists lock when retrieving
device statistics
  parisc/led: hold the netdev lists lock when retrieving device
statistics
  net: remove return value from dev_get_stats
  net: allow ndo_get_stats64 to return an int error code
  scsi: fcoe: propagate errors from dev_get_stats
  net: openvswitch: propagate errors from dev_get_stats
  net: propagate errors from dev_get_stats
  net: terminate errors from dev_get_stats
  net: openvswitch: ensure dev_get_stats can sleep
  net: net_failover: ensure .ndo_get_stats64 can sleep
  net: bonding: ensure .ndo_get_stats64 can sleep
  net: mark ndo_get_stats64 as being able to sleep

 Documentation/networking/netdevices.rst   |   8 +-
 Documentation/networking/statistics.rst   |   9 +-
 arch/s390/appldata/appldata_net_sum.c |  41 +++---
 drivers/infiniband/hw/hfi1/vnic_main.c|   6 +-
 drivers/infiniband/ulp/ipoib/ipoib_main.c |   9 +-
 .../infiniband/ulp/opa_vnic/opa_vnic_netdev.c |   9 +-
 drivers/leds/trigger/ledtrig-netdev.c |  16 ++-
 drivers/net/bonding/bond_main.c   | 131 ++
 drivers/net/dummy.c   |   6 +-
 drivers/net/ethernet/alacritech/slicoss.c |   6 +-
 drivers/net/ethernet/amazon/ena/ena_netdev.c  |   8 +-
 drivers/net/ethernet/amd/xgbe/xgbe-drv.c  |   6 +-
 drivers/net/ethernet/apm/xgene-v2/main.c  |   6 +-
 .../ethernet/apm/xgene/xgene_enet_ethtool.c   |   9 +-
 .../net/ethernet/apm/xgene/xgene_enet_main.c  |   7 +-
 drivers/net/ethernet/atheros/alx/main.c   |   6 +-
 drivers/net/ethernet/broadcom/b44.c   |   6 +-
 drivers/net/ethernet/broadcom/bcmsysport.c|   6 +-
 drivers/net/ethernet/broadcom/bnx2.c  |   5 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt.c |   6 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c |   4 +-
 drivers/net/ethernet/broadcom/tg3.c   |   8 +-
 drivers/net/ethernet/brocade/bna/bnad.c   |   4 +-
 drivers/net/ethernet/calxeda/xgmac.c  |   4 +-
 .../net/ethernet/cavium/liquidio/lio_main.c   |   6 +-
 .../ethernet/cavium/liquidio/lio_vf_main.c|   6 +-
 .../net/ethernet/cavium/liquidio/lio_vf_rep.c |   8 +-
 .../net/ethernet/cavium/thunder/nicvf_main.c  |   5 +-
 .../net/ethernet/chelsio/cxgb4/cxgb4_main.c   |   8 +-
 drivers/net/ethernet/cisco/enic/enic_main.c   |   8 +-
 drivers/net/ethernet/cortina/gemini.c |   6 +-
 drivers/net/ethernet/ec_bhf.c |   4 +-
 drivers/net/ethernet/emulex/benet/be_main.c   |   6 +-
 .../net/ethernet/freescale/dpaa/dpaa_eth.c|   6 +-
 .../net/ethernet/freescale/dpaa2/dpaa2-eth.c  |   6 +-
 drivers/net/ethernet/google/gve/gve_main.c|   4 +-
 drivers/net/ethernet/hisilicon/hns/hns_enet.c |   6 +-
 ...

[PATCH v5 net-next 07/16] net: remove return value from dev_get_stats

2021-01-08 Thread Vladimir Oltean

From: Vladimir Oltean 

After commit 28172739f0a2 ("net: fix 64 bit counters on 32 bit arches"),
dev_get_stats got an additional argument for storage of statistics. At
this point, dev_get_stats could return either the passed "storage"
argument, or the output of .ndo_get_stats64.

Then commit caf586e5f23c ("net: add a core netdev->rx_dropped counter")
came, and the output of .ndo_get_stats64 (still returning a pointer to
struct rtnl_link_stats64) started being ignored.

Then came commit bc1f44709cf2 ("net: make ndo_get_stats64 a void
function") which made .ndo_get_stats64 stop returning anything.

So now, dev_get_stats always reports the "storage" pointer received as
argument. This is useless. Some drivers are dealing with unnecessary
complexity due to this, using another pointer to poke around the
returned statistics, when they can do that directly through the
stack-allocated struct rtnl_link_stats64.

Refactor these callers to ignore the return value completely and just
access the values from their struct rtnl_link_stats64 local variable.

Signed-off-by: Vladimir Oltean 
---
Changes in v5:
- Renamed "stats" to "dev_stats" in fcoe_transport.c.
- Slightly reworded the commit title and message.

Changes in v4:
None.

Changes in v3:
None.

Changes in v2:
Patch is new.

 arch/s390/appldata/appldata_net_sum.c | 25 +
 drivers/leds/trigger/ledtrig-netdev.c |  9 ++--
 drivers/net/bonding/bond_main.c   |  7 ++-
 .../net/ethernet/hisilicon/hns/hns_ethtool.c  | 51 +--
 .../net/ethernet/intel/ixgbe/ixgbe_ethtool.c  |  7 ++-
 drivers/net/ethernet/intel/ixgbevf/ethtool.c  |  7 ++-
 drivers/net/net_failover.c| 13 +++--
 drivers/parisc/led.c  |  9 ++--
 drivers/scsi/fcoe/fcoe_transport.c|  6 +--
 drivers/usb/gadget/function/rndis.c   | 45 ++--
 include/linux/netdevice.h |  3 +-
 net/8021q/vlanproc.c  | 15 +++---
 net/core/dev.c|  6 +--
 net/core/net-procfs.c | 35 ++---
 net/core/net-sysfs.c  |  7 +--
 net/openvswitch/vport.c   | 25 +
 16 files changed, 123 insertions(+), 147 deletions(-)

diff --git a/arch/s390/appldata/appldata_net_sum.c 
b/arch/s390/appldata/appldata_net_sum.c
index 4db886980cba..6146606ac9a3 100644
--- a/arch/s390/appldata/appldata_net_sum.c
+++ b/arch/s390/appldata/appldata_net_sum.c
@@ -81,19 +81,18 @@ static void appldata_get_net_sum_data(void *data)
netif_lists_lock(&init_net);
 
for_each_netdev(&init_net, dev) {
-   const struct rtnl_link_stats64 *stats;
-   struct rtnl_link_stats64 temp;
-
-   stats = dev_get_stats(dev, &temp);
-   rx_packets += stats->rx_packets;
-   tx_packets += stats->tx_packets;
-   rx_bytes   += stats->rx_bytes;
-   tx_bytes   += stats->tx_bytes;
-   rx_errors  += stats->rx_errors;
-   tx_errors  += stats->tx_errors;
-   rx_dropped += stats->rx_dropped;
-   tx_dropped += stats->tx_dropped;
-   collisions += stats->collisions;
+   struct rtnl_link_stats64 stats;
+
+   dev_get_stats(dev, &stats);
+   rx_packets += stats.rx_packets;
+   tx_packets += stats.tx_packets;
+   rx_bytes   += stats.rx_bytes;
+   tx_bytes   += stats.tx_bytes;
+   rx_errors  += stats.rx_errors;
+   tx_errors  += stats.tx_errors;
+   rx_dropped += stats.rx_dropped;
+   tx_dropped += stats.tx_dropped;
+   collisions += stats.collisions;
i++;
}
 
diff --git a/drivers/leds/trigger/ledtrig-netdev.c 
b/drivers/leds/trigger/ledtrig-netdev.c
index d5e774d83021..4382ee278309 100644
--- a/drivers/leds/trigger/ledtrig-netdev.c
+++ b/drivers/leds/trigger/ledtrig-netdev.c
@@ -347,9 +347,8 @@ static void netdev_trig_work(struct work_struct *work)
 {
struct led_netdev_data *trigger_data =
container_of(work, struct led_netdev_data, work.work);
-   struct rtnl_link_stats64 *dev_stats;
+   struct rtnl_link_stats64 dev_stats;
unsigned int new_activity;
-   struct rtnl_link_stats64 temp;
unsigned long interval;
int invert;
 
@@ -364,12 +363,12 @@ static void netdev_trig_work(struct work_struct *work)
!test_bit(NETDEV_LED_RX, &trigger_data->mode))
return;
 
-   dev_stats = dev_get_stats(trigger_data->net_dev, &temp);
+   dev_get_stats(trigger_data->net_dev, &dev_stats);
new_activity =
(test_bit(NETDEV_LED_TX, &trigger_data->mode) ?
-   dev_stats->tx_packets : 0) +
+   dev_stats.tx_packets : 0) +
(test_bit(NETDEV_LED_RX, &trigger_data->mode) ?
-   dev_stats->rx_packets :

[PATCH v5 net-next 06/16] parisc/led: hold the netdev lists lock when retrieving device statistics

2021-01-08 Thread Vladimir Oltean

From: Vladimir Oltean 

In the effort of making .ndo_get_stats64 be able to sleep, we need to
ensure the callers of dev_get_stats do not use atomic context.

The LED driver for HP-PARISC workstations uses a workqueue to
periodically check for updates in network interface statistics, and
flicker when those have changed (i.e. there has been activity on the
line). Ignoring the fact that this driver is completely duplicating
drivers/leds/trigger/ledtrig-netdev.c, there is an even bigger problem.
Now, the dev_get_stats call can sleep, and iterating through the list of
network interfaces still needs to ensure the integrity of list of
network interfaces. So that leaves us only one locking option given the
current design of the network stack, and that is the netns mutex.

This patch also reindents the code that gathers device statistics, since
the standard in the Linux kernel is to use one tab character per
indentation level.

Cc: "James E.J. Bottomley" 
Cc: Helge Deller 
Cc: linux-par...@vger.kernel.org
Signed-off-by: Vladimir Oltean 
---
Changes in v5:
Squashed with the reindenting of the code.

Changes in v4:
None.

Changes in v3:
None.

Changes in v2:
None.

 drivers/parisc/led.c | 38 ++
 1 file changed, 22 insertions(+), 16 deletions(-)

diff --git a/drivers/parisc/led.c b/drivers/parisc/led.c
index 36c6613f7a36..c8c6b2301dc9 100644
--- a/drivers/parisc/led.c
+++ b/drivers/parisc/led.c
@@ -38,7 +38,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
@@ -355,22 +354,29 @@ static __inline__ int led_get_net_activity(void)
int retval;
 
rx_total = tx_total = 0;
-   
-   /* we are running as a workqueue task, so we can use an RCU lookup */
-   rcu_read_lock();
-   for_each_netdev_rcu(&init_net, dev) {
-   const struct rtnl_link_stats64 *stats;
-   struct rtnl_link_stats64 temp;
-   struct in_device *in_dev = __in_dev_get_rcu(dev);
-   if (!in_dev || !in_dev->ifa_list)
-   continue;
-   if (ipv4_is_loopback(in_dev->ifa_list->ifa_local))
-   continue;
-   stats = dev_get_stats(dev, &temp);
-   rx_total += stats->rx_packets;
-   tx_total += stats->tx_packets;
+
+   /* we are running as a workqueue task, so we can sleep */
+   netif_lists_lock(&init_net);
+
+   for_each_netdev(&init_net, dev) {
+   struct in_device *in_dev = in_dev_get(dev);
+   const struct rtnl_link_stats64 *stats;
+   struct rtnl_link_stats64 temp;
+
+   if (!in_dev || !in_dev->ifa_list ||
+   ipv4_is_loopback(in_dev->ifa_list->ifa_local)) {
+   in_dev_put(in_dev);
+   continue;
+   }
+
+   in_dev_put(in_dev);
+
+   stats = dev_get_stats(dev, &temp);
+   rx_total += stats->rx_packets;
+   tx_total += stats->tx_packets;
}
-   rcu_read_unlock();
+
+   netif_lists_unlock(&init_net);
 
retval = 0;
 
-- 
2.25.1

Re: [net-next 15/19] can: tcan4x5x: rework SPI access

2021-01-08 Thread Jakub Kicinski

On Fri, 8 Jan 2021 11:07:26 +0100 Ahmad Fatoum wrote:
> >>> +struct __packed tcan4x5x_map_buf { 
> >>> + struct tcan4x5x_buf_cmd cmd; 
> >>> + u8 data[256 * sizeof(u32)]; 
> >>> +} cacheline_aligned; 
> >>
> >> Due to the packing of the struct tcan4x5x_buf_cmd it should have a length 
> >> of 4
> >> bytes. Without __packed, will the "u8 data" come directly after the cmd?  
> > 
> > Yup, u8 with no alignment attribute will follow the previous
> > field with no holes.  
> 
> __packed has a documentation benefit though. It documents that the author
> considers the current layout to be the only correct one. (and thus extra
> care should be taken when modifying it).

cacheline_aligned adds a big architecture dependent padding at the
end of this struct, so the size of this structure is architecture
dependent. Besides using packed forced the compiler to use byte by byte
loads on architectures without unaligned access, so __packed is not
free.

[PATCH v5 net-next 11/16] net: propagate errors from dev_get_stats

2021-01-08 Thread Vladimir Oltean

From: Vladimir Oltean 

dev_get_stats can now return error codes. Take the remaining call sites
where those errors can be propagated, which are all trivial, and convert
them to look at that error code and stop processing.

The effects of simulating a kernel error (returning -ENOMEM) upon
existing programs or kernel interfaces:

- cat /proc/net/dev prints up until the interface that failed, and there
  it returns:
cat: read error: Cannot allocate memory

- ifstat simply returns this and prints nothing:
Error: Buffer too small for object.
Dump terminated

- ip -s -s link show behaves the same as ifstat.

- ifconfig prints only the info for the interfaces whose statistics did
  not fail.

Signed-off-by: Vladimir Oltean 
---
Changes in v5:
- Actually propagate errors from bonding and net_failover from within
  this patch.
- Properly propagating the dev_get_stats() error code from
  rtnl_fill_stats now, and not -EMSGSIZE.

Changes in v4:
Patch is new (Eric's suggestion).

 drivers/net/bonding/bond_main.c | 24 -
 drivers/net/net_failover.c  | 33 ++---
 drivers/parisc/led.c|  7 +-
 drivers/usb/gadget/function/rndis.c |  4 +++-
 net/8021q/vlanproc.c|  7 --
 net/core/net-procfs.c   | 16 ++
 net/core/net-sysfs.c|  4 +++-
 net/core/rtnetlink.c| 15 +
 8 files changed, 84 insertions(+), 26 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 5a3178b3dba3..2352ef64486b 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1757,7 +1757,12 @@ int bond_enslave(struct net_device *bond_dev, struct 
net_device *slave_dev,
 
slave_dev->priv_flags |= IFF_BONDING;
/* initialize slave stats */
-   dev_get_stats(new_slave->dev, &new_slave->slave_stats);
+   res = dev_get_stats(new_slave->dev, &new_slave->slave_stats);
+   if (res) {
+   slave_err(bond_dev, slave_dev, "dev_get_stats returned %d\n",
+ res);
+   goto err_close;
+   }
 
if (bond_is_lb(bond)) {
/* bond_alb_init_slave() must be called before all other stages 
since
@@ -2094,7 +2099,13 @@ static int __bond_release_one(struct net_device 
*bond_dev,
bond_sysfs_slave_del(slave);
 
/* recompute stats just before removing the slave */
-   bond_get_stats(bond->dev, &bond->bond_stats);
+   err = bond_get_stats(bond->dev, &bond->bond_stats);
+   if (err) {
+   slave_info(bond_dev, slave_dev, "dev_get_stats returned %d\n",
+  err);
+   unblock_netpoll_tx();
+   return err;
+   }
 
bond_upper_dev_unlink(bond, slave);
/* unregister rx_handler early so bond_handle_frame wouldn't be called
@@ -3743,7 +3754,7 @@ static int bond_get_stats(struct net_device *bond_dev,
struct list_head *iter;
struct slave *slave;
int nest_level = 0;
-
+   int res = 0;
 
rcu_read_lock();
 #ifdef CONFIG_LOCKDEP
@@ -3754,7 +3765,9 @@ static int bond_get_stats(struct net_device *bond_dev,
memcpy(stats, &bond->bond_stats, sizeof(*stats));
 
bond_for_each_slave_rcu(bond, slave, iter) {
-   dev_get_stats(slave->dev, &temp);
+   res = dev_get_stats(slave->dev, &temp);
+   if (res)
+   goto out;
 
bond_fold_stats(stats, &temp, &slave->slave_stats);
 
@@ -3763,10 +3776,11 @@ static int bond_get_stats(struct net_device *bond_dev,
}
 
memcpy(&bond->bond_stats, stats, sizeof(*stats));
+out:
spin_unlock(&bond->stats_lock);
rcu_read_unlock();
 
-   return 0;
+   return res;
 }
 
 static int bond_do_ioctl(struct net_device *bond_dev, struct ifreq *ifr, int 
cmd)
diff --git a/drivers/net/net_failover.c b/drivers/net/net_failover.c
index e032ad1c5e22..7f70555e68d1 100644
--- a/drivers/net/net_failover.c
+++ b/drivers/net/net_failover.c
@@ -185,6 +185,7 @@ static int net_failover_get_stats(struct net_device *dev,
struct net_failover_info *nfo_info = netdev_priv(dev);
struct rtnl_link_stats64 temp;
struct net_device *slave_dev;
+   int err = 0;
 
spin_lock(&nfo_info->stats_lock);
memcpy(stats, &nfo_info->failover_stats, sizeof(*stats));
@@ -193,24 +194,29 @@ static int net_failover_get_stats(struct net_device *dev,
 
slave_dev = rcu_dereference(nfo_info->primary_dev);
if (slave_dev) {
-   dev_get_stats(slave_dev, &temp);
+   err = dev_get_stats(slave_dev, &temp);
+   if (err)
+   goto out;
net_failover_fold_stats(stats, &temp, &nfo_info->primary_stats);
memcpy(&nfo_info->primary_stats, &temp, sizeof(temp));
}
 
slave_dev = rcu_dereference(nfo_info->standby

[PATCH v5 net-next 10/16] net: openvswitch: propagate errors from dev_get_stats

2021-01-08 Thread Vladimir Oltean

From: Vladimir Oltean 

The dev_get_stats function can now return an error code, so the code
that retrieves vport statistics and sends them through netlink needs to
propagate that error code.

Modify the drastic BUG_ON checks to operate only on the -EMSGSIZE error
code (the only error code previously possible), and not crash the kernel
in case dev_get_stats fails. This is in line with what rtnetlink.c does.

Signed-off-by: Vladimir Oltean 
---
Changes in v5:
Still keeping the BUG_ON condition except for the output of
ovs_vport_get_stats.

Changes in v4:
Patch is new (Eric's suggestion).

 net/openvswitch/datapath.c | 25 +++--
 net/openvswitch/vport.c| 10 --
 net/openvswitch/vport.h|  2 +-
 3 files changed, 28 insertions(+), 9 deletions(-)

diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 9d6ef6cb9b26..160b8dc453da 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -1987,7 +1987,10 @@ static int ovs_vport_cmd_fill_info(struct vport *vport, 
struct sk_buff *skb,
goto nla_put_failure;
}
 
-   ovs_vport_get_stats(vport, &vport_stats);
+   err = ovs_vport_get_stats(vport, &vport_stats);
+   if (err)
+   goto error;
+
if (nla_put_64bit(skb, OVS_VPORT_ATTR_STATS,
  sizeof(struct ovs_vport_stats), &vport_stats,
  OVS_VPORT_ATTR_PAD))
@@ -2028,7 +2031,9 @@ struct sk_buff *ovs_vport_cmd_build_info(struct vport 
*vport, struct net *net,
 
retval = ovs_vport_cmd_fill_info(vport, skb, net, portid, seq, 0, cmd,
 GFP_KERNEL);
-   BUG_ON(retval < 0);
+   BUG_ON(retval == -EMSGSIZE);
+   if (retval)
+   return ERR_PTR(retval);
 
return skb;
 }
@@ -2173,6 +2178,9 @@ static int ovs_vport_cmd_new(struct sk_buff *skb, struct 
genl_info *info)
err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info),
  info->snd_portid, info->snd_seq, 0,
  OVS_VPORT_CMD_NEW, GFP_KERNEL);
+   BUG_ON(err == -EMSGSIZE);
+   if (err)
+   goto exit_unlock_free;
 
new_headroom = netdev_get_fwd_headroom(vport->dev);
 
@@ -2181,7 +2189,6 @@ static int ovs_vport_cmd_new(struct sk_buff *skb, struct 
genl_info *info)
else
netdev_set_rx_headroom(vport->dev, dp->max_headroom);
 
-   BUG_ON(err < 0);
ovs_unlock();
 
ovs_notify(&dp_vport_genl_family, reply, info);
@@ -2234,7 +2241,9 @@ static int ovs_vport_cmd_set(struct sk_buff *skb, struct 
genl_info *info)
err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info),
  info->snd_portid, info->snd_seq, 0,
  OVS_VPORT_CMD_SET, GFP_KERNEL);
-   BUG_ON(err < 0);
+   BUG_ON(err == -EMSGSIZE);
+   if (err)
+   goto exit_unlock_free;
 
ovs_unlock();
ovs_notify(&dp_vport_genl_family, reply, info);
@@ -2274,7 +2283,9 @@ static int ovs_vport_cmd_del(struct sk_buff *skb, struct 
genl_info *info)
err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info),
  info->snd_portid, info->snd_seq, 0,
  OVS_VPORT_CMD_DEL, GFP_KERNEL);
-   BUG_ON(err < 0);
+   BUG_ON(err == -EMSGSIZE);
+   if (err)
+   goto exit_unlock_free;
 
/* the vport deletion may trigger dp headroom update */
dp = vport->dp;
@@ -2321,7 +2332,9 @@ static int ovs_vport_cmd_get(struct sk_buff *skb, struct 
genl_info *info)
err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info),
  info->snd_portid, info->snd_seq, 0,
  OVS_VPORT_CMD_GET, GFP_ATOMIC);
-   BUG_ON(err < 0);
+   BUG_ON(err == -EMSGSIZE);
+   if (err)
+   goto exit_unlock_free;
rcu_read_unlock();
 
return genlmsg_reply(reply, info);
diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c
index 215a818bf9ce..e66c949fd97a 100644
--- a/net/openvswitch/vport.c
+++ b/net/openvswitch/vport.c
@@ -267,11 +267,15 @@ void ovs_vport_del(struct vport *vport)
  *
  * Must be called with ovs_mutex or rcu_read_lock.
  */
-void ovs_vport_get_stats(struct vport *vport, struct ovs_vport_stats *stats)
+int ovs_vport_get_stats(struct vport *vport, struct ovs_vport_stats *stats)
 {
struct rtnl_link_stats64 dev_stats;
+   int err;
+
+   err = dev_get_stats(vport->dev, &dev_stats);
+   if (err)
+   return err;
 
-   dev_get_stats(vport->dev, &dev_stats);
stats->rx_errors  = dev_stats.rx_errors;
stats->tx_errors  = dev_stats.tx_errors;
stats->tx_dropped = dev_stats.tx_dropped;
@@ -281,6 +285,8 @@ void ovs_vport_get_stats(struct vport *vport, struct 
ovs_vpo

[PATCH v5 net-next 12/16] net: terminate errors from dev_get_stats

2021-01-08 Thread Vladimir Oltean

From: Vladimir Oltean 

dev_get_stats can now return errors. Some call sites are coming from a
context that returns void (ethtool stats, workqueue context). So since
we can't report to the upper layer, do the next best thing: shout.

This patch wraps up the conversion of existing dev_get_stats callers, so
we can add the __must_check attribute now to ensure that future callers
keep doing this too.

Signed-off-by: Vladimir Oltean 
---
Changes in v5:
Added the __must_check to dev_get_stats.

Changes in v4:
Patch is new (Eric's suggestion).

 arch/s390/appldata/appldata_net_sum.c   | 10 --
 drivers/leds/trigger/ledtrig-netdev.c   |  9 -
 drivers/net/ethernet/apm/xgene/xgene_enet_ethtool.c |  9 +++--
 drivers/net/ethernet/hisilicon/hns/hns_ethtool.c|  7 ++-
 drivers/net/ethernet/intel/e1000e/ethtool.c |  9 +++--
 drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c|  9 +++--
 drivers/net/ethernet/intel/ixgbevf/ethtool.c|  9 +++--
 include/linux/netdevice.h   |  3 ++-
 net/core/dev.c  |  3 ++-
 9 files changed, 54 insertions(+), 14 deletions(-)

diff --git a/arch/s390/appldata/appldata_net_sum.c 
b/arch/s390/appldata/appldata_net_sum.c
index 6146606ac9a3..72cb5344e488 100644
--- a/arch/s390/appldata/appldata_net_sum.c
+++ b/arch/s390/appldata/appldata_net_sum.c
@@ -58,11 +58,11 @@ struct appldata_net_sum_data {
  */
 static void appldata_get_net_sum_data(void *data)
 {
-   int i;
struct appldata_net_sum_data *net_data;
struct net_device *dev;
unsigned long rx_packets, tx_packets, rx_bytes, tx_bytes, rx_errors,
tx_errors, rx_dropped, tx_dropped, collisions;
+   int ret, i;
 
net_data = data;
net_data->sync_count_1++;
@@ -83,7 +83,13 @@ static void appldata_get_net_sum_data(void *data)
for_each_netdev(&init_net, dev) {
struct rtnl_link_stats64 stats;
 
-   dev_get_stats(dev, &stats);
+   ret = dev_get_stats(dev, &stats);
+   if (ret) {
+   netif_lists_unlock(&init_net);
+   netdev_err(dev, "dev_get_stats returned %d\n", ret);
+   return;
+   }
+
rx_packets += stats.rx_packets;
tx_packets += stats.tx_packets;
rx_bytes   += stats.rx_bytes;
diff --git a/drivers/leds/trigger/ledtrig-netdev.c 
b/drivers/leds/trigger/ledtrig-netdev.c
index 4382ee278309..c717b7e7dd81 100644
--- a/drivers/leds/trigger/ledtrig-netdev.c
+++ b/drivers/leds/trigger/ledtrig-netdev.c
@@ -351,6 +351,7 @@ static void netdev_trig_work(struct work_struct *work)
unsigned int new_activity;
unsigned long interval;
int invert;
+   int err;
 
/* If we dont have a device, insure we are off */
if (!trigger_data->net_dev) {
@@ -363,7 +364,13 @@ static void netdev_trig_work(struct work_struct *work)
!test_bit(NETDEV_LED_RX, &trigger_data->mode))
return;
 
-   dev_get_stats(trigger_data->net_dev, &dev_stats);
+   err = dev_get_stats(trigger_data->net_dev, &dev_stats);
+   if (err) {
+   netdev_err(trigger_data->net_dev,
+  "dev_get_stats returned %d\n", err);
+   return;
+   }
+
new_activity =
(test_bit(NETDEV_LED_TX, &trigger_data->mode) ?
dev_stats.tx_packets : 0) +
diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_ethtool.c 
b/drivers/net/ethernet/apm/xgene/xgene_enet_ethtool.c
index ada70425b48c..aab6a81f0438 100644
--- a/drivers/net/ethernet/apm/xgene/xgene_enet_ethtool.c
+++ b/drivers/net/ethernet/apm/xgene/xgene_enet_ethtool.c
@@ -266,9 +266,14 @@ static void xgene_get_ethtool_stats(struct net_device 
*ndev,
 {
struct xgene_enet_pdata *pdata = netdev_priv(ndev);
struct rtnl_link_stats64 stats;
-   int i;
+   int err, i;
+
+   err = dev_get_stats(ndev, &stats);
+   if (err) {
+   netdev_err(ndev, "dev_get_stats returned %d\n", err);
+   return;
+   }
 
-   dev_get_stats(ndev, &stats);
for (i = 0; i < XGENE_STATS_LEN; i++)
data[i] = *(u64 *)((char *)&stats + gstrings_stats[i].offset);
 
diff --git a/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c 
b/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c
index ee2172011051..d05fa7b3f6e0 100644
--- a/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c
+++ b/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c
@@ -840,6 +840,7 @@ static void hns_get_ethtool_stats(struct net_device *netdev,
struct hns_nic_priv *priv = netdev_priv(netdev);
struct hnae_handle *h = priv->ae_handle;
struct rtnl_link_stats64 net_stats;
+   int err;
 
if (!h->dev->ops->get_stats || !h->dev->ops->update_stats) {
netdev_err(netdev, "get_stats or up

Re: [PATCH net 3/3] net-sysfs: move the xps cpus/rxqs retrieval in a common function

2021-01-08 Thread Alexander Duyck

On Fri, Jan 8, 2021 at 1:07 AM Antoine Tenart  wrote:
>
> Quoting Alexander Duyck (2021-01-07 17:38:35)
> > On Thu, Jan 7, 2021 at 12:54 AM Antoine Tenart  wrote:
> > >
> > > Quoting Alexander Duyck (2021-01-06 20:54:11)
> > > > On Wed, Jan 6, 2021 at 10:04 AM Antoine Tenart  
> > > > wrote:
> > >
> > > That would require to hold rcu_read_lock in the caller and I'd like to
> > > keep it in that function.
> >
> > Actually you could probably make it work if you were to pass a pointer
> > to the RCU pointer.
>
> That should work but IMHO that could be easily breakable by future
> changes as it's a bit tricky.
>
> > > > > if (dev->num_tc) {
> > > > > /* Do not allow XPS on subordinate device directly */
> > > > > num_tc = dev->num_tc;
> > > > > -   if (num_tc < 0) {
> > > > > -   ret = -EINVAL;
> > > > > -   goto err_rtnl_unlock;
> > > > > -   }
> > > > > +   if (num_tc < 0)
> > > > > +   return -EINVAL;
> > > > >
> > > > > /* If queue belongs to subordinate dev use its map */
> > > > > dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
> > > > >
> > > > > tc = netdev_txq_to_tc(dev, index);
> > > > > -   if (tc < 0) {
> > > > > -   ret = -EINVAL;
> > > > > -   goto err_rtnl_unlock;
> > > > > -   }
> > > > > +   if (tc < 0)
> > > > > +   return -EINVAL;
> > > > > }
> > > > >
> > > >
> > > > So if we store the num_tc and nr_ids in the dev_maps structure then we
> > > > could simplify this a bit by pulling the num_tc info out of the
> > > > dev_map and only asking the Tx queue for the tc in that case and
> > > > validating it against (tc <0 || num_tc <= tc) and returning an error
> > > > if either are true.
> > > >
> > > > This would also allow us to address the fact that the rxqs feature
> > > > doesn't support the subordinate devices as you could pull out the bit
> > > > above related to the sb_dev and instead call that prior to calling
> > > > xps_queue_show so that you are operating on the correct device map.
> >
> > It probably would be necessary to pass dev and index if we pull the
> > netdev_get_tx_queue()->sb_dev bit out and performed that before we
> > called the xps_queue_show function. Specifically as the subordinate
> > device wouldn't match up with the queue device so we would be better
> > off pulling it out first.
>
> While I agree moving the netdev_get_tx_queue()->sb_dev bit out of
> xps_queue_show seems like a good idea for consistency, I'm not sure
> it'll work: dev->num_tc is not only used to retrieve the number of tc
> but also as a condition on not being 0. We have things like:
>
>   // Always done with the original dev.
>   if (dev->num_tc) {
>
>   [...]
>
>   // Can be a subordinate dev.
>   tc = netdev_txq_to_tc(dev, index);
>   }
>
> And after moving num_tc in the map, we'll have checks like:
>
>   if (dev_maps->num_tc != dev->num_tc)
>   return -EINVAL;

We shouldn't. That defeats the whole point and you will never be able
to rely on the num_tc in the device to remain constant. If we are
moving the value to an RCU accessible attribute we should just be
using that value. We can only use that check if we are in an rtnl_lock
anyway and we won't need that if we are just displaying the value.

The checks should only be used to verify the tc of the queue is within
the bounds of the num_tc of the xps_map.

> I don't think the subordinate dev holds the same num_tc value as dev.
> What's your take on this?

So if I recall the num_tc for the subordinate device would be negative
since the subordinate devices start at -1 and just further decrease.
That is yet another reason why we shouldn't be looking at the num_tc
of the device.

> > > > > -   mask = bitmap_zalloc(nr_cpu_ids, GFP_KERNEL);
> > > > > -   if (!mask) {
> > > > > -   ret = -ENOMEM;
> > > > > -   goto err_rtnl_unlock;
> > > > > +   rcu_read_lock();
> > > > > +
> > > > > +   if (is_rxqs_map) {
> > > > > +   dev_maps = rcu_dereference(dev->xps_rxqs_map);
> > > > > +   nr_ids = dev->num_rx_queues;
> > > > > +   } else {
> > > > > +   dev_maps = rcu_dereference(dev->xps_cpus_map);
> > > > > +   nr_ids = nr_cpu_ids;
> > > > > +   if (num_possible_cpus() > 1)
> > > > > +   possible_mask = 
> > > > > cpumask_bits(cpu_possible_mask);
> > > > > }
> > > >
> >
> > I don't think we need the possible_mask check. That is mostly just an
> > optimization that was making use of an existing "for_each" loop macro.
> > If we are going to go through 0 through nr_ids then there is no need
> > for the possible_mask as we can just brute force our way through and
> > will not find CPU that aren't there since we couldn't have added

[PATCH v5 net-next 16/16] net: mark ndo_get_stats64 as being able to sleep

2021-01-08 Thread Vladimir Oltean

From: Vladimir Oltean 

Now that all callers have been converted to not use atomic context when
calling dev_get_stats, it is time to update the documentation and put a
notice in the function that it expects process context.

Signed-off-by: Vladimir Oltean 
---
Changes in v5:
None.

Changes in v4:
None.

Changes in v3:
None.

Changes in v2:
Updated the documentation.

 Documentation/networking/netdevices.rst | 8 ++--
 Documentation/networking/statistics.rst | 9 -
 net/core/dev.c  | 2 ++
 3 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/Documentation/networking/netdevices.rst 
b/Documentation/networking/netdevices.rst
index 5a85fcc80c76..944599722c76 100644
--- a/Documentation/networking/netdevices.rst
+++ b/Documentation/networking/netdevices.rst
@@ -64,8 +64,12 @@ ndo_do_ioctl:
Context: process
 
 ndo_get_stats:
-   Synchronization: dev_base_lock rwlock.
-   Context: nominally process, but don't sleep inside an rwlock
+   Synchronization:
+   none. netif_lists_lock(net) might be held, but not guaranteed.
+   It is illegal to hold rtnl_lock() in this method, since it will
+   cause a lock inversion with netif_lists_lock and a deadlock.
+   Context:
+   process
 
 ndo_start_xmit:
Synchronization: __netif_tx_lock spinlock.
diff --git a/Documentation/networking/statistics.rst 
b/Documentation/networking/statistics.rst
index 234abedc29b2..ad3e353df0dd 100644
--- a/Documentation/networking/statistics.rst
+++ b/Documentation/networking/statistics.rst
@@ -155,11 +155,10 @@ Drivers must ensure best possible compliance with
 Please note for example that detailed error statistics must be
 added into the general `rx_error` / `tx_error` counters.
 
-The `.ndo_get_stats64` callback can not sleep because of accesses
-via `/proc/net/dev`. If driver may sleep when retrieving the statistics
-from the device it should do so periodically asynchronously and only return
-a recent copy from `.ndo_get_stats64`. Ethtool interrupt coalescing interface
-allows setting the frequency of refreshing statistics, if needed.
+Drivers may sleep when retrieving the statistics from the device, or they might
+read the counters periodically and only return in `.ndo_get_stats64` a recent
+copy collected asynchronously. In the latter case, the ethtool interrupt
+coalescing interface allows setting the frequency of refreshing statistics.
 
 Retrieving ethtool statistics is a multi-syscall process, drivers are advised
 to keep the number of statistics constant to avoid race conditions with
diff --git a/net/core/dev.c b/net/core/dev.c
index 30facac95d5e..afd0e226efd4 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -10409,6 +10409,8 @@ int __must_check dev_get_stats(struct net_device *dev,
const struct net_device_ops *ops = dev->netdev_ops;
int err = 0;
 
+   might_sleep();
+
if (ops->ndo_get_stats64) {
memset(storage, 0, sizeof(*storage));
err = ops->ndo_get_stats64(dev, storage);
-- 
2.25.1

[PATCH v5 net-next 15/16] net: bonding: ensure .ndo_get_stats64 can sleep

2021-01-08 Thread Vladimir Oltean

From: Vladimir Oltean 

There is an effort to convert .ndo_get_stats64 to sleepable context, and
for that to work, we need to prevent callers of dev_get_stats from using
atomic locking.

The bonding driver retrieves its statistics recursively from its lower
interfaces, with additional care to only count packets sent/received
while those lowers were actually enslaved to the bond - see commit
5f0c5f73e5ef ("bonding: make global bonding stats more reliable").

Since commit 87163ef9cda7 ("bonding: remove last users of bond->lock and
bond->lock itself"), the bonding driver uses the following protection
for its array of slaves: RCU for readers and rtnl_mutex for updaters.
This is not great because there is another movement [ somehow
simultaneous with the one to make .ndo_get_stats64 sleepable ] to reduce
driver usage of rtnl_mutex. This makes sense, because the rtnl_mutex has
become a very contended resource.

The aforementioned commit removed an interesting comment:

/* [...] we can't hold bond->lock [...] because we'll
 * deadlock. The only solution is to rely on the fact
 * that we're under rtnl_lock here, and the slaves
 * list won't change. This doesn't solve the problem
 * of setting the slave's MTU while it is
 * transmitting, but the assumption is that the base
 * driver can handle that.
 *
 * TODO: figure out a way to safely iterate the slaves
 * list, but without holding a lock around the actual
 * call to the base driver.
 */

The above summarizes pretty well the challenges we have with nested
bonding interfaces (bond over bond over bond over...), which need to be
addressed by a better locking scheme that also not relies on the bloated
rtnl_mutex for the update side of the slaves array. That issue is not
addressed here, but there is a way around it.

To solve the nesting problem, the simple way is to not hold any locks
when recursing into the slave netdev operation. We can "cheat" and use
dev_hold to take a reference on the slave net_device, which is enough to
ensure that netdev_wait_allrefs() waits until we finish, and the kernel
won't fault.

However, the slave structure might no longer be valid, just its
associated net_device. So we need to do some more work to ensure that
the slave exists after we took the statistics, and if it still does,
reapply the logic from Andy's commit 5f0c5f73e5ef.

Tested using the following two scripts running in parallel:

#!/bin/bash

while :; do
ip link del bond0
ip link del bond1
ip link add bond0 type bond mode 802.3ad
ip link add bond1 type bond mode 802.3ad
ip link set sw0p1 down && ip link set sw0p1 master bond0 && ip 
link set sw0p1 up
ip link set sw0p2 down && ip link set sw0p2 master bond0 && ip 
link set sw0p2 up
ip link set sw0p3 down && ip link set sw0p3 master bond0 && ip 
link set sw0p3 up
ip link set bond0 down && ip link set bond0 master bond1 && ip 
link set bond0 up
ip link set sw1p1 down && ip link set sw1p1 master bond1 && ip 
link set sw1p1 up
ip link set bond1 up
ip -s -s link show
cat /sys/class/net/bond1/statistics/*
done

#!/bin/bash

while :; do
echo spi2.0 > /sys/bus/spi/drivers/sja1105/unbind
echo spi2.0 > /sys/bus/spi/drivers/sja1105/bind
sleep 30
done

where the sja1105 driver was explicitly modified for the purpose of this
test to have a msleep(500) in its .ndo_get_stats64 method, to catch some
more potential races.

Signed-off-by: Vladimir Oltean 
---
Changes in v5:
- Use rcu_read_lock() and do not change the locking architecture of
  the driver.
- Gave some details on my testing procedure.

Changes in v4:
Now there is code to propagate errors.

Changes in v3:
- Added kfree(dev_stats);
- Removed the now useless stats_lock (bond->bond_stats and
  slave->slave_stats are now protected by bond->slaves_lock)

Changes in v2:
Switched to the new scheme of holding just a refcnt to the slave
interfaces while recursing with dev_get_stats.

 drivers/net/bonding/bond_main.c | 113 +++-
 include/net/bonding.h   |  54 +++
 2 files changed, 108 insertions(+), 59 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 2352ef64486b..77c3a40adbf4 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -3705,80 +3705,75 @@ static void bond_fold_stats(struct rtnl_link_stats64 
*_res,
}
 }
 
-#ifdef CONFIG_LOCKDEP
-static int bond_get_lowest_level_rcu(struct net_device *dev)
-{
-   struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
-   struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
-   int cur = 0, max = 0;
-
-

[PATCH v5 net-next 13/16] net: openvswitch: ensure dev_get_stats can sleep

2021-01-08 Thread Vladimir Oltean

From: Vladimir Oltean 

There is an effort to convert .ndo_get_stats64 to sleepable context, and
for that to work, we need to prevent callers of dev_get_stats from using
atomic locking.

The OVS vport driver calls ovs_vport_get_stats from
ovs_vport_cmd_fill_info, a function with 7 callers: 5 under ovs_lock() and
2 under rcu_read_lock(). The RCU-protected callers are the doit and
dumpit callbacks of the OVS_VPORT_CMD_GET genetlink event. Things have
been this way ever since the OVS introduction in commit ccb1352e76cf
("net: Add Open vSwitch kernel components."), probably so that
OVS_PORT_CMD_GET doesn't have to serialize with all the others through
ovs_mutex. Sadly, now they do have to, otherwise we don't have
protection while accessing the datapath and vport structures.

Convert all callers of ovs_vport_cmd_fill_info to assume ovs_mutex
protection. This means that we can get rid of the gfp argument, since
all callers are now sleepable, we can just use GFP_KERNEL for memory
allocation.

Signed-off-by: Vladimir Oltean 
---
Changes in v5:
None.

Changes in v4:
Patch is new.

 net/openvswitch/datapath.c | 38 ++
 net/openvswitch/vport.c|  2 +-
 2 files changed, 19 insertions(+), 21 deletions(-)

diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 160b8dc453da..318caa8f12c2 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -1957,10 +1957,10 @@ static struct genl_family dp_datapath_genl_family 
__ro_after_init = {
.module = THIS_MODULE,
 };
 
-/* Called with ovs_mutex or RCU read lock. */
+/* Called with ovs_mutex */
 static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb,
   struct net *net, u32 portid, u32 seq,
-  u32 flags, u8 cmd, gfp_t gfp)
+  u32 flags, u8 cmd)
 {
struct ovs_header *ovs_header;
struct ovs_vport_stats vport_stats;
@@ -1981,7 +1981,7 @@ static int ovs_vport_cmd_fill_info(struct vport *vport, 
struct sk_buff *skb,
goto nla_put_failure;
 
if (!net_eq(net, dev_net(vport->dev))) {
-   int id = peernet2id_alloc(net, dev_net(vport->dev), gfp);
+   int id = peernet2id_alloc(net, dev_net(vport->dev), GFP_KERNEL);
 
if (nla_put_s32(skb, OVS_VPORT_ATTR_NETNSID, id))
goto nla_put_failure;
@@ -2029,8 +2029,7 @@ struct sk_buff *ovs_vport_cmd_build_info(struct vport 
*vport, struct net *net,
if (!skb)
return ERR_PTR(-ENOMEM);
 
-   retval = ovs_vport_cmd_fill_info(vport, skb, net, portid, seq, 0, cmd,
-GFP_KERNEL);
+   retval = ovs_vport_cmd_fill_info(vport, skb, net, portid, seq, 0, cmd);
BUG_ON(retval == -EMSGSIZE);
if (retval)
return ERR_PTR(retval);
@@ -2038,7 +2037,7 @@ struct sk_buff *ovs_vport_cmd_build_info(struct vport 
*vport, struct net *net,
return skb;
 }
 
-/* Called with ovs_mutex or RCU read lock. */
+/* Called with ovs_mutex */
 static struct vport *lookup_vport(struct net *net,
  const struct ovs_header *ovs_header,
  struct nlattr *a[OVS_VPORT_ATTR_MAX + 1])
@@ -2177,7 +2176,7 @@ static int ovs_vport_cmd_new(struct sk_buff *skb, struct 
genl_info *info)
 
err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info),
  info->snd_portid, info->snd_seq, 0,
- OVS_VPORT_CMD_NEW, GFP_KERNEL);
+ OVS_VPORT_CMD_NEW);
BUG_ON(err == -EMSGSIZE);
if (err)
goto exit_unlock_free;
@@ -2240,7 +2239,7 @@ static int ovs_vport_cmd_set(struct sk_buff *skb, struct 
genl_info *info)
 
err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info),
  info->snd_portid, info->snd_seq, 0,
- OVS_VPORT_CMD_SET, GFP_KERNEL);
+ OVS_VPORT_CMD_SET);
BUG_ON(err == -EMSGSIZE);
if (err)
goto exit_unlock_free;
@@ -2282,7 +2281,7 @@ static int ovs_vport_cmd_del(struct sk_buff *skb, struct 
genl_info *info)
 
err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info),
  info->snd_portid, info->snd_seq, 0,
- OVS_VPORT_CMD_DEL, GFP_KERNEL);
+ OVS_VPORT_CMD_DEL);
BUG_ON(err == -EMSGSIZE);
if (err)
goto exit_unlock_free;
@@ -2324,23 +2323,23 @@ static int ovs_vport_cmd_get(struct sk_buff *skb, 
struct genl_info *info)
if (!reply)
return -ENOMEM;
 
-   rcu_read_lock();
+   ovs_lock();
vport = lookup_vport(sock_net(skb->sk), ovs_header, a);
err = PTR_ERR(v

[PATCH v5 net-next 09/16] scsi: fcoe: propagate errors from dev_get_stats

2021-01-08 Thread Vladimir Oltean

From: Vladimir Oltean 

The FCoE callback for the Link Error Status Block retrieves the FCS
error count using dev_get_stats. This function can now return errors.
Propagate these all the way to the sysfs device attributes.

Signed-off-by: Vladimir Oltean 
---
Changes in v5:
None.

Changes in v4:
Patch is new (Eric's suggestion).

 drivers/scsi/fcoe/fcoe_sysfs.c |  9 +++--
 drivers/scsi/fcoe/fcoe_transport.c | 24 +++-
 drivers/scsi/libfc/fc_rport.c  |  5 -
 include/scsi/fcoe_sysfs.h  | 12 ++--
 include/scsi/libfc.h   |  2 +-
 include/scsi/libfcoe.h |  8 
 6 files changed, 37 insertions(+), 23 deletions(-)

diff --git a/drivers/scsi/fcoe/fcoe_sysfs.c b/drivers/scsi/fcoe/fcoe_sysfs.c
index af658aa38fed..a197e66ffa8a 100644
--- a/drivers/scsi/fcoe/fcoe_sysfs.c
+++ b/drivers/scsi/fcoe/fcoe_sysfs.c
@@ -139,8 +139,13 @@ static ssize_t show_fcoe_ctlr_device_##field(struct device 
*dev, \
char *buf)  \
 {  \
struct fcoe_ctlr_device *ctlr = dev_to_ctlr(dev);   \
-   if (ctlr->f->get_fcoe_ctlr_##field) \
-   ctlr->f->get_fcoe_ctlr_##field(ctlr);   \
+   int err;\
+   \
+   if (ctlr->f->get_fcoe_ctlr_##field) {   \
+   err = ctlr->f->get_fcoe_ctlr_##field(ctlr); \
+   if (err)\
+   return err; \
+   }   \
return snprintf(buf, sz, format_string, \
cast fcoe_ctlr_##field(ctlr));  \
 }
diff --git a/drivers/scsi/fcoe/fcoe_transport.c 
b/drivers/scsi/fcoe/fcoe_transport.c
index 213ee9efb044..5d19650e9bc1 100644
--- a/drivers/scsi/fcoe/fcoe_transport.c
+++ b/drivers/scsi/fcoe/fcoe_transport.c
@@ -166,15 +166,16 @@ EXPORT_SYMBOL_GPL(fcoe_link_speed_update);
  * Note, the Link Error Status Block (LESB) for FCoE is defined in FC-BB-6
  * Clause 7.11 in v1.04.
  */
-void __fcoe_get_lesb(struct fc_lport *lport,
-struct fc_els_lesb *fc_lesb,
-struct net_device *netdev)
+int __fcoe_get_lesb(struct fc_lport *lport,
+   struct fc_els_lesb *fc_lesb,
+   struct net_device *netdev)
 {
struct rtnl_link_stats64 dev_stats;
unsigned int cpu;
u32 lfc, vlfc, mdac;
struct fc_stats *stats;
struct fcoe_fc_els_lesb *lesb;
+   int err;
 
lfc = 0;
vlfc = 0;
@@ -190,8 +191,14 @@ void __fcoe_get_lesb(struct fc_lport *lport,
lesb->lesb_link_fail = htonl(lfc);
lesb->lesb_vlink_fail = htonl(vlfc);
lesb->lesb_miss_fka = htonl(mdac);
-   dev_get_stats(netdev, &dev_stats);
+
+   err = dev_get_stats(netdev, &dev_stats);
+   if (err)
+   return err;
+
lesb->lesb_fcs_error = htonl(dev_stats.rx_crc_errors);
+
+   return 0;
 }
 EXPORT_SYMBOL_GPL(__fcoe_get_lesb);
 
@@ -200,12 +207,11 @@ EXPORT_SYMBOL_GPL(__fcoe_get_lesb);
  * @lport: the local port
  * @fc_lesb: the link error status block
  */
-void fcoe_get_lesb(struct fc_lport *lport,
-struct fc_els_lesb *fc_lesb)
+int fcoe_get_lesb(struct fc_lport *lport, struct fc_els_lesb *fc_lesb)
 {
struct net_device *netdev = fcoe_get_netdev(lport);
 
-   __fcoe_get_lesb(lport, fc_lesb, netdev);
+   return __fcoe_get_lesb(lport, fc_lesb, netdev);
 }
 EXPORT_SYMBOL_GPL(fcoe_get_lesb);
 
@@ -215,14 +221,14 @@ EXPORT_SYMBOL_GPL(fcoe_get_lesb);
  * @ctlr_dev: The given fcoe controller device
  *
  */
-void fcoe_ctlr_get_lesb(struct fcoe_ctlr_device *ctlr_dev)
+int fcoe_ctlr_get_lesb(struct fcoe_ctlr_device *ctlr_dev)
 {
struct fcoe_ctlr *fip = fcoe_ctlr_device_priv(ctlr_dev);
struct net_device *netdev = fcoe_get_netdev(fip->lp);
struct fc_els_lesb *fc_lesb;
 
fc_lesb = (struct fc_els_lesb *)(&ctlr_dev->lesb);
-   __fcoe_get_lesb(fip->lp, fc_lesb, netdev);
+   return __fcoe_get_lesb(fip->lp, fc_lesb, netdev);
 }
 EXPORT_SYMBOL_GPL(fcoe_ctlr_get_lesb);
 
diff --git a/drivers/scsi/libfc/fc_rport.c b/drivers/scsi/libfc/fc_rport.c
index 56003208d2e7..cb299fef7a78 100644
--- a/drivers/scsi/libfc/fc_rport.c
+++ b/drivers/scsi/libfc/fc_rport.c
@@ -1633,6 +1633,7 @@ static void fc_rport_recv_rls_req(struct fc_rport_priv 
*rdata,
struct fc_els_lesb *lesb;
struct fc_seq_els_data rjt_data;
struct fc_host_statistics *hst;
+   int err;
 
lockdep_assert_held(&rdata->rp_mutex);
 
@@ -1659,7 +1660,9 @@ static void fc_rport_recv

[PATCH v5 net-next 08/16] net: allow ndo_get_stats64 to return an int error code

2021-01-08 Thread Vladimir Oltean

From: Vladimir Oltean 

Some drivers need to do special tricks to comply with the new policy of
ndo_get_stats64 being sleepable. For example, the bonding driver, which
derives its stats from its lower interfaces, must recurse with
dev_get_stats into its lowers with no locks held. But for that to work,
it needs to dynamically allocate some memory for a refcounted copy of
its array of slave interfaces (because recursing unlocked means that the
original one is subject to disappearing). And since memory allocation
can fail under pressure, we should not let it go unnoticed, but instead
propagate the error code.

This patch converts all implementations of .ndo_get_stats64 to return
int, and propagates that to the dev_get_stats calling site. Error
checking will be done in further patches.

Signed-off-by: Vladimir Oltean 
---
Changes in v5:
None.

Changes in v4:
Patch is new (Eric's suggestion).

 drivers/infiniband/hw/hfi1/vnic_main.c   |  6 --
 drivers/infiniband/ulp/ipoib/ipoib_main.c|  9 ++---
 drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c|  9 ++---
 drivers/net/bonding/bond_main.c  | 11 +++
 drivers/net/dummy.c  |  6 --
 drivers/net/ethernet/alacritech/slicoss.c|  6 --
 drivers/net/ethernet/amazon/ena/ena_netdev.c |  8 +---
 drivers/net/ethernet/amd/xgbe/xgbe-drv.c |  6 --
 drivers/net/ethernet/apm/xgene-v2/main.c |  6 --
 drivers/net/ethernet/apm/xgene/xgene_enet_main.c |  7 ---
 drivers/net/ethernet/atheros/alx/main.c  |  6 --
 drivers/net/ethernet/broadcom/b44.c  |  6 --
 drivers/net/ethernet/broadcom/bcmsysport.c   |  6 --
 drivers/net/ethernet/broadcom/bnx2.c |  5 +++--
 drivers/net/ethernet/broadcom/bnxt/bnxt.c|  6 --
 drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c|  4 +++-
 drivers/net/ethernet/broadcom/tg3.c  |  8 +---
 drivers/net/ethernet/brocade/bna/bnad.c  |  4 +++-
 drivers/net/ethernet/calxeda/xgmac.c |  4 +++-
 drivers/net/ethernet/cavium/liquidio/lio_main.c  |  6 --
 drivers/net/ethernet/cavium/liquidio/lio_vf_main.c   |  6 --
 drivers/net/ethernet/cavium/liquidio/lio_vf_rep.c|  8 +---
 drivers/net/ethernet/cavium/thunder/nicvf_main.c |  5 +++--
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c  |  8 +---
 drivers/net/ethernet/cisco/enic/enic_main.c  |  8 +---
 drivers/net/ethernet/cortina/gemini.c|  6 --
 drivers/net/ethernet/ec_bhf.c|  4 +++-
 drivers/net/ethernet/emulex/benet/be_main.c  |  6 --
 drivers/net/ethernet/freescale/dpaa/dpaa_eth.c   |  6 --
 drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c |  6 --
 drivers/net/ethernet/google/gve/gve_main.c   |  4 +++-
 drivers/net/ethernet/hisilicon/hns/hns_enet.c|  6 --
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.c  |  8 +---
 drivers/net/ethernet/huawei/hinic/hinic_main.c   |  6 --
 drivers/net/ethernet/ibm/ehea/ehea_main.c|  6 --
 drivers/net/ethernet/intel/e1000e/e1000.h|  4 ++--
 drivers/net/ethernet/intel/e1000e/netdev.c   |  6 --
 drivers/net/ethernet/intel/fm10k/fm10k_netdev.c  |  6 --
 drivers/net/ethernet/intel/i40e/i40e_main.c  | 10 ++
 drivers/net/ethernet/intel/ice/ice_main.c|  6 --
 drivers/net/ethernet/intel/igb/igb_main.c| 10 ++
 drivers/net/ethernet/intel/igc/igc_main.c|  6 --
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c|  6 --
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c|  6 --
 drivers/net/ethernet/marvell/mvneta.c|  4 +++-
 drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c  |  4 +++-
 .../net/ethernet/marvell/octeontx2/nic/otx2_common.c |  6 --
 .../net/ethernet/marvell/octeontx2/nic/otx2_common.h |  4 ++--
 .../net/ethernet/marvell/prestera/prestera_main.c|  6 --
 drivers/net/ethernet/marvell/sky2.c  |  6 --
 drivers/net/ethernet/mediatek/mtk_eth_soc.c  |  6 --
 drivers/net/ethernet/mediatek/mtk_star_emac.c|  6 --
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c   |  4 +++-
 drivers/net/ethernet/mellanox/mlx5/core/en.h |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c|  4 +++-
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c |  4 +++-
 .../net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c|  4 +++-
 .../net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h|  2 +-
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c   |  4 +++-
 drivers/net/ethernet/mellanox/mlxsw/switchx2.c   |  4 +++-
 drivers/net/ethernet/microchip/lan743x_main.c|  6 --
 drivers/net/ethernet/mscc/ocelot_net.c   |  6 --
 drivers/net/

[PATCH v5 net-next 14/16] net: net_failover: ensure .ndo_get_stats64 can sleep

2021-01-08 Thread Vladimir Oltean

From: Vladimir Oltean 

The failover framework sets up a virtio_net interface [ when it has the
VIRTIO_NET_F_STANDBY feature ] and a VF interface, having the same MAC
address, in a standby/active relationship. When the active VF is
unplugged, the standby virtio_net temporarily kicks in.

The failover framework registers a common upper for the active and the
standby interface, which is what the application layer uses. This is
similar to bonding/team. The statistics of the upper interface are the
sum of the statistics of the active and of the standby interface.

There is an effort to convert .ndo_get_stats64 to sleepable context, and
for that to work, we need to prevent callers of dev_get_stats from using
atomic locking. The failover driver needs protection via an RCU
read-side critical section to access the standby and the active
interface. This has two features:
- It is atomic: this needs to change.
- It is reentrant: this is ok, because generally speaking, dev_get_stats
  is recursive, and taking global locks is a bad thing from a recursive
  context.

The existing logic can be rehashed just a little bit such that the
recursive dev_get_stats call will not be under any lock. We can achieve
that by "cheating" a little bit and use dev_hold() to take a reference
on the active and backup interfaces, and netdev_wait_allrefs() will just
have to wait until dev_get_stats() finishes.

Signed-off-by: Vladimir Oltean 
---
Changes in v5:
Use rcu_read_lock() and do not change the locking architecture of the
driver.

Changes in v4:
Now there is code to propagate errors.

Changes in v3:
None.

Changes in v2:
Switched to the new scheme of holding just a refcnt to the slave
interfaces while recursing with dev_get_stats.

 drivers/net/net_failover.c | 64 --
 1 file changed, 47 insertions(+), 17 deletions(-)

diff --git a/drivers/net/net_failover.c b/drivers/net/net_failover.c
index 7f70555e68d1..3e8a4046c748 100644
--- a/drivers/net/net_failover.c
+++ b/drivers/net/net_failover.c
@@ -183,38 +183,64 @@ static int net_failover_get_stats(struct net_device *dev,
  struct rtnl_link_stats64 *stats)
 {
struct net_failover_info *nfo_info = netdev_priv(dev);
-   struct rtnl_link_stats64 temp;
-   struct net_device *slave_dev;
+   struct rtnl_link_stats64 primary_stats;
+   struct rtnl_link_stats64 standby_stats;
+   struct net_device *primary_dev;
+   struct net_device *standby_dev;
int err = 0;
 
-   spin_lock(&nfo_info->stats_lock);
-   memcpy(stats, &nfo_info->failover_stats, sizeof(*stats));
-
rcu_read_lock();
 
-   slave_dev = rcu_dereference(nfo_info->primary_dev);
-   if (slave_dev) {
-   err = dev_get_stats(slave_dev, &temp);
+   primary_dev = rcu_dereference(nfo_info->primary_dev);
+   if (primary_dev)
+   dev_hold(primary_dev);
+
+   standby_dev = rcu_dereference(nfo_info->standby_dev);
+   if (standby_dev)
+   dev_hold(standby_dev);
+
+   rcu_read_unlock();
+
+   /* Don't hold rcu_read_lock while calling dev_get_stats, just a
+* reference to ensure they won't get unregistered.
+*/
+   if (primary_dev) {
+   err = dev_get_stats(primary_dev, &primary_stats);
if (err)
goto out;
-   net_failover_fold_stats(stats, &temp, &nfo_info->primary_stats);
-   memcpy(&nfo_info->primary_stats, &temp, sizeof(temp));
}
 
-   slave_dev = rcu_dereference(nfo_info->standby_dev);
-   if (slave_dev) {
-   err = dev_get_stats(slave_dev, &temp);
+   if (standby_dev) {
+   err = dev_get_stats(standby_dev, &standby_stats);
if (err)
goto out;
-   net_failover_fold_stats(stats, &temp, &nfo_info->standby_stats);
-   memcpy(&nfo_info->standby_stats, &temp, sizeof(temp));
}
 
-out:
-   rcu_read_unlock();
+   spin_lock(&nfo_info->stats_lock);
+
+   memcpy(stats, &nfo_info->failover_stats, sizeof(*stats));
+
+   if (primary_dev) {
+   net_failover_fold_stats(stats, &primary_stats,
+   &nfo_info->primary_stats);
+   memcpy(&nfo_info->primary_stats, &primary_stats,
+  sizeof(primary_stats));
+   }
+   if (standby_dev) {
+   net_failover_fold_stats(stats, &standby_stats,
+   &nfo_info->standby_stats);
+   memcpy(&nfo_info->standby_stats, &standby_stats,
+  sizeof(standby_stats));
+   }
 
memcpy(&nfo_info->failover_stats, stats, sizeof(*stats));
+
spin_unlock(&nfo_info->stats_lock);
+out:
+   if (primary_dev)
+   dev_put(primary_dev);
+   if (standby_dev)
+   dev_put(standby_dev);
 
return err;
 }
@@ -728,6 +754,7 @@ stati

[PATCH v5 net-next 02/16] net: introduce a mutex for the netns interface lists

2021-01-08 Thread Vladimir Oltean

From: Vladimir Oltean 

Currently, any writer that wants to alter the lists of network
interfaces (either the plain list net->dev_base_head, or the hash tables
net->dev_index_head and net->dev_name_head) can keep other writers at
bay using the RTNL mutex.

However, the RTNL mutex has become a very contended resource over the
years, so there is a movement to do finer grained locking.

This patch adds one more way for writers to the network interface lists
to serialize themselves. We assume that all writers to the network
interface lists are easily identifiable because the write side of
dev_base_lock also needs to be held (note that some instances of that
were deliberately skipped, since they only dealt with protecting the
operational state of the netdev).

Holding the RTNL mutex is now optional for new code that alters the
lists, since all relevant writers were made to also hold the new lock.

Signed-off-by: Vladimir Oltean 
---
Changes in v5:
None.

Changes in v4:
None.

Changes in v3:
None.

 include/linux/netdevice.h   | 10 +
 include/net/net_namespace.h |  6 +
 net/core/dev.c  | 44 +
 3 files changed, 46 insertions(+), 14 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 1ec3ac5d5bbf..8aae2386bd37 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4376,6 +4376,16 @@ static inline void netif_addr_unlock_bh(struct 
net_device *dev)
spin_unlock_bh(&dev->addr_list_lock);
 }
 
+static inline void netif_lists_lock(struct net *net)
+{
+   mutex_lock(&net->netif_lists_lock);
+}
+
+static inline void netif_lists_unlock(struct net *net)
+{
+   mutex_unlock(&net->netif_lists_lock);
+}
+
 /*
  * dev_addrs walker. Should be used only for read access. Call with
  * rcu_read_lock held.
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 29567875f428..cac64c3c7ce0 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -183,6 +183,12 @@ struct net {
struct sock *crypto_nlsk;
 #endif
struct sock *diag_nlsk;
+
+   /* Serializes writers to @dev_base_head, @dev_name_head and
+* @dev_index_head. It does _not_ protect the netif adjacency lists
+* (struct net_device::adj_list).
+*/
+   struct mutexnetif_lists_lock;
 } __randomize_layout;
 
 #include 
diff --git a/net/core/dev.c b/net/core/dev.c
index 8e02240bb11c..53c12f92025c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -175,13 +175,16 @@ static struct napi_struct *napi_by_id(unsigned int 
napi_id);
  *
  * Pure readers should hold rcu_read_lock() which should protect them against
  * concurrent changes to the interface lists made by the writers. Pure writers
- * must serialize by holding the RTNL mutex while they loop through the list
- * and make changes to it.
+ * must serialize by holding the @net->netif_lists_lock mutex while they loop
+ * through the list and make changes to it.
+ *
+ * It is possible to hold the RTNL mutex for serializing the writers too, but
+ * this should be avoided in new code due to lock contention.
  *
  * It is also possible to hold the global rwlock_t @dev_base_lock for
  * protection (holding its read side as an alternative to rcu_read_lock, and
- * its write side as an alternative to the RTNL mutex), however this should not
- * be done in new code, since it is deprecated and pending removal.
+ * its write side as an alternative to @net->netif_lists_lock), however this
+ * should not be done in new code, since it is deprecated and pending removal.
  *
  * One other role of @dev_base_lock is to protect against changes in the
  * operational state of a network interface.
@@ -360,12 +363,14 @@ static void list_netdevice(struct net_device *dev)
 
ASSERT_RTNL();
 
+   netif_lists_lock(net);
write_lock_bh(&dev_base_lock);
list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
netdev_name_node_add(net, dev->name_node);
hlist_add_head_rcu(&dev->index_hlist,
   dev_index_hash(net, dev->ifindex));
write_unlock_bh(&dev_base_lock);
+   netif_lists_unlock(net);
 
dev_base_seq_inc(net);
 }
@@ -375,16 +380,20 @@ static void list_netdevice(struct net_device *dev)
  */
 static void unlist_netdevice(struct net_device *dev)
 {
+   struct net *net = dev_net(dev);
+
ASSERT_RTNL();
 
/* Unlink dev from the device chain */
+   netif_lists_lock(net);
write_lock_bh(&dev_base_lock);
list_del_rcu(&dev->dev_list);
netdev_name_node_del(dev->name_node);
hlist_del_rcu(&dev->index_hlist);
write_unlock_bh(&dev_base_lock);
+   netif_lists_unlock(net);
 
-   dev_base_seq_inc(dev_net(dev));
+   dev_base_seq_inc(net);
 }
 
 /*
@@ -850,11 +859,11 @@ EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
  * @net: the applicable net namespace
  * @name:

[PATCH v5 net-next 05/16] s390/appldata_net_sum: hold the netdev lists lock when retrieving device statistics

2021-01-08 Thread Vladimir Oltean

From: Vladimir Oltean 

In the effort of making .ndo_get_stats64 be able to sleep, we need to
ensure the callers of dev_get_stats do not use atomic context.

In the case of the appldata driver, an RCU read-side critical section is
used to ensure the integrity of the list of network interfaces, because
the driver iterates through all net devices in the netns to aggregate
statistics. We still need some protection against an interface
registering or deregistering, and the writer-side lock, the netns's
mutex, is fine for that, because it offers sleepable context.

The ops->callback function is called from under appldata_ops_mutex
protection, so this is proof that the context is sleepable and holding
a mutex is therefore fine.

Cc: Heiko Carstens 
Cc: Vasily Gorbik 
Cc: linux-s...@vger.kernel.org
Signed-off-by: Vladimir Oltean 
---
Changes in v5:
None.

Changes in v4:
None.

Changes in v3:
None.

Changes in v2:
None.

 arch/s390/appldata/appldata_net_sum.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/arch/s390/appldata/appldata_net_sum.c 
b/arch/s390/appldata/appldata_net_sum.c
index 59c282ca002f..4db886980cba 100644
--- a/arch/s390/appldata/appldata_net_sum.c
+++ b/arch/s390/appldata/appldata_net_sum.c
@@ -78,8 +78,9 @@ static void appldata_get_net_sum_data(void *data)
tx_dropped = 0;
collisions = 0;
 
-   rcu_read_lock();
-   for_each_netdev_rcu(&init_net, dev) {
+   netif_lists_lock(&init_net);
+
+   for_each_netdev(&init_net, dev) {
const struct rtnl_link_stats64 *stats;
struct rtnl_link_stats64 temp;
 
@@ -95,7 +96,8 @@ static void appldata_get_net_sum_data(void *data)
collisions += stats->collisions;
i++;
}
-   rcu_read_unlock();
+
+   netif_lists_unlock(&init_net);
 
net_data->nr_interfaces = i;
net_data->rx_packets = rx_packets;
-- 
2.25.1

[PATCH v5 net-next 04/16] net: sysfs: don't hold dev_base_lock while retrieving device statistics

2021-01-08 Thread Vladimir Oltean

From: Vladimir Oltean 

In the effort of making .ndo_get_stats64 be able to sleep, we need to
ensure the callers of dev_get_stats do not use atomic context.

I need to preface this by saying that I have no idea why netstat_show
takes the dev_base_lock rwlock. Two things can be observed:
(a) it does not appear to be due to dev_isalive requiring it for some
reason, because broadcast_show() also calls dev_isalive() and has
had no problem existing since the beginning of git.
(b) the dev_get_stats function definitely does not need dev_base_lock
protection either. In fact, holding the dev_base_lock is the entire
problem here, because we want to make dev_get_stats sleepable, and
holding a rwlock gives us atomic context.

So since no protection seems to be necessary, just run unlocked while
retrieving the /sys/class/net/eth0/statistics/* values.

Cc: Christian Brauner 
Cc: Eric Dumazet 
Signed-off-by: Vladimir Oltean 
---
Changes in v5:
None.

Changes in v4:
None.

Changes in v3:
None.

Changes in v2:
None.

 net/core/net-sysfs.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index daf502c13d6d..8604183678fc 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -585,14 +585,13 @@ static ssize_t netstat_show(const struct device *d,
WARN_ON(offset > sizeof(struct rtnl_link_stats64) ||
offset % sizeof(u64) != 0);
 
-   read_lock(&dev_base_lock);
if (dev_isalive(dev)) {
struct rtnl_link_stats64 temp;
const struct rtnl_link_stats64 *stats = dev_get_stats(dev, 
&temp);
 
ret = sprintf(buf, fmt_u64, *(u64 *)(((u8 *)stats) + offset));
}
-   read_unlock(&dev_base_lock);
+
return ret;
 }
 
-- 
2.25.1

[PATCH v5 net-next 03/16] net: procfs: hold netif_lists_lock when retrieving device statistics

2021-01-08 Thread Vladimir Oltean

From: Vladimir Oltean 

In the effort of making .ndo_get_stats64 be able to sleep, we need to
ensure the callers of dev_get_stats do not use atomic context.

The /proc/net/dev file uses an RCU read-side critical section to ensure
the integrity of the list of network interfaces, because it iterates
through all net devices in the netns to show their statistics.

To offer the equivalent protection against an interface registering or
deregistering, while also remaining in sleepable context, we can use the
netns mutex for the interface lists.

Cc: Cong Wang 
Cc: Eric Dumazet 
Signed-off-by: Vladimir Oltean 
---
Changes in v5:
None.

Changes in v4:
None.

Changes in v3:
None.

Changes in v2:
None.

 net/core/net-procfs.c | 13 -
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c
index c714e6a9dad4..4784703c1e39 100644
--- a/net/core/net-procfs.c
+++ b/net/core/net-procfs.c
@@ -21,7 +21,7 @@ static inline struct net_device *dev_from_same_bucket(struct 
seq_file *seq, loff
unsigned int count = 0, offset = get_offset(*pos);
 
h = &net->dev_index_head[get_bucket(*pos)];
-   hlist_for_each_entry_rcu(dev, h, index_hlist) {
+   hlist_for_each_entry(dev, h, index_hlist) {
if (++count == offset)
return dev;
}
@@ -51,9 +51,11 @@ static inline struct net_device *dev_from_bucket(struct 
seq_file *seq, loff_t *p
  * in detail.
  */
 static void *dev_seq_start(struct seq_file *seq, loff_t *pos)
-   __acquires(RCU)
 {
-   rcu_read_lock();
+   struct net *net = seq_file_net(seq);
+
+   netif_lists_lock(net);
+
if (!*pos)
return SEQ_START_TOKEN;
 
@@ -70,9 +72,10 @@ static void *dev_seq_next(struct seq_file *seq, void *v, 
loff_t *pos)
 }
 
 static void dev_seq_stop(struct seq_file *seq, void *v)
-   __releases(RCU)
 {
-   rcu_read_unlock();
+   struct net *net = seq_file_net(seq);
+
+   netif_lists_unlock(net);
 }
 
 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
-- 
2.25.1

[PATCH v5 net-next 01/16] net: mark dev_base_lock for deprecation

2021-01-08 Thread Vladimir Oltean

From: Vladimir Oltean 

There is a movement to eliminate the usage of dev_base_lock, which
exists since as far as I could track the kernel history down (the
"7a2deb329241 Import changeset" commit from the bitkeeper branch).

The dev_base_lock approach has multiple issues:
- It is global and not per netns.
- Its meaning has mutated over the years and the vast majority of
  current users is using it to protect against changes of network device
  state, as opposed to changes of the interface list.
- It is overlapping with other protection mechanisms introduced in the
  meantime, which have higher performance for readers, like RCU
  introduced in 2009 by Eric Dumazet for kernel 2.6.

The old comment that I just deleted made this distinction: writers
protect only against readers by holding dev_base_lock, whereas they need
to protect against other writers by holding the RTNL mutex. This is
slightly confusing/incorrect, since a rwlock_t cannot have more than one
concurrently running writer, so that explanation does not justify why
the RTNL mutex would be necessary.

Instead, Stephen Hemminger makes this clarification here:
https://lore.kernel.org/netdev/20201129211230.4d704931@hermes.local/T/#u

| There are really two different domains being covered by locks here.
|
| The first area is change of state of network devices. This has traditionally
| been covered by RTNL because there are places that depend on coordinating
| state between multiple devices. RTNL is too big and held too long but getting
| rid of it is hard because there are corner cases (like state changes from 
userspace
| for VPN devices).
|
| The other area is code that wants to do read access to look at list of 
devices.
| These pure readers can/should be converted to RCU by now. Writers should hold 
RTNL.

This patch edits the comment for dev_base_lock, minimizing its role in
the protection of network interface lists, and clarifies that it is has
other purposes as well.

Signed-off-by: Vladimir Oltean 
---
Changes in v5:
None.

Changes in v4:
None.

Changes in v3:
None.

Changes in v2:
None.

 net/core/dev.c | 33 -
 1 file changed, 16 insertions(+), 17 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 7afbb642e203..8e02240bb11c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -169,23 +169,22 @@ static int call_netdevice_notifiers_extack(unsigned long 
val,
 static struct napi_struct *napi_by_id(unsigned int napi_id);
 
 /*
- * The @dev_base_head list is protected by @dev_base_lock and the rtnl
- * semaphore.
- *
- * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
- *
- * Writers must hold the rtnl semaphore while they loop through the
- * dev_base_head list, and hold dev_base_lock for writing when they do the
- * actual updates.  This allows pure readers to access the list even
- * while a writer is preparing to update it.
- *
- * To put it another way, dev_base_lock is held for writing only to
- * protect against pure readers; the rtnl semaphore provides the
- * protection against other writers.
- *
- * See, for example usages, register_netdevice() and
- * unregister_netdevice(), which must be called with the rtnl
- * semaphore held.
+ * The network interface list of a netns (@net->dev_base_head) and the hash
+ * tables per ifindex (@net->dev_index_head) and per name (@net->dev_name_head)
+ * are protected using the following rules:
+ *
+ * Pure readers should hold rcu_read_lock() which should protect them against
+ * concurrent changes to the interface lists made by the writers. Pure writers
+ * must serialize by holding the RTNL mutex while they loop through the list
+ * and make changes to it.
+ *
+ * It is also possible to hold the global rwlock_t @dev_base_lock for
+ * protection (holding its read side as an alternative to rcu_read_lock, and
+ * its write side as an alternative to the RTNL mutex), however this should not
+ * be done in new code, since it is deprecated and pending removal.
+ *
+ * One other role of @dev_base_lock is to protect against changes in the
+ * operational state of a network interface.
  */
 DEFINE_RWLOCK(dev_base_lock);
 EXPORT_SYMBOL(dev_base_lock);
-- 
2.25.1

[PATCH net 0/3] skb frag: kmap_atomic fixes

2021-01-08 Thread Willem de Bruijn

From: Willem de Bruijn 

skb frags may be backed by highmem and/or compound pages. Various
code calls kmap_atomic to safely access highmem pages. But this
needs additional care for compound pages. Fix a few issues:

patch 1 expect kmap mappings with CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP
patch 2 fixes kmap_atomic + compound page support in skb_seq_read
patch 3 fixes kmap_atomic + compound page support in esp

Willem de Bruijn (3):
  net: support kmap_local forced debugging in skb_frag_foreach
  net: compound page support in skb_seq_read
  esp: avoid unneeded kmap_atomic call

 include/linux/skbuff.h |  3 ++-
 net/core/skbuff.c  | 28 +++-
 net/ipv4/esp4.c|  7 +--
 net/ipv6/esp6.c|  7 +--
 4 files changed, 27 insertions(+), 18 deletions(-)

-- 
2.30.0.284.gd98b1dd5eaa7-goog

[PATCH net 1/3] net: support kmap_local forced debugging in skb_frag_foreach

2021-01-08 Thread Willem de Bruijn

From: Willem de Bruijn 

Skb frags may be backed by highmem and/or compound pages. Highmem
pages need kmap_atomic mappings to access. But kmap_atomic maps a
single page, not the entire compound page.

skb_foreach_page iterates over an skb frag, in one step in the common
case, page by page only if kmap_atomic must be called for each page.
The decision logic is captured in skb_frag_must_loop.

CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP extends kmap from highmem to all
pages, to increase code coverage.

Extend skb_frag_must_loop to this new condition.

Link: 
https://lore.kernel.org/linux-mm/20210106180132.41dc2...@gandalf.local.home/
Fixes: 0e91a0c6984c ("mm/highmem: Provide CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP")
Reported-by: Steven Rostedt (VMware) 
Signed-off-by: Linus Torvalds 
Signed-off-by: Willem de Bruijn 
Tested-by: Steven Rostedt (VMware) 
---
 include/linux/skbuff.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 333bcdc39635..c858adfb5a82 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -366,7 +366,7 @@ static inline void skb_frag_size_sub(skb_frag_t *frag, int 
delta)
 static inline bool skb_frag_must_loop(struct page *p)
 {
 #if defined(CONFIG_HIGHMEM)
-   if (PageHighMem(p))
+   if (IS_ENABLED(CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP) || PageHighMem(p))
return true;
 #endif
return false;
-- 
2.30.0.284.gd98b1dd5eaa7-goog

[PATCH net 2/3] net: compound page support in skb_seq_read

2021-01-08 Thread Willem de Bruijn

From: Willem de Bruijn 

skb_seq_read iterates over an skb, returning pointer and length of
the next data range with each call.

It relies on kmap_atomic to access highmem pages when needed.

An skb frag may be backed by a compound page, but kmap_atomic maps
only a single page. There are not enough kmap slots to always map all
pages concurrently.

Instead, if kmap_atomic is needed, iterate over each page.

As this increases the number of calls, avoid this unless needed.
The necessary condition is captured in skb_frag_must_loop.

I tried to make the change as obvious as possible. It should be easy
to verify that nothing changes if skb_frag_must_loop returns false.

Tested:
  On an x86 platform with
CONFIG_HIGHMEM=y
CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP=y
CONFIG_NETFILTER_XT_MATCH_STRING=y

  Run
ip link set dev lo mtu 1500
iptables -A OUTPUT -m string --string 'badstring' -algo bm -j ACCEPT
dd if=/dev/urandom of=in bs=1M count=20
nc -l -p 8000 > /dev/null &
nc -w 1 -q 0 localhost 8000 < in

Signed-off-by: Willem de Bruijn 
---
 include/linux/skbuff.h |  1 +
 net/core/skbuff.c  | 28 +++-
 2 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index c858adfb5a82..68ffd3f115c1 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1203,6 +1203,7 @@ struct skb_seq_state {
struct sk_buff  *root_skb;
struct sk_buff  *cur_skb;
__u8*frag_data;
+   __u16   frag_off;
 };
 
 void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from,
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index f62cae3f75d8..4acf45154b17 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3442,6 +3442,7 @@ void skb_prepare_seq_read(struct sk_buff *skb, unsigned 
int from,
st->root_skb = st->cur_skb = skb;
st->frag_idx = st->stepped_offset = 0;
st->frag_data = NULL;
+   st->frag_off = 0;
 }
 EXPORT_SYMBOL(skb_prepare_seq_read);
 
@@ -3496,14 +3497,27 @@ unsigned int skb_seq_read(unsigned int consumed, const 
u8 **data,
st->stepped_offset += skb_headlen(st->cur_skb);
 
while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) {
+   unsigned int pg_idx, pg_off, pg_sz;
+
frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx];
-   block_limit = skb_frag_size(frag) + st->stepped_offset;
 
+   pg_idx = 0;
+   pg_off = skb_frag_off(frag);
+   pg_sz = skb_frag_size(frag);
+
+   if (skb_frag_must_loop(skb_frag_page(frag))) {
+   pg_idx = (pg_off + st->frag_off) >> PAGE_SHIFT;
+   pg_off = offset_in_page(pg_off + st->frag_off);
+   pg_sz = min_t(unsigned int, pg_sz - st->frag_off,
+   PAGE_SIZE - pg_off);
+   }
+
+   block_limit = pg_sz + st->stepped_offset;
if (abs_offset < block_limit) {
if (!st->frag_data)
-   st->frag_data = 
kmap_atomic(skb_frag_page(frag));
+   st->frag_data = kmap_atomic(skb_frag_page(frag) 
+ pg_idx);
 
-   *data = (u8 *) st->frag_data + skb_frag_off(frag) +
+   *data = (u8 *)st->frag_data + pg_off +
(abs_offset - st->stepped_offset);
 
return block_limit - abs_offset;
@@ -3514,8 +3528,12 @@ unsigned int skb_seq_read(unsigned int consumed, const 
u8 **data,
st->frag_data = NULL;
}
 
-   st->frag_idx++;
-   st->stepped_offset += skb_frag_size(frag);
+   st->stepped_offset += pg_sz;
+   st->frag_off += pg_sz;
+   if (st->frag_off == skb_frag_size(frag)) {
+   st->frag_off = 0;
+   st->frag_idx++;
+   }
}
 
if (st->frag_data) {
-- 
2.30.0.284.gd98b1dd5eaa7-goog

[PATCH net 3/3] esp: avoid unneeded kmap_atomic call

2021-01-08 Thread Willem de Bruijn

From: Willem de Bruijn 

esp(6)_output_head uses skb_page_frag_refill to allocate a buffer for
the esp trailer.

It accesses the page with kmap_atomic to handle highmem. But
skb_page_frag_refill can return compound pages, of which
kmap_atomic only maps the first underlying page.

skb_page_frag_refill does not return highmem, because flag
__GFP_HIGHMEM is not set. ESP uses it in the same manner as TCP.
That also does not call kmap_atomic, but directly uses page_address,
in skb_copy_to_page_nocache. Do the same for ESP.

This issue has become easier to trigger with recent kmap local
debugging feature CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP.

Fixes: cac2661c53f3 ("esp4: Avoid skb_cow_data whenever possible")
Fixes: 03e2a30f6a27 ("esp6: Avoid skb_cow_data whenever possible")
Signed-off-by: Willem de Bruijn 
Cc: Steffen Klassert 
---
 net/ipv4/esp4.c | 7 +--
 net/ipv6/esp6.c | 7 +--
 2 files changed, 2 insertions(+), 12 deletions(-)

diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 8b07f3a4f2db..a3271ec3e162 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -443,7 +443,6 @@ static int esp_output_encap(struct xfrm_state *x, struct 
sk_buff *skb,
 int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info 
*esp)
 {
u8 *tail;
-   u8 *vaddr;
int nfrags;
int esph_offset;
struct page *page;
@@ -485,14 +484,10 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff 
*skb, struct esp_info *
page = pfrag->page;
get_page(page);
 
-   vaddr = kmap_atomic(page);
-
-   tail = vaddr + pfrag->offset;
+   tail = page_address(page) + pfrag->offset;
 
esp_output_fill_trailer(tail, esp->tfclen, esp->plen, 
esp->proto);
 
-   kunmap_atomic(vaddr);
-
nfrags = skb_shinfo(skb)->nr_frags;
 
__skb_fill_page_desc(skb, nfrags, page, pfrag->offset,
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index 52c2f063529f..2b804fcebcc6 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -478,7 +478,6 @@ static int esp6_output_encap(struct xfrm_state *x, struct 
sk_buff *skb,
 int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct 
esp_info *esp)
 {
u8 *tail;
-   u8 *vaddr;
int nfrags;
int esph_offset;
struct page *page;
@@ -519,14 +518,10 @@ int esp6_output_head(struct xfrm_state *x, struct sk_buff 
*skb, struct esp_info
page = pfrag->page;
get_page(page);
 
-   vaddr = kmap_atomic(page);
-
-   tail = vaddr + pfrag->offset;
+   tail = page_address(page) + pfrag->offset;
 
esp_output_fill_trailer(tail, esp->tfclen, esp->plen, 
esp->proto);
 
-   kunmap_atomic(vaddr);
-
nfrags = skb_shinfo(skb)->nr_frags;
 
__skb_fill_page_desc(skb, nfrags, page, pfrag->offset,
-- 
2.30.0.284.gd98b1dd5eaa7-goog

Re: [PATCH] bus: mhi: Add inbound buffers allocation flag

2021-01-08 Thread Manivannan Sadhasivam

On Fri, Jan 08, 2021 at 04:46:49PM +0100, Loic Poulain wrote:
> Hi Mani,
> 
> On Fri, 8 Jan 2021 at 16:30, Manivannan Sadhasivam <
> manivannan.sadhasi...@linaro.org> wrote:
> 
> > > > >   /* start channels */
> > > > > - rc = mhi_prepare_for_transfer(mhi_dev);
> > > > > + rc = mhi_prepare_for_transfer(mhi_dev,
> > MHI_CH_INBOUND_ALLOC_BUFS);
> > > >
> > > > Are you sure it requires auto queued channel?
> > > >
> > >
> > > This is how mhi-qrtr has been implemented, yes.
> > >
> >
> > skb is allocated in qrtr_endpoint_post(). Then how the host can pre
> > allocate the buffer here? Am I missing something?
> >
> 
> The initial MHI buffer is pre-allocated by the MHI core, so that mhi-qrtr
> only has to register a dl_callback without having to allocate and queue its
> own buffers. On dl_callback mhi-qrtr calls qrtr_endpoint_post(data) which
> allocates an skb and copy the MHI buffer (data) into that skb. When
> mhi-qrtr dl_callback finishes, the MHI buffer is re-queued automatically by
> the MHI core.
> 

Oops... My bad! There is the "auto_queue" for dl chan. Sorry for the noise.

Thanks,
Mani

> Regards,
> Loic
> 
> 
> >
> > Thanks,
> > Mani
> >
> > > Regards,
> > > Loic
> >

Re: [PATCH v5 net-next 11/16] net: propagate errors from dev_get_stats

2021-01-08 Thread Vladimir Oltean

On Fri, Jan 08, 2021 at 06:31:54PM +0200, Vladimir Oltean wrote:
> From: Vladimir Oltean 
> 
> dev_get_stats can now return error codes. Take the remaining call sites
> where those errors can be propagated, which are all trivial, and convert
> them to look at that error code and stop processing.
> 
> The effects of simulating a kernel error (returning -ENOMEM) upon
> existing programs or kernel interfaces:
> 
> - cat /proc/net/dev prints up until the interface that failed, and there
>   it returns:
> cat: read error: Cannot allocate memory
> 
> - ifstat simply returns this and prints nothing:
> Error: Buffer too small for object.
> Dump terminated
> 
> - ip -s -s link show behaves the same as ifstat.

Note that at first I did not understand why ifstat and "ip -s -s link
show" return this message. But in the meantime I figured that it was due
to rtnetlink still returning -EMSGSIZE. That is fixed in this patch
series, but I forgot to update the commit message to reflect it. The
current output from these two commands is:

$ ifstat
RTNETLINK answers: Cannot allocate memory
Dump terminated
$ ip -s -s link show
RTNETLINK answers: Cannot allocate memory
Dump terminated

> 
> - ifconfig prints only the info for the interfaces whose statistics did
>   not fail.
> 
> Signed-off-by: Vladimir Oltean 
> ---
> Changes in v5:
> - Actually propagate errors from bonding and net_failover from within
>   this patch.
> - Properly propagating the dev_get_stats() error code from
>   rtnl_fill_stats now, and not -EMSGSIZE.
> 
> Changes in v4:
> Patch is new (Eric's suggestion).

Re: [PATCH v2 net-next 03/10] net: dsa: add ops for devlink-sb

2021-01-08 Thread Vladimir Oltean

On Fri, Jan 08, 2021 at 12:35:23AM +0100, Andrew Lunn wrote:
> > +static struct dsa_port *devlink_to_dsa_port(struct devlink_port *dlp)
> > +{
> > +   return container_of(dlp, struct dsa_port, devlink_port);
> > +}
> 
> I wonder if this should be moved to include/net/dsa.h next to the
> other little helpers used to convert between devlink structures and
> DSA structures?

I wrote this before your series for devlink regions. The way I'm using
devlink_to_dsa_port, I guess I can just replace it with your
dsa_devlink_port_to_port function.

[PATCH net] cxgb4/chtls: Fix tid stuck due to wrong update of qid

2021-01-08 Thread Ayush Sawal

TID stuck is seen when there is a race in
CPL_PASS_ACCEPT_RPL/CPL_ABORT_REQ and abort is arriving
before the accept reply, which sets the queue number.
In this case HW ends up sending CPL_ABORT_RPL_RSS to an
incorrect ingress queue.

Fixes: cc35c88ae4db ("crypto : chtls - CPL handler definition")
Signed-off-by: Rohit Maheshwari 
Signed-off-by: Ayush Sawal 
---
 drivers/net/ethernet/chelsio/cxgb4/t4_tcb.h   |  7 
 .../chelsio/inline_crypto/chtls/chtls.h   |  4 ++
 .../chelsio/inline_crypto/chtls/chtls_cm.c| 35 ++--
 .../chelsio/inline_crypto/chtls/chtls_hw.c| 42 +++
 4 files changed, 85 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_tcb.h 
b/drivers/net/ethernet/chelsio/cxgb4/t4_tcb.h
index 92473dda55d9..22a0220123ad 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4_tcb.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_tcb.h
@@ -40,6 +40,13 @@
 #define TCB_L2T_IX_M   0xfffULL
 #define TCB_L2T_IX_V(x)((x) << TCB_L2T_IX_S)
 
+#define TCB_T_FLAGS_W   1
+#define TCB_T_FLAGS_S   0
+#define TCB_T_FLAGS_M   0xULL
+#define TCB_T_FLAGS_V(x)((__u64)(x) << TCB_T_FLAGS_S)
+
+#define TCB_FIELD_COOKIE_TFLAG 1
+
 #define TCB_SMAC_SEL_W 0
 #define TCB_SMAC_SEL_S 24
 #define TCB_SMAC_SEL_M 0xffULL
diff --git a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls.h 
b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls.h
index 72bb123d53db..9e2378013642 100644
--- a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls.h
+++ b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls.h
@@ -575,7 +575,11 @@ int send_tx_flowc_wr(struct sock *sk, int compl,
 void chtls_tcp_push(struct sock *sk, int flags);
 int chtls_push_frames(struct chtls_sock *csk, int comp);
 int chtls_set_tcb_tflag(struct sock *sk, unsigned int bit_pos, int val);
+void chtls_set_tcb_field_rpl_skb(struct sock *sk, u16 word,
+u64 mask, u64 val, u8 cookie,
+int through_l2t);
 int chtls_setkey(struct chtls_sock *csk, u32 keylen, u32 mode, int 
cipher_type);
+void chtls_set_quiesce_ctrl(struct sock *sk, int val);
 void skb_entail(struct sock *sk, struct sk_buff *skb, int flags);
 unsigned int keyid_to_addr(int start_addr, int keyid);
 void free_tls_keyid(struct sock *sk);
diff --git a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c 
b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c
index 51dd030b3b36..0818d7fa484d 100644
--- a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c
+++ b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c
@@ -32,6 +32,7 @@
 #include "chtls.h"
 #include "chtls_cm.h"
 #include "clip_tbl.h"
+#include "t4_tcb.h"
 
 /*
  * State transitions and actions for close.  Note that if we are in SYN_SENT
@@ -267,11 +268,14 @@ static void chtls_send_reset(struct sock *sk, int mode, 
struct sk_buff *skb)
if (sk->sk_state != TCP_SYN_RECV)
chtls_send_abort(sk, mode, skb);
else
-   goto out;
+   chtls_set_tcb_field_rpl_skb(sk, TCB_T_FLAGS_W,
+   TCB_T_FLAGS_V(TCB_T_FLAGS_M), 0,
+   TCB_FIELD_COOKIE_TFLAG, 1);
 
return;
 out:
-   kfree_skb(skb);
+   if (skb)
+   kfree_skb(skb);
 }
 
 static void release_tcp_port(struct sock *sk)
@@ -1949,6 +1953,8 @@ static void chtls_close_con_rpl(struct sock *sk, struct 
sk_buff *skb)
else if (tcp_sk(sk)->linger2 < 0 &&
 !csk_flag_nochk(csk, CSK_ABORT_SHUTDOWN))
chtls_abort_conn(sk, skb);
+   else if (csk_flag_nochk(csk, CSK_TX_DATA_SENT))
+   chtls_set_quiesce_ctrl(sk, 0);
break;
default:
pr_info("close_con_rpl in bad state %d\n", sk->sk_state);
@@ -2292,6 +2298,28 @@ static int chtls_wr_ack(struct chtls_dev *cdev, struct 
sk_buff *skb)
return 0;
 }
 
+static int chtls_set_tcb_rpl(struct chtls_dev *cdev, struct sk_buff *skb)
+{
+   struct cpl_set_tcb_rpl *rpl = cplhdr(skb) + RSS_HDR;
+   unsigned int hwtid = GET_TID(rpl);
+   struct sock *sk;
+
+   sk = lookup_tid(cdev->tids, hwtid);
+
+   /* return EINVAL if socket doesn't exist */
+   if (!sk)
+   return -EINVAL;
+
+   /* Reusing the skb as size of cpl_set_tcb_field structure
+* is greater than cpl_abort_req
+*/
+   if (TCB_COOKIE_G(rpl->cookie) == TCB_FIELD_COOKIE_TFLAG)
+   chtls_send_abort(sk, CPL_ABORT_SEND_RST, NULL);
+
+   kfree_skb(skb);
+   return 0;
+}
+
 chtls_handler_func chtls_handlers[NUM_CPL_CMDS] = {
[CPL_PASS_OPEN_RPL] = chtls_pass_open_rpl,
[CPL_CLOSE_LISTSRV_RPL] = chtls_close_listsrv_rpl,
@@ -2304,5 +2332,6 @@ chtls_handler_func chtls_handlers[NUM_CPL_CMDS] = {

[PATCH v3 net-next 01/10] net: mscc: ocelot: auto-detect packet buffer size and number of frame references

2021-01-08 Thread Vladimir Oltean

From: Vladimir Oltean 

Instead of reading these values from the reference manual and writing
them down into the driver, it appears that the hardware gives us the
option of detecting them dynamically.

The number of frame references corresponds to what the reference manual
notes, however it seems that the frame buffers are reported as slightly
less than the books would indicate. On VSC9959 (Felix), the books say it
should have 128KB of packet buffer, but the registers indicate only
129840 bytes (126.79 KB). Also, the unit of measurement for FREECNT from
the documentation of all these devices is incorrect (taken from an older
generation). This was confirmed by Younes Leroul from Microchip support.

Not having anything better to do with these values at the moment* (this
will change soon), let's just print them.

*The frame buffer size is, in fact, used to calculate the tail dropping
watermarks.

Signed-off-by: Vladimir Oltean 
---
Changes in v3:
None.

Changes in v2:
- Fixed FREECNT multiplier after consulting with Microchip support.

 drivers/net/dsa/ocelot/felix.c |  1 -
 drivers/net/dsa/ocelot/felix.h |  1 -
 drivers/net/dsa/ocelot/felix_vsc9959.c |  1 -
 drivers/net/dsa/ocelot/seville_vsc9953.c   |  1 -
 drivers/net/ethernet/mscc/ocelot.c | 22 +-
 drivers/net/ethernet/mscc/ocelot_vsc7514.c |  1 -
 include/soc/mscc/ocelot.h  |  3 ++-
 include/soc/mscc/ocelot_qsys.h |  3 +++
 8 files changed, 26 insertions(+), 7 deletions(-)

diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c
index 90c3c76f21b2..a2e06a0d1509 100644
--- a/drivers/net/dsa/ocelot/felix.c
+++ b/drivers/net/dsa/ocelot/felix.c
@@ -451,7 +451,6 @@ static int felix_init_structs(struct felix *felix, int 
num_phys_ports)
ocelot->map = felix->info->map;
ocelot->stats_layout= felix->info->stats_layout;
ocelot->num_stats   = felix->info->num_stats;
-   ocelot->shared_queue_sz = felix->info->shared_queue_sz;
ocelot->num_mact_rows   = felix->info->num_mact_rows;
ocelot->vcap= felix->info->vcap;
ocelot->ops = felix->info->ops;
diff --git a/drivers/net/dsa/ocelot/felix.h b/drivers/net/dsa/ocelot/felix.h
index 4c717324ac2f..5434fe278d2c 100644
--- a/drivers/net/dsa/ocelot/felix.h
+++ b/drivers/net/dsa/ocelot/felix.h
@@ -15,7 +15,6 @@ struct felix_info {
const struct reg_field  *regfields;
const u32 *const*map;
const struct ocelot_ops *ops;
-   int shared_queue_sz;
int num_mact_rows;
const struct ocelot_stat_layout *stats_layout;
unsigned intnum_stats;
diff --git a/drivers/net/dsa/ocelot/felix_vsc9959.c 
b/drivers/net/dsa/ocelot/felix_vsc9959.c
index 2e5bbdca5ea4..9fffbad6ef9b 100644
--- a/drivers/net/dsa/ocelot/felix_vsc9959.c
+++ b/drivers/net/dsa/ocelot/felix_vsc9959.c
@@ -1356,7 +1356,6 @@ static const struct felix_info felix_info_vsc9959 = {
.stats_layout   = vsc9959_stats_layout,
.num_stats  = ARRAY_SIZE(vsc9959_stats_layout),
.vcap   = vsc9959_vcap_props,
-   .shared_queue_sz= 128 * 1024,
.num_mact_rows  = 2048,
.num_ports  = 6,
.num_tx_queues  = FELIX_NUM_TC,
diff --git a/drivers/net/dsa/ocelot/seville_vsc9953.c 
b/drivers/net/dsa/ocelot/seville_vsc9953.c
index ebbaf6817ec8..b72813da6d9f 100644
--- a/drivers/net/dsa/ocelot/seville_vsc9953.c
+++ b/drivers/net/dsa/ocelot/seville_vsc9953.c
@@ -1181,7 +1181,6 @@ static const struct felix_info seville_info_vsc9953 = {
.stats_layout   = vsc9953_stats_layout,
.num_stats  = ARRAY_SIZE(vsc9953_stats_layout),
.vcap   = vsc9953_vcap_props,
-   .shared_queue_sz= 256 * 1024,
.num_mact_rows  = 2048,
.num_ports  = 10,
.mdio_bus_alloc = vsc9953_mdio_bus_alloc,
diff --git a/drivers/net/ethernet/mscc/ocelot.c 
b/drivers/net/ethernet/mscc/ocelot.c
index 0b9992bd6626..876c03e51bdc 100644
--- a/drivers/net/ethernet/mscc/ocelot.c
+++ b/drivers/net/ethernet/mscc/ocelot.c
@@ -1366,7 +1366,7 @@ void ocelot_port_set_maxlen(struct ocelot *ocelot, int 
port, size_t sdu)
pause_stop);
 
/* Tail dropping watermarks */
-   atop_tot = (ocelot->shared_queue_sz - 9 * maxlen) /
+   atop_tot = (ocelot->packet_buffer_size - 9 * maxlen) /
   OCELOT_BUFFER_CELL_SZ;
atop = (9 * maxlen) / OCELOT_BUFFER_CELL_SZ;
ocelot_write_rix(ocelot, ocelot->ops->wm_enc(atop), SYS_ATOP, port);
@@ -1479,6 +1479,25 @@ static void ocelot_cpu_port_init(struct ocelot *ocelot)
 ANA_PORT_VLAN_CFG, cpu);
 }
 
+static void ocelot_detect_features(struct ocelot *ocel

1 2 3 >

1 - 100 of 224 matches

Mail list logo