Re: [PATCH v2 net-next 1/9] xdp: introduce mb in xdp_buff/xdp_frame

2020-09-04 Thread Jesper Dangaard Brouer
On Thu, 3 Sep 2020 18:07:05 -0700
Alexei Starovoitov  wrote:

> On Thu, Sep 03, 2020 at 10:58:45PM +0200, Lorenzo Bianconi wrote:
> > Introduce multi-buffer bit (mb) in xdp_frame/xdp_buffer to specify
> > if shared_info area has been properly initialized for non-linear
> > xdp buffers
> > 
> > Signed-off-by: Lorenzo Bianconi 
> > ---
> >  include/net/xdp.h | 8 ++--
> >  net/core/xdp.c| 1 +
> >  2 files changed, 7 insertions(+), 2 deletions(-)
> > 
> > diff --git a/include/net/xdp.h b/include/net/xdp.h
> > index 3814fb631d52..42f439f9fcda 100644
> > --- a/include/net/xdp.h
> > +++ b/include/net/xdp.h
> > @@ -72,7 +72,8 @@ struct xdp_buff {
> > void *data_hard_start;
> > struct xdp_rxq_info *rxq;
> > struct xdp_txq_info *txq;
> > -   u32 frame_sz; /* frame size to deduce data_hard_end/reserved tailroom*/
> > +   u32 frame_sz:31; /* frame size to deduce data_hard_end/reserved 
> > tailroom*/
> > +   u32 mb:1; /* xdp non-linear buffer */
> >  };
> >  
> >  /* Reserve memory area at end-of data area.
> > @@ -96,7 +97,8 @@ struct xdp_frame {
> > u16 len;
> > u16 headroom;
> > u32 metasize:8;
> > -   u32 frame_sz:24;
> > +   u32 frame_sz:23;
> > +   u32 mb:1; /* xdp non-linear frame */  
> 
> Hmm. Last time I checked compilers were generating ugly code with bitfields.
> Not performant and not efficient.
> frame_sz is used in the fast path.
> I suspect the first hunk alone will cause performance degradation.
> Could you use normal u8 or u32 flag field?

For struct xdp_buff sure we can do this.  For struct xdp_frame, I'm not
sure, as it is a state compressed version of xdp_buff + extra
information.  The xdp_frame has been called skb-light, and I know
people (e.g. Ahern) want to add more info to this, vlan, RX-hash, csum,
and we must keep this to 1-cache-line, for performance reasons.

You do make a good point, that these bit-fields might hurt performance
more.  I guess we need to test this, as I constantly worry that we
will slowly kill XDP performance with 1000 paper-cuts.

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer



Re: [PATCH net-next 3/7] net: mvpp2: check first level interrupt status registers

2020-09-04 Thread Russell King - ARM Linux admin
On Thu, Sep 03, 2020 at 03:24:14AM +0200, Andrew Lunn wrote:
> On Wed, Sep 02, 2020 at 05:11:46PM +0100, Russell King wrote:
> > Check the first level interrupt status registers to determine how to
> > further process the port interrupt. We will need this to know whether
> > to invoke the link status processing and/or the PTP processing for
> > both XLG and GMAC.
> 
> As i said, i don't know this driver. Does the hardware actually have
> two MAC hardware blocks? One for 10Mbs->1G, and a second for > 1G?

Yes.

-- 
RMK's Patch system: https://www.armlinux.org.uk/developer/patches/
FTTP is here! 40Mbps down 10Mbps up. Decent connectivity at last!


[PATCH net-next v2 0/6] Marvell PP2.2 PTP support

2020-09-04 Thread Russell King - ARM Linux admin
Hi,

This series adds PTP support for PP2.2 hardware to the mvpp2 driver.
Tested on the Macchiatobin eth1 port.

Note that on the Macchiatobin, eth0 uses a separate TAI block from
eth1, and there is no hardware synchronisation between the two.

 drivers/net/ethernet/marvell/Kconfig|   6 +
 drivers/net/ethernet/marvell/mvpp2/Makefile |   3 +-
 drivers/net/ethernet/marvell/mvpp2/mvpp2.h  | 202 +-
 drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 410 ++---
 drivers/net/ethernet/marvell/mvpp2/mvpp2_tai.c  | 467 
 5 files changed, 1038 insertions(+), 50 deletions(-)
 create mode 100644 drivers/net/ethernet/marvell/mvpp2/mvpp2_tai.c

v2: add Andrew's r-bs, squash patch 6 and patch 7.

-- 
RMK's Patch system: https://www.armlinux.org.uk/developer/patches/
FTTP is here! 40Mbps down 10Mbps up. Decent connectivity at last!


[PATCH net-next v2 2/6] net: mvpp2: rename mis-named "link status" interrupt

2020-09-04 Thread Russell King
The link interrupt is used for way more than just the link status; it
comes from a collection of units to do with the port. The Marvell
documentation describes the interrupt as "GOP port X interrupt".

Since we are adding PTP support, and the PTP interrupt uses this,
rename it to be more in line with the documentation.

This interrupt is also mis-named in the DT binding, but we leave that
alone.

Reviewed-by: Andrew Lunn 
Signed-off-by: Russell King 
---
 drivers/net/ethernet/marvell/mvpp2/mvpp2.h|  2 +-
 .../net/ethernet/marvell/mvpp2/mvpp2_main.c   | 35 ++-
 2 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2.h 
b/drivers/net/ethernet/marvell/mvpp2/mvpp2.h
index ecb5f4616a36..a2f787c83756 100644
--- a/drivers/net/ethernet/marvell/mvpp2/mvpp2.h
+++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2.h
@@ -915,7 +915,7 @@ struct mvpp2_port {
 */
int gop_id;
 
-   int link_irq;
+   int port_irq;
 
struct mvpp2 *priv;
 
diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c 
b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
index 81473911a822..41ffae8d5357 100644
--- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
+++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
@@ -3036,7 +3036,7 @@ static void mvpp2_isr_handle_gmac_internal(struct 
mvpp2_port *port)
 }
 
 /* Per-port interrupt for link status changes */
-static irqreturn_t mvpp2_link_status_isr(int irq, void *dev_id)
+static irqreturn_t mvpp2_port_isr(int irq, void *dev_id)
 {
struct mvpp2_port *port = (struct mvpp2_port *)dev_id;
 
@@ -4230,12 +4230,13 @@ static int mvpp2_open(struct net_device *dev)
valid = true;
}
 
-   if (priv->hw_version == MVPP22 && port->link_irq) {
-   err = request_irq(port->link_irq, mvpp2_link_status_isr, 0,
+   if (priv->hw_version == MVPP22 && port->port_irq) {
+   err = request_irq(port->port_irq, mvpp2_port_isr, 0,
  dev->name, port);
if (err) {
-   netdev_err(port->dev, "cannot request link IRQ %d\n",
-  port->link_irq);
+   netdev_err(port->dev,
+  "cannot request port link/ptp IRQ %d\n",
+  port->port_irq);
goto err_free_irq;
}
 
@@ -4246,7 +4247,7 @@ static int mvpp2_open(struct net_device *dev)
 
valid = true;
} else {
-   port->link_irq = 0;
+   port->port_irq = 0;
}
 
if (!valid) {
@@ -4290,8 +4291,8 @@ static int mvpp2_stop(struct net_device *dev)
 
if (port->phylink)
phylink_disconnect_phy(port->phylink);
-   if (port->link_irq)
-   free_irq(port->link_irq, port);
+   if (port->port_irq)
+   free_irq(port->port_irq, port);
 
mvpp2_irqs_deinit(port);
if (!port->has_tx_irqs) {
@@ -6056,16 +6057,16 @@ static int mvpp2_port_probe(struct platform_device 
*pdev,
goto err_free_netdev;
 
if (port_node)
-   port->link_irq = of_irq_get_byname(port_node, "link");
+   port->port_irq = of_irq_get_byname(port_node, "link");
else
-   port->link_irq = fwnode_irq_get(port_fwnode, port->nqvecs + 1);
-   if (port->link_irq == -EPROBE_DEFER) {
+   port->port_irq = fwnode_irq_get(port_fwnode, port->nqvecs + 1);
+   if (port->port_irq == -EPROBE_DEFER) {
err = -EPROBE_DEFER;
goto err_deinit_qvecs;
}
-   if (port->link_irq <= 0)
+   if (port->port_irq <= 0)
/* the link irq is optional */
-   port->link_irq = 0;
+   port->port_irq = 0;
 
if (fwnode_property_read_bool(port_fwnode, "marvell,loopback"))
port->flags |= MVPP2_F_LOOPBACK;
@@ -6229,8 +6230,8 @@ static int mvpp2_port_probe(struct platform_device *pdev,
 err_free_stats:
free_percpu(port->stats);
 err_free_irq:
-   if (port->link_irq)
-   irq_dispose_mapping(port->link_irq);
+   if (port->port_irq)
+   irq_dispose_mapping(port->port_irq);
 err_deinit_qvecs:
mvpp2_queue_vectors_deinit(port);
 err_free_netdev:
@@ -6251,8 +6252,8 @@ static void mvpp2_port_remove(struct mvpp2_port *port)
for (i = 0; i < port->ntxqs; i++)
free_percpu(port->txqs[i]->pcpu);
mvpp2_queue_vectors_deinit(port);
-   if (port->link_irq)
-   irq_dispose_mapping(port->link_irq);
+   if (port->port_irq)
+   irq_dispose_mapping(port->port_irq);
free_netdev(port->dev);
 }
 
-- 
2.20.1



[PATCH net-next v2 4/6] net: mvpp2: ptp: add TAI support

2020-09-04 Thread Russell King
Add support for the TAI block in the mvpp2.2 hardware.

Signed-off-by: Russell King 
---
 drivers/net/ethernet/marvell/Kconfig  |   6 +
 drivers/net/ethernet/marvell/mvpp2/Makefile   |   3 +-
 drivers/net/ethernet/marvell/mvpp2/mvpp2.h| 109 +
 .../net/ethernet/marvell/mvpp2/mvpp2_main.c   |   4 +
 .../net/ethernet/marvell/mvpp2/mvpp2_tai.c| 416 ++
 5 files changed, 537 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/ethernet/marvell/mvpp2/mvpp2_tai.c

diff --git a/drivers/net/ethernet/marvell/Kconfig 
b/drivers/net/ethernet/marvell/Kconfig
index ef4f35ba077d..a599e44a36a8 100644
--- a/drivers/net/ethernet/marvell/Kconfig
+++ b/drivers/net/ethernet/marvell/Kconfig
@@ -92,6 +92,12 @@ config MVPP2
  This driver supports the network interface units in the
  Marvell ARMADA 375, 7K and 8K SoCs.
 
+config MVPP2_PTP
+   bool "Marvell Armada 8K Enable PTP support"
+   depends on NETWORK_PHY_TIMESTAMPING
+   depends on (PTP_1588_CLOCK = y && MVPP2 = y) || \
+  (PTP_1588_CLOCK && MVPP2 = m)
+
 config PXA168_ETH
tristate "Marvell pxa168 ethernet support"
depends on HAS_IOMEM
diff --git a/drivers/net/ethernet/marvell/mvpp2/Makefile 
b/drivers/net/ethernet/marvell/mvpp2/Makefile
index 51f65a202c6e..9bd8e7964b40 100644
--- a/drivers/net/ethernet/marvell/mvpp2/Makefile
+++ b/drivers/net/ethernet/marvell/mvpp2/Makefile
@@ -4,4 +4,5 @@
 #
 obj-$(CONFIG_MVPP2) := mvpp2.o
 
-mvpp2-objs := mvpp2_main.o mvpp2_prs.o mvpp2_cls.o mvpp2_debugfs.o
+mvpp2-y := mvpp2_main.o mvpp2_prs.o mvpp2_cls.o mvpp2_debugfs.o
+mvpp2-$(CONFIG_MVPP2_PTP) += mvpp2_tai.o
diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2.h 
b/drivers/net/ethernet/marvell/mvpp2/mvpp2.h
index 273c46bbf927..b9fae3870393 100644
--- a/drivers/net/ethernet/marvell/mvpp2/mvpp2.h
+++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2.h
@@ -505,6 +505,70 @@
 #define MVPP22_SMI_MISC_CFG_REG0x1204
 #define MVPP22_SMI_POLLING_EN  BIT(10)
 
+/* TAI registers, PPv2.2 only, relative to priv->iface_base */
+#define MVPP22_TAI_INT_CAUSE   0x1400
+#define MVPP22_TAI_INT_MASK0x1404
+#define MVPP22_TAI_CR0 0x1408
+#define MVPP22_TAI_CR1 0x140c
+#define MVPP22_TAI_TCFCR0  0x1410
+#define MVPP22_TAI_TCFCR1  0x1414
+#define MVPP22_TAI_TCFCR2  0x1418
+#define MVPP22_TAI_FATWR   0x141c
+#define MVPP22_TAI_TOD_STEP_NANO_CR0x1420
+#define MVPP22_TAI_TOD_STEP_FRAC_HIGH  0x1424
+#define MVPP22_TAI_TOD_STEP_FRAC_LOW   0x1428
+#define MVPP22_TAI_TAPDC_HIGH  0x142c
+#define MVPP22_TAI_TAPDC_LOW   0x1430
+#define MVPP22_TAI_TGTOD_SEC_HIGH  0x1434
+#define MVPP22_TAI_TGTOD_SEC_MED   0x1438
+#define MVPP22_TAI_TGTOD_SEC_LOW   0x143c
+#define MVPP22_TAI_TGTOD_NANO_HIGH 0x1440
+#define MVPP22_TAI_TGTOD_NANO_LOW  0x1444
+#define MVPP22_TAI_TGTOD_FRAC_HIGH 0x1448
+#define MVPP22_TAI_TGTOD_FRAC_LOW  0x144c
+#define MVPP22_TAI_TLV_SEC_HIGH0x1450
+#define MVPP22_TAI_TLV_SEC_MED 0x1454
+#define MVPP22_TAI_TLV_SEC_LOW 0x1458
+#define MVPP22_TAI_TLV_NANO_HIGH   0x145c
+#define MVPP22_TAI_TLV_NANO_LOW0x1460
+#define MVPP22_TAI_TLV_FRAC_HIGH   0x1464
+#define MVPP22_TAI_TLV_FRAC_LOW0x1468
+#define MVPP22_TAI_TCV0_SEC_HIGH   0x146c
+#define MVPP22_TAI_TCV0_SEC_MED0x1470
+#define MVPP22_TAI_TCV0_SEC_LOW0x1474
+#define MVPP22_TAI_TCV0_NANO_HIGH  0x1478
+#define MVPP22_TAI_TCV0_NANO_LOW   0x147c
+#define MVPP22_TAI_TCV0_FRAC_HIGH  0x1480
+#define MVPP22_TAI_TCV0_FRAC_LOW   0x1484
+#define MVPP22_TAI_TCV1_SEC_HIGH   0x1488
+#define MVPP22_TAI_TCV1_SEC_MED0x148c
+#define MVPP22_TAI_TCV1_SEC_LOW0x1490
+#define MVPP22_TAI_TCV1_NANO_HIGH  0x1494
+#define MVPP22_TAI_TCV1_NANO_LOW   0x1498
+#define MVPP22_TAI_TCV1_FRAC_HIGH  0x149c
+#define MVPP22_TAI_TCV1_FRAC_LOW   0x14a0
+#define MVPP22_TAI_TCSR0x14a4
+#define MVPP22_TAI_TUC_LSB 0x14a8
+#define MVPP22_TAI_GFM_SEC_HIGH0x14ac
+#define MVPP22_TAI_GFM_SEC_MED 0x14b0
+#define MVPP22_TAI_GFM_SEC_LOW 0x14b4
+#define MVPP22_TAI_GFM_NANO_HIGH   0x14b8
+#define MVPP22_TAI_GFM_NANO_LOW0x14bc
+#define MVPP22_TAI_GFM_FRAC_HIGH   0x14c0
+#define MVPP22_TAI_GFM_FRAC_LOW0x14c4
+#define MVPP22_TAI_PCLK_DA_HIGH  

[PATCH net-next v2 6/6] net: mvpp2: ptp: add support for transmit timestamping

2020-09-04 Thread Russell King
Add support for timestamping transmit packets.  We allocate SYNC
messages to queue 1, every other message to queue 0.

Signed-off-by: Russell King 
---
 drivers/net/ethernet/marvell/mvpp2/mvpp2.h|  56 -
 .../net/ethernet/marvell/mvpp2/mvpp2_main.c   | 199 +-
 2 files changed, 244 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2.h 
b/drivers/net/ethernet/marvell/mvpp2/mvpp2.h
index 75467411900e..834775843067 100644
--- a/drivers/net/ethernet/marvell/mvpp2/mvpp2.h
+++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2.h
@@ -12,6 +12,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -463,8 +464,10 @@
 #define MVPP22_CTRL4_QSGMII_BYPASS_ACTIVE  BIT(7)
 #define MVPP22_GMAC_INT_SUM_STAT   0xa0
 #defineMVPP22_GMAC_INT_SUM_STAT_INTERNAL   BIT(1)
+#defineMVPP22_GMAC_INT_SUM_STAT_PTPBIT(2)
 #define MVPP22_GMAC_INT_SUM_MASK   0xa4
 #define MVPP22_GMAC_INT_SUM_MASK_LINK_STAT BIT(1)
+#defineMVPP22_GMAC_INT_SUM_MASK_PTPBIT(2)
 
 /* Per-port XGMAC registers. PPv2.2 only, only for GOP port 0,
  * relative to port->base.
@@ -492,9 +495,11 @@
 #define MVPP22_XLG_CTRL3_MACMODESELECT_10G (1 << 13)
 #define MVPP22_XLG_EXT_INT_STAT0x158
 #define MVPP22_XLG_EXT_INT_STAT_XLGBIT(1)
+#define MVPP22_XLG_EXT_INT_STAT_PTPBIT(7)
 #define MVPP22_XLG_EXT_INT_MASK0x15c
 #define MVPP22_XLG_EXT_INT_MASK_XLGBIT(1)
 #define MVPP22_XLG_EXT_INT_MASK_GIGBIT(2)
+#define MVPP22_XLG_EXT_INT_MASK_PTPBIT(7)
 #define MVPP22_XLG_CTRL4_REG   0x184
 #define MVPP22_XLG_CTRL4_FWD_FCBIT(5)
 #define MVPP22_XLG_CTRL4_FWD_PFC   BIT(6)
@@ -598,7 +603,11 @@
 /* PTP registers. PPv2.2 only */
 #define MVPP22_PTP_BASE(port)  (0x7800 + (port * 0x1000))
 #define MVPP22_PTP_INT_CAUSE   0x00
+#define MVPP22_PTP_INT_CAUSE_QUEUE1BIT(6)
+#define MVPP22_PTP_INT_CAUSE_QUEUE0BIT(5)
 #define MVPP22_PTP_INT_MASK0x04
+#define MVPP22_PTP_INT_MASK_QUEUE1 BIT(6)
+#define MVPP22_PTP_INT_MASK_QUEUE0 BIT(5)
 #define MVPP22_PTP_GCR 0x08
 #define MVPP22_PTP_GCR_RX_RESETBIT(13)
 #define MVPP22_PTP_GCR_TX_RESETBIT(1)
@@ -796,6 +805,43 @@ enum mvpp2_prs_l3_cast {
MVPP2_PRS_L3_BROAD_CAST
 };
 
+/* PTP descriptor constants. The low bits of the descriptor are stored
+ * separately from the high bits.
+ */
+#define MVPP22_PTP_DESC_MASK_LOW   0xfff
+
+/* PTPAction */
+enum mvpp22_ptp_action {
+   MVPP22_PTP_ACTION_NONE = 0,
+   MVPP22_PTP_ACTION_FORWARD = 1,
+   MVPP22_PTP_ACTION_CAPTURE = 3,
+   /* The following have not been verified */
+   MVPP22_PTP_ACTION_ADDTIME = 4,
+   MVPP22_PTP_ACTION_ADDCORRECTEDTIME = 5,
+   MVPP22_PTP_ACTION_CAPTUREADDTIME = 6,
+   MVPP22_PTP_ACTION_CAPTUREADDCORRECTEDTIME = 7,
+   MVPP22_PTP_ACTION_ADDINGRESSTIME = 8,
+   MVPP22_PTP_ACTION_CAPTUREADDINGRESSTIME = 9,
+   MVPP22_PTP_ACTION_CAPTUREINGRESSTIME = 10,
+};
+
+/* PTPPacketFormat */
+enum mvpp22_ptp_packet_format {
+   MVPP22_PTP_PKT_FMT_PTPV2 = 0,
+   MVPP22_PTP_PKT_FMT_PTPV1 = 1,
+   MVPP22_PTP_PKT_FMT_Y1731 = 2,
+   MVPP22_PTP_PKT_FMT_NTPTS = 3,
+   MVPP22_PTP_PKT_FMT_NTPRX = 4,
+   MVPP22_PTP_PKT_FMT_NTPTX = 5,
+   MVPP22_PTP_PKT_FMT_TWAMP = 6,
+};
+
+#define MVPP22_PTP_ACTION(x)   (((x) & 15) << 0)
+#define MVPP22_PTP_PACKETFORMAT(x) (((x) & 7) << 4)
+#define MVPP22_PTP_MACTIMESTAMPINGEN   BIT(11)
+#define MVPP22_PTP_TIMESTAMPENTRYID(x) (((x) & 31) << 12)
+#define MVPP22_PTP_TIMESTAMPQUEUESELECTBIT(18)
+
 /* BM constants */
 #define MVPP2_BM_JUMBO_BUF_NUM 512
 #define MVPP2_BM_LONG_BUF_NUM  1024
@@ -1014,6 +1060,11 @@ struct mvpp2_ethtool_fs {
struct ethtool_rxnfc rxnfc;
 };
 
+struct mvpp2_hwtstamp_queue {
+   struct sk_buff *skb[32];
+   u8 next;
+};
+
 struct mvpp2_port {
u8 id;
 
@@ -1100,6 +1151,8 @@ struct mvpp2_port {
 
bool hwtstamp;
bool rx_hwtstamp;
+   enum hwtstamp_tx_types tx_hwtstamp_type;
+   struct mvpp2_hwtstamp_queue tx_hwtstamp_queue[2];
 };
 
 /* The mvpp2_tx_desc and mvpp2_rx_desc structures describe the
@@ -1168,7 +1221,8 @@ struct mvpp22_tx_desc {
u8  packet_offset;
u8  phys_txq;
__le16 data_size;
-   __le64 reserved1;
+   __le32 ptp_descriptor;
+   __le32 reserved2;
__le64 buf_dma_addr_ptp;
__le64 buf_cookie_misc;
 };
diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c 
b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
index c20fde0fc73c..7130e31c7431 100644
--- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
+++ b/drive

[PATCH net-next v2 3/6] net: mvpp2: check first level interrupt status registers

2020-09-04 Thread Russell King
Check the first level interrupt status registers to determine how to
further process the port interrupt. We will need this to know whether
to invoke the link status processing and/or the PTP processing for
both XLG and GMAC.

Reviewed-by: Andrew Lunn 
Signed-off-by: Russell King 
---
 drivers/net/ethernet/marvell/mvpp2/mvpp2.h  |  4 
 drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 13 +++--
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2.h 
b/drivers/net/ethernet/marvell/mvpp2/mvpp2.h
index a2f787c83756..273c46bbf927 100644
--- a/drivers/net/ethernet/marvell/mvpp2/mvpp2.h
+++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2.h
@@ -461,6 +461,8 @@
 #define MVPP22_CTRL4_DP_CLK_SELBIT(5)
 #define MVPP22_CTRL4_SYNC_BYPASS_DIS   BIT(6)
 #define MVPP22_CTRL4_QSGMII_BYPASS_ACTIVE  BIT(7)
+#define MVPP22_GMAC_INT_SUM_STAT   0xa0
+#defineMVPP22_GMAC_INT_SUM_STAT_INTERNAL   BIT(1)
 #define MVPP22_GMAC_INT_SUM_MASK   0xa4
 #define MVPP22_GMAC_INT_SUM_MASK_LINK_STAT BIT(1)
 
@@ -488,6 +490,8 @@
 #define MVPP22_XLG_CTRL3_MACMODESELECT_MASK(7 << 13)
 #define MVPP22_XLG_CTRL3_MACMODESELECT_GMAC(0 << 13)
 #define MVPP22_XLG_CTRL3_MACMODESELECT_10G (1 << 13)
+#define MVPP22_XLG_EXT_INT_STAT0x158
+#define MVPP22_XLG_EXT_INT_STAT_XLGBIT(1)
 #define MVPP22_XLG_EXT_INT_MASK0x15c
 #define MVPP22_XLG_EXT_INT_MASK_XLGBIT(1)
 #define MVPP22_XLG_EXT_INT_MASK_GIGBIT(2)
diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c 
b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
index 41ffae8d5357..9dc8cf3d0873 100644
--- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
+++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
@@ -3039,14 +3039,23 @@ static void mvpp2_isr_handle_gmac_internal(struct 
mvpp2_port *port)
 static irqreturn_t mvpp2_port_isr(int irq, void *dev_id)
 {
struct mvpp2_port *port = (struct mvpp2_port *)dev_id;
+   u32 val;
 
mvpp22_gop_mask_irq(port);
 
if (mvpp2_port_supports_xlg(port) &&
mvpp2_is_xlg(port->phy_interface)) {
-   mvpp2_isr_handle_xlg(port);
+   /* Check the external status register */
+   val = readl(port->base + MVPP22_XLG_EXT_INT_STAT);
+   if (val & MVPP22_XLG_EXT_INT_STAT_XLG)
+   mvpp2_isr_handle_xlg(port);
} else {
-   mvpp2_isr_handle_gmac_internal(port);
+   /* If it's not the XLG, we must be using the GMAC.
+* Check the summary status.
+*/
+   val = readl(port->base + MVPP22_GMAC_INT_SUM_STAT);
+   if (val & MVPP22_GMAC_INT_SUM_STAT_INTERNAL)
+   mvpp2_isr_handle_gmac_internal(port);
}
 
mvpp22_gop_unmask_irq(port);
-- 
2.20.1



[PATCH net-next v2 1/6] net: mvpp2: restructure "link status" interrupt handling

2020-09-04 Thread Russell King
The "link status" interrupt is used for more than just link status.
Restructure mvpp2_link_status_isr() so we can add additional handling.

Reviewed-by: Andrew Lunn 
Signed-off-by: Russell King 
---
 .../net/ethernet/marvell/mvpp2/mvpp2_main.c   | 83 ---
 1 file changed, 51 insertions(+), 32 deletions(-)

diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c 
b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
index d0bbe3a64b8d..81473911a822 100644
--- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
+++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
@@ -2974,44 +2974,17 @@ static irqreturn_t mvpp2_isr(int irq, void *dev_id)
return IRQ_HANDLED;
 }
 
-/* Per-port interrupt for link status changes */
-static irqreturn_t mvpp2_link_status_isr(int irq, void *dev_id)
+static void mvpp2_isr_handle_link(struct mvpp2_port *port, bool link)
 {
-   struct mvpp2_port *port = (struct mvpp2_port *)dev_id;
struct net_device *dev = port->dev;
-   bool event = false, link = false;
-   u32 val;
-
-   mvpp22_gop_mask_irq(port);
-
-   if (mvpp2_port_supports_xlg(port) &&
-   mvpp2_is_xlg(port->phy_interface)) {
-   val = readl(port->base + MVPP22_XLG_INT_STAT);
-   if (val & MVPP22_XLG_INT_STAT_LINK) {
-   event = true;
-   val = readl(port->base + MVPP22_XLG_STATUS);
-   if (val & MVPP22_XLG_STATUS_LINK_UP)
-   link = true;
-   }
-   } else if (phy_interface_mode_is_rgmii(port->phy_interface) ||
-  phy_interface_mode_is_8023z(port->phy_interface) ||
-  port->phy_interface == PHY_INTERFACE_MODE_SGMII) {
-   val = readl(port->base + MVPP22_GMAC_INT_STAT);
-   if (val & MVPP22_GMAC_INT_STAT_LINK) {
-   event = true;
-   val = readl(port->base + MVPP2_GMAC_STATUS0);
-   if (val & MVPP2_GMAC_STATUS0_LINK_UP)
-   link = true;
-   }
-   }
 
if (port->phylink) {
phylink_mac_change(port->phylink, link);
-   goto handled;
+   return;
}
 
-   if (!netif_running(dev) || !event)
-   goto handled;
+   if (!netif_running(dev))
+   return;
 
if (link) {
mvpp2_interrupts_enable(port);
@@ -3028,8 +3001,54 @@ static irqreturn_t mvpp2_link_status_isr(int irq, void 
*dev_id)
 
mvpp2_interrupts_disable(port);
}
+}
+
+static void mvpp2_isr_handle_xlg(struct mvpp2_port *port)
+{
+   bool link;
+   u32 val;
+
+   val = readl(port->base + MVPP22_XLG_INT_STAT);
+   if (val & MVPP22_XLG_INT_STAT_LINK) {
+   val = readl(port->base + MVPP22_XLG_STATUS);
+   if (val & MVPP22_XLG_STATUS_LINK_UP)
+   link = true;
+   mvpp2_isr_handle_link(port, link);
+   }
+}
+
+static void mvpp2_isr_handle_gmac_internal(struct mvpp2_port *port)
+{
+   bool link;
+   u32 val;
+
+   if (phy_interface_mode_is_rgmii(port->phy_interface) ||
+   phy_interface_mode_is_8023z(port->phy_interface) ||
+   port->phy_interface == PHY_INTERFACE_MODE_SGMII) {
+   val = readl(port->base + MVPP22_GMAC_INT_STAT);
+   if (val & MVPP22_GMAC_INT_STAT_LINK) {
+   val = readl(port->base + MVPP2_GMAC_STATUS0);
+   if (val & MVPP2_GMAC_STATUS0_LINK_UP)
+   link = true;
+   mvpp2_isr_handle_link(port, link);
+   }
+   }
+}
+
+/* Per-port interrupt for link status changes */
+static irqreturn_t mvpp2_link_status_isr(int irq, void *dev_id)
+{
+   struct mvpp2_port *port = (struct mvpp2_port *)dev_id;
+
+   mvpp22_gop_mask_irq(port);
+
+   if (mvpp2_port_supports_xlg(port) &&
+   mvpp2_is_xlg(port->phy_interface)) {
+   mvpp2_isr_handle_xlg(port);
+   } else {
+   mvpp2_isr_handle_gmac_internal(port);
+   }
 
-handled:
mvpp22_gop_unmask_irq(port);
return IRQ_HANDLED;
 }
-- 
2.20.1



[PATCH net-next v2 5/6] net: mvpp2: ptp: add support for receive timestamping

2020-09-04 Thread Russell King
Add support for receive timestamping. When enabled, the hardware adds
a timestamp into the receive queue descriptor for all received packets
with no filtering. Hence, we can only support NONE or ALL receive
filter modes.

The timestamp in the receive queue contains two bits of seconds and
the full nanosecond timestamp. This has to be merged with the remainder
of the seconds from the TAI clock to arrive at a full timestamp before
we can convert it to a ktime for the skb hardware timestamp field.

Signed-off-by: Russell King 
---
 drivers/net/ethernet/marvell/mvpp2/mvpp2.h|  31 -
 .../net/ethernet/marvell/mvpp2/mvpp2_main.c   | 108 +-
 .../net/ethernet/marvell/mvpp2/mvpp2_tai.c|  53 -
 3 files changed, 189 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2.h 
b/drivers/net/ethernet/marvell/mvpp2/mvpp2.h
index b9fae3870393..75467411900e 100644
--- a/drivers/net/ethernet/marvell/mvpp2/mvpp2.h
+++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2.h
@@ -600,6 +600,9 @@
 #define MVPP22_PTP_INT_CAUSE   0x00
 #define MVPP22_PTP_INT_MASK0x04
 #define MVPP22_PTP_GCR 0x08
+#define MVPP22_PTP_GCR_RX_RESETBIT(13)
+#define MVPP22_PTP_GCR_TX_RESETBIT(1)
+#define MVPP22_PTP_GCR_TSU_ENABLE  BIT(0)
 #define MVPP22_PTP_TX_Q0_R00x0c
 #define MVPP22_PTP_TX_Q0_R10x10
 #define MVPP22_PTP_TX_Q0_R20x14
@@ -1094,6 +1097,9 @@ struct mvpp2_port {
 * them from 0
 */
int rss_ctx[MVPP22_N_RSS_TABLES];
+
+   bool hwtstamp;
+   bool rx_hwtstamp;
 };
 
 /* The mvpp2_tx_desc and mvpp2_rx_desc structures describe the
@@ -1173,7 +1179,7 @@ struct mvpp22_rx_desc {
__le16 reserved1;
__le16 data_size;
__le32 reserved2;
-   __le32 reserved3;
+   __le32 timestamp;
__le64 buf_dma_addr_key_hash;
__le64 buf_cookie_misc;
 };
@@ -1355,11 +1361,34 @@ void mvpp2_dbgfs_cleanup(struct mvpp2 *priv);
 
 #ifdef CONFIG_MVPP2_PTP
 int mvpp22_tai_probe(struct device *dev, struct mvpp2 *priv);
+void mvpp22_tai_tstamp(struct mvpp2_tai *tai, u32 tstamp,
+  struct skb_shared_hwtstamps *hwtstamp);
+void mvpp22_tai_start(struct mvpp2_tai *tai);
+void mvpp22_tai_stop(struct mvpp2_tai *tai);
+int mvpp22_tai_ptp_clock_index(struct mvpp2_tai *tai);
 #else
 static inline int mvpp22_tai_probe(struct device *dev, struct mvpp2 *priv)
 {
return 0;
 }
+static inline void mvpp22_tai_tstamp(struct mvpp2_tai *tai, u32 tstamp,
+struct skb_shared_hwtstamps *hwtstamp)
+{
+}
+static inline void mvpp22_tai_start(struct mvpp2_tai *tai)
+{
+}
+static inline void mvpp22_tai_stop(struct mvpp2_tai *tai)
+{
+}
+static inline int mvpp22_tai_ptp_clock_index(struct mvpp2_tai *tai)
+{
+   return -1;
+}
 #endif
 
+static inline bool mvpp22_rx_hwtstamping(struct mvpp2_port *port)
+{
+   return IS_ENABLED(CONFIG_MVPP2_PTP) && port->rx_hwtstamp;
+}
 #endif
diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c 
b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
index d064e4b20df0..c20fde0fc73c 100644
--- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
+++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
@@ -3449,7 +3449,7 @@ static int mvpp2_rx(struct mvpp2_port *port, struct 
napi_struct *napi,
unsigned int frag_size;
dma_addr_t dma_addr;
phys_addr_t phys_addr;
-   u32 rx_status;
+   u32 rx_status, timestamp;
int pool, rx_bytes, err, ret;
void *data;
 
@@ -3527,6 +3527,15 @@ static int mvpp2_rx(struct mvpp2_port *port, struct 
napi_struct *napi,
goto err_drop_frame;
}
 
+   /* If we have RX hardware timestamping enabled, grab the
+* timestamp from the queue and convert.
+*/
+   if (mvpp22_rx_hwtstamping(port)) {
+   timestamp = le32_to_cpu(rx_desc->pp22.timestamp);
+   mvpp22_tai_tstamp(port->priv->tai, timestamp,
+skb_hwtstamps(skb));
+   }
+
err = mvpp2_rx_refill(port, bm_pool, pp, pool);
if (err) {
netdev_err(port->dev, "failed to refill BM pools\n");
@@ -4561,10 +4570,100 @@ mvpp2_get_stats64(struct net_device *dev, struct 
rtnl_link_stats64 *stats)
stats->tx_dropped   = dev->stats.tx_dropped;
 }
 
+static int mvpp2_set_ts_config(struct mvpp2_port *port, struct ifreq *ifr)
+{
+   struct hwtstamp_config config;
+   void __iomem *ptp;
+
+   if (copy_from_user(&config, ifr->ifr_data, sizeof(config)))
+   return -EFAULT;
+
+   if (config.flags)
+   return -EINVAL;
+
+   if (config.tx_type != HWTSTAMP_TX_OFF)
+  

Re: [PATCH v2 net-next 2/9] xdp: initialize xdp_buff mb bit to 0 in all XDP drivers

2020-09-04 Thread Jesper Dangaard Brouer
On Thu,  3 Sep 2020 22:58:46 +0200
Lorenzo Bianconi  wrote:

> diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c 
> b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> index 0b675c34ce49..20c8fd3cd4a3 100644
> --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> @@ -2298,6 +2298,7 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector 
> *q_vector,
>  #if (PAGE_SIZE < 8192)
>   xdp.frame_sz = ixgbe_rx_frame_truesize(rx_ring, 0);
>  #endif
> + xdp.mb = 0;
>  
>   while (likely(total_rx_packets < budget)) {
>   union ixgbe_adv_rx_desc *rx_desc;

In this ixgbe driver you are smart and init the xdp.mb bit outside the
loop (like xdp.frame_sz, when frame_sz is constant).  This is a nice
optimization, but the driver developer who adds XDP multi-buffer
support must remember to reset it.  The patch itself is okay; it is
just something to keep in mind when reviewing/changing drivers.

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer



Re: [PATCH v2 net-next 0/9] mvneta: introduce XDP multi-buffer support

2020-09-04 Thread Lorenzo Bianconi
> Lorenzo Bianconi wrote:
> > - Finalize XDP multi-buffer support for mvneta driver introducing the
> >   capability to map non-linear buffers on tx side.
> > - Introduce multi-buffer bit (mb) in xdp_frame/xdp_buffer to specify if
> >   shared_info area has been properly initialized.
> > - Initialize multi-buffer bit (mb) to 0 in all XDP-capable drivers.
> > - Add multi-buff support to xdp_return_{buff/frame} utility routines.
> > - Introduce bpf_xdp_adjust_mb_header helper to adjust frame headers moving
> >   *offset* bytes from/to the second buffer to/from the first one.
> >   This helper can be used to move headers when the hw DMA SG is not able
> >   to copy all the headers in the first fragment and split header and data
> >   pages. A possible use case for bpf_xdp_adjust_mb_header is described
> >   here [0]
> 
> Are those slides available anywhere? [0] is just a link to the abstract.

Yes, sorry. I wanted to point out where we got the idea for this helper.
I do not think the slides are available yet, but I guess they will be soon.

> 
> > - Introduce bpf_xdp_get_frag_count and bpf_xdp_get_frags_total_size helpers 
> > to
> >   report the total number/size of frags for a given xdp multi-buff.
> > 
> > XDP multi-buffer design principles are described here [1]
> > For the moment we have not implemented any self-test for the introduced the 
> > bpf
> > helpers. We can address this in a follow up series if the proposed approach
> > is accepted.
> 
> Will need to include selftests with series.

Sure, I will add selftests in v3.

Regards,
Lorenzo

> 
> > 
> > Changes since v1:
> > - Fix use-after-free in xdp_return_{buff/frame}
> > - Introduce bpf helpers
> > - Introduce xdp_mb sample program
> > - access skb_shared_info->nr_frags only on the last fragment
> > 
> > Changes since RFC:
> > - squash multi-buffer bit initialization in a single patch
> > - add mvneta non-linear XDP buff support for tx side
> > 
> > [0] 
> > https://netdevconf.info/0x14/session.html?talk-the-path-to-tcp-4k-mtu-and-rx-zerocopy
> > [1] 
> > https://github.com/xdp-project/xdp-project/blob/master/areas/core/xdp-multi-buffer01-design.org
> > 
> > Lorenzo Bianconi (7):
> >   xdp: introduce mb in xdp_buff/xdp_frame
> >   xdp: initialize xdp_buff mb bit to 0 in all XDP drivers
> >   net: mvneta: update mb bit before passing the xdp buffer to eBPF layer
> >   xdp: add multi-buff support to xdp_return_{buff/frame}
> >   net: mvneta: add multi buffer support to XDP_TX
> >   bpf: helpers: add bpf_xdp_adjust_mb_header helper
> >   net: mvneta: enable jumbo frames for XDP
> > 
> > Sameeh Jubran (2):
> >   bpf: helpers: add multibuffer support
> >   samples/bpf: add bpf program that uses xdp mb helpers
> > 
> >  drivers/net/ethernet/amazon/ena/ena_netdev.c  |   1 +
> >  drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c |   1 +
> >  .../net/ethernet/cavium/thunder/nicvf_main.c  |   1 +
> >  .../net/ethernet/freescale/dpaa2/dpaa2-eth.c  |   1 +
> >  drivers/net/ethernet/intel/i40e/i40e_txrx.c   |   1 +
> >  drivers/net/ethernet/intel/ice/ice_txrx.c |   1 +
> >  drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |   1 +
> >  .../net/ethernet/intel/ixgbevf/ixgbevf_main.c |   1 +
> >  drivers/net/ethernet/marvell/mvneta.c | 126 ++--
> >  .../net/ethernet/marvell/mvpp2/mvpp2_main.c   |   1 +
> >  drivers/net/ethernet/mellanox/mlx4/en_rx.c|   1 +
> >  .../net/ethernet/mellanox/mlx5/core/en_rx.c   |   1 +
> >  .../ethernet/netronome/nfp/nfp_net_common.c   |   1 +
> >  drivers/net/ethernet/qlogic/qede/qede_fp.c|   1 +
> >  drivers/net/ethernet/sfc/rx.c |   1 +
> >  drivers/net/ethernet/socionext/netsec.c   |   1 +
> >  drivers/net/ethernet/ti/cpsw.c|   1 +
> >  drivers/net/ethernet/ti/cpsw_new.c|   1 +
> >  drivers/net/hyperv/netvsc_bpf.c   |   1 +
> >  drivers/net/tun.c |   2 +
> >  drivers/net/veth.c|   1 +
> >  drivers/net/virtio_net.c  |   2 +
> >  drivers/net/xen-netfront.c|   1 +
> >  include/net/xdp.h |  26 ++-
> >  include/uapi/linux/bpf.h  |  39 +++-
> >  net/core/dev.c|   1 +
> >  net/core/filter.c |  93 +
> >  net/core/xdp.c|  40 
> >  samples/bpf/Makefile  |   3 +
> >  samples/bpf/xdp_mb_kern.c |  68 +++
> >  samples/bpf/xdp_mb_user.c | 182 ++
> >  tools/include/uapi/linux/bpf.h|  40 +++-
> >  32 files changed, 572 insertions(+), 70 deletions(-)
> >  create mode 100644 samples/bpf/xdp_mb_kern.c
> >  create mode 100644 samples/bpf/xdp_mb_user.c
> > 
> > -- 
> > 2.26.2
> > 
> 
> 


signature.asc
Description: PGP signature


Re: [PATCH v2 net-next 0/9] mvneta: introduce XDP multi-buffer support

2020-09-04 Thread Lorenzo Bianconi
> On Thu, Sep 03, 2020 at 10:58:44PM +0200, Lorenzo Bianconi wrote:
> > For the moment we have not implemented any self-test for the introduced the 
> > bpf
> > helpers. We can address this in a follow up series if the proposed approach
> > is accepted.
> 
> selftest has to be part of the same patch set.

sure, I will add it in v3.

Regards,
Lorenzo


signature.asc
Description: PGP signature


Re: [PATCH v2 net-next 6/9] bpf: helpers: add bpf_xdp_adjust_mb_header helper

2020-09-04 Thread Lorenzo Bianconi
> On Thu, Sep 03, 2020 at 10:58:50PM +0200, Lorenzo Bianconi wrote:
> > +BPF_CALL_2(bpf_xdp_adjust_mb_header, struct  xdp_buff *, xdp,
> > +  int, offset)
> > +{
> > +   void *data_hard_end, *data_end;
> > +   struct skb_shared_info *sinfo;
> > +   int frag_offset, frag_len;
> > +   u8 *addr;
> > +
> > +   if (!xdp->mb)
> > +   return -EOPNOTSUPP;
> > +
> > +   sinfo = xdp_get_shared_info_from_buff(xdp);
> > +
> > +   frag_len = skb_frag_size(&sinfo->frags[0]);
> > +   if (offset > frag_len)
> > +   return -EINVAL;
> > +
> > +   frag_offset = skb_frag_off(&sinfo->frags[0]);
> > +   data_end = xdp->data_end + offset;
> > +
> > +   if (offset < 0 && (-offset > frag_offset ||
> > +  data_end < xdp->data + ETH_HLEN))
> > +   return -EINVAL;
> > +
> > +   data_hard_end = xdp_data_hard_end(xdp); /* use xdp->frame_sz */
> > +   if (data_end > data_hard_end)
> > +   return -EINVAL;
> > +
> > +   addr = page_address(skb_frag_page(&sinfo->frags[0])) + frag_offset;
> > +   if (offset > 0) {
> > +   memcpy(xdp->data_end, addr, offset);
> > +   } else {
> > +   memcpy(addr + offset, xdp->data_end + offset, -offset);
> > +   memset(xdp->data_end + offset, 0, -offset);
> > +   }
> > +
> > +   skb_frag_size_sub(&sinfo->frags[0], offset);
> > +   skb_frag_off_add(&sinfo->frags[0], offset);
> > +   xdp->data_end = data_end;
> > +
> > +   return 0;
> > +}
> 
> wait a sec. Are you saying that multi buffer XDP actually should be skb based?
> If that's what mvneta driver is doing that's fine, but that is not a
> reasonable requirement to put on all other drivers.

I did not get what you mean here. The xdp multi-buffer layout uses the
skb_shared_info at the end of the first buffer to link subsequent frames [0],
and we rely on the skb_frag* utilities to set/read the offset and length of
subsequent buffers.

Regards,
Lorenzo

[0] 
http://people.redhat.com/lbiancon/conference/NetDevConf2020-0x14/add-xdp-on-driver.html
 - XDP multi-buffers section


signature.asc
Description: PGP signature


[net-next:master 68/116] ip_tunnel.c:undefined reference to `sysctl_fb_tunnels_only_for_init_net'

2020-09-04 Thread kernel test robot
tree:   https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git 
master
head:   3ab1270bfa1e8ae7db0d46fee90c5db2935ac91b
commit: 316cdaa1158af17250397054f92bb339fbd8e282 [68/116] net: add option to 
not create fall-back tunnels in root-ns as well
config: m68k-amcore_defconfig (attached as .config)
compiler: m68k-linux-gcc (GCC) 9.3.0
reproduce (this is a W=1 build):
wget 
https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O 
~/bin/make.cross
chmod +x ~/bin/make.cross
git checkout 316cdaa1158af17250397054f92bb339fbd8e282
# save the attached .config to linux build tree
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross 
ARCH=m68k 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot 

All errors (new ones prefixed by >>):

   m68k-linux-ld: net/ipv4/ip_tunnel.o: in function `ip_tunnel_init_net':
>> ip_tunnel.c:(.text+0x1be2): undefined reference to 
>> `sysctl_fb_tunnels_only_for_init_net'
   m68k-linux-ld: net/ipv6/sit.o: in function `sit_init_net':
>> sit.c:(.init.text+0x104): undefined reference to 
>> `sysctl_fb_tunnels_only_for_init_net'
>> m68k-linux-ld: sit.c:(.init.text+0x10c): undefined reference to 
>> `sysctl_fb_tunnels_only_for_init_net'

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-...@lists.01.org


.config.gz
Description: application/gzip


Re: [PATCH v3 bpf-next 14/14] selftests/bpf: add __noinline variant of cls_redirect selftest

2020-09-04 Thread Lorenz Bauer
On Thu, 3 Sep 2020 at 21:36, Andrii Nakryiko  wrote:
>
> As one of the most complicated and close-to-real-world programs, cls_redirect
> is a good candidate to exercise libbpf's logic of handling bpf2bpf calls. So
> add variant with using explicit __noinline for majority of functions except
> few most basic ones. If those few functions are inlined, verifier starts to
> complain about program instruction limit of 1mln instructions being exceeded,
> most probably due to instruction overhead of doing a sub-program call.
> Convert user-space part of selftest to have two sub-tests: with and without
> inlining.

Acked-by: Lorenz Bauer 

>
> Cc: Lorenz Bauer 
> Signed-off-by: Andrii Nakryiko 
> ---
>  .../selftests/bpf/prog_tests/cls_redirect.c   |  72 +---
>  .../selftests/bpf/progs/test_cls_redirect.c   | 105 ++
>  .../bpf/progs/test_cls_redirect_subprogs.c|   2 +
>  3 files changed, 115 insertions(+), 64 deletions(-)
>  create mode 100644 
> tools/testing/selftests/bpf/progs/test_cls_redirect_subprogs.c
>
> diff --git a/tools/testing/selftests/bpf/prog_tests/cls_redirect.c 
> b/tools/testing/selftests/bpf/prog_tests/cls_redirect.c
> index f259085cca6a..9781d85cb223 100644
> --- a/tools/testing/selftests/bpf/prog_tests/cls_redirect.c
> +++ b/tools/testing/selftests/bpf/prog_tests/cls_redirect.c
> @@ -12,10 +12,13 @@
>
>  #include "progs/test_cls_redirect.h"
>  #include "test_cls_redirect.skel.h"
> +#include "test_cls_redirect_subprogs.skel.h"
>
>  #define ENCAP_IP INADDR_LOOPBACK
>  #define ENCAP_PORT (1234)
>
> +static int duration = 0;
> +
>  struct addr_port {
> in_port_t port;
> union {
> @@ -361,30 +364,18 @@ static void close_fds(int *fds, int n)
> close(fds[i]);
>  }
>
> -void test_cls_redirect(void)
> +static void test_cls_redirect_common(struct bpf_program *prog)
>  {
> -   struct test_cls_redirect *skel = NULL;
> struct bpf_prog_test_run_attr tattr = {};
> int families[] = { AF_INET, AF_INET6 };
> struct sockaddr_storage ss;
> struct sockaddr *addr;
> socklen_t slen;
> int i, j, err;
> -
> int servers[__NR_KIND][ARRAY_SIZE(families)] = {};
> int conns[__NR_KIND][ARRAY_SIZE(families)] = {};
> struct tuple tuples[__NR_KIND][ARRAY_SIZE(families)];
>
> -   skel = test_cls_redirect__open();
> -   if (CHECK_FAIL(!skel))
> -   return;
> -
> -   skel->rodata->ENCAPSULATION_IP = htonl(ENCAP_IP);
> -   skel->rodata->ENCAPSULATION_PORT = htons(ENCAP_PORT);
> -
> -   if (CHECK_FAIL(test_cls_redirect__load(skel)))
> -   goto cleanup;
> -
> addr = (struct sockaddr *)&ss;
> for (i = 0; i < ARRAY_SIZE(families); i++) {
> slen = prepare_addr(&ss, families[i]);
> @@ -402,7 +393,7 @@ void test_cls_redirect(void)
> goto cleanup;
> }
>
> -   tattr.prog_fd = bpf_program__fd(skel->progs.cls_redirect);
> +   tattr.prog_fd = bpf_program__fd(prog);
> for (i = 0; i < ARRAY_SIZE(tests); i++) {
> struct test_cfg *test = &tests[i];
>
> @@ -450,7 +441,58 @@ void test_cls_redirect(void)
> }
>
>  cleanup:
> -   test_cls_redirect__destroy(skel);
> close_fds((int *)servers, sizeof(servers) / sizeof(servers[0][0]));
> close_fds((int *)conns, sizeof(conns) / sizeof(conns[0][0]));
>  }
> +
> +static void test_cls_redirect_inlined(void)
> +{
> +   struct test_cls_redirect *skel;
> +   int err;
> +
> +   skel = test_cls_redirect__open();
> +   if (CHECK(!skel, "skel_open", "failed\n"))
> +   return;
> +
> +   skel->rodata->ENCAPSULATION_IP = htonl(ENCAP_IP);
> +   skel->rodata->ENCAPSULATION_PORT = htons(ENCAP_PORT);
> +
> +   err = test_cls_redirect__load(skel);
> +   if (CHECK(err, "skel_load", "failed: %d\n", err))
> +   goto cleanup;
> +
> +   test_cls_redirect_common(skel->progs.cls_redirect);
> +
> +cleanup:
> +   test_cls_redirect__destroy(skel);
> +}
> +
> +static void test_cls_redirect_subprogs(void)
> +{
> +   struct test_cls_redirect_subprogs *skel;
> +   int err;
> +
> +   skel = test_cls_redirect_subprogs__open();
> +   if (CHECK(!skel, "skel_open", "failed\n"))
> +   return;
> +
> +   skel->rodata->ENCAPSULATION_IP = htonl(ENCAP_IP);
> +   skel->rodata->ENCAPSULATION_PORT = htons(ENCAP_PORT);
> +
> +   err = test_cls_redirect_subprogs__load(skel);
> +   if (CHECK(err, "skel_load", "failed: %d\n", err))
> +   goto cleanup;
> +
> +   test_cls_redirect_common(skel->progs.cls_redirect);
> +
> +cleanup:
> +   test_cls_redirect_subprogs__destroy(skel);
> +}
> +
> +void test_cls_redirect(void)
> +{
> +   if (test__start_subtest("cls_redirect_inlined"))
> +   test_cls_redirect_inlined();
> +   if (test__start_subtest("cls_redirect_subprogs"))
> +   test_cls_redirect_subpro

Re: [PATCH v2 net-next 2/9] xdp: initialize xdp_buff mb bit to 0 in all XDP drivers

2020-09-04 Thread Lorenzo Bianconi
> On Thu,  3 Sep 2020 22:58:46 +0200
> Lorenzo Bianconi  wrote:
> 
> > diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c 
> > b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> > index 0b675c34ce49..20c8fd3cd4a3 100644
> > --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> > +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> > @@ -2298,6 +2298,7 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector 
> > *q_vector,
> >  #if (PAGE_SIZE < 8192)
> > xdp.frame_sz = ixgbe_rx_frame_truesize(rx_ring, 0);
> >  #endif
> > +   xdp.mb = 0;
> >  
> > while (likely(total_rx_packets < budget)) {
> > union ixgbe_adv_rx_desc *rx_desc;
> 
> In this ixgbe driver you are smart and init the xdp.mb bit outside the
> loop (like xdp.frame_sz, when frame_sz is constant).  This is a nice
> optimization, but the driver developer who adds XDP multi-buffer
> support must remember to reset it.  The patch itself is okay; it is
> just something to keep in mind when reviewing/changing drivers.

yes, I have just decided to avoid unnecessary instructions for the moment.

Regards,
Lorenzo

> 
> -- 
> Best regards,
>   Jesper Dangaard Brouer
>   MSc.CS, Principal Kernel Engineer at Red Hat
>   LinkedIn: http://www.linkedin.com/in/brouer
> 


signature.asc
Description: PGP signature


Re: [PATCH v2 net-next 6/9] bpf: helpers: add bpf_xdp_adjust_mb_header helper

2020-09-04 Thread Lorenzo Bianconi
On Sep 03, Alexei Starovoitov wrote:
> On Thu, Sep 03, 2020 at 10:58:50PM +0200, Lorenzo Bianconi wrote:
> > Introduce bpf_xdp_adjust_mb_header helper in order to adjust frame
> > headers moving *offset* bytes from/to the second buffer to/from the
> > first one.
> > This helper can be used to move headers when the hw DMA SG is not able
> > to copy all the headers in the first fragment and split header and data
> > pages.
> > 
> > Signed-off-by: Lorenzo Bianconi 
> > ---
> >  include/uapi/linux/bpf.h   | 25 
> >  net/core/filter.c  | 54 ++
> >  tools/include/uapi/linux/bpf.h | 26 
> >  3 files changed, 95 insertions(+), 10 deletions(-)
> > 
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > index 8dda13880957..c4a6d245619c 100644
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > @@ -3571,11 +3571,25 @@ union bpf_attr {
> >   * value.
> >   *
> >   * long bpf_copy_from_user(void *dst, u32 size, const void *user_ptr)
> > - * Description
> > - * Read *size* bytes from user space address *user_ptr* 
> > and store
> > - * the data in *dst*. This is a wrapper of 
> > copy_from_user().
> > - * Return
> > - * 0 on success, or a negative error in case of failure.
> > + * Description
> > + * Read *size* bytes from user space address *user_ptr* and store
> > + * the data in *dst*. This is a wrapper of copy_from_user().
> > + *
> > + * long bpf_xdp_adjust_mb_header(struct xdp_buff *xdp_md, int offset)
> 
> botched rebase?

Yes, sorry. I will fix in v3.

Regards,
Lorenzo


signature.asc
Description: PGP signature


Re: [PATCH net-next] net: sch_generic: aviod concurrent reset and enqueue op for lockless qdisc

2020-09-04 Thread Yunsheng Lin
On 2020/9/4 9:30, John Fastabend wrote:
> Cong Wang wrote:
>> On Wed, Sep 2, 2020 at 7:22 PM Yunsheng Lin  wrote:
>>>
>>> On 2020/9/3 9:48, Cong Wang wrote:
 On Wed, Sep 2, 2020 at 6:22 PM Yunsheng Lin  wrote:
>
> On 2020/9/3 8:35, Cong Wang wrote:
>> On Tue, Sep 1, 2020 at 11:35 PM Yunsheng Lin  
>> wrote:
>>>
>>> On 2020/9/2 12:41, Cong Wang wrote:
 On Tue, Sep 1, 2020 at 6:42 PM Yunsheng Lin  
 wrote:
>
> On 2020/9/2 2:24, Cong Wang wrote:
>> On Mon, Aug 31, 2020 at 5:59 PM Yunsheng Lin 
>>  wrote:
>>>
>>> Currently there is concurrent reset and enqueue operation for the
>>> same lockless qdisc when there is no lock to synchronize the
>>> q->enqueue() in __dev_xmit_skb() with the qdisc reset operation in
>>> qdisc_deactivate() called by dev_deactivate_queue(), which may cause
>>> out-of-bounds access for priv->ring[] in hns3 driver if user has
>>> requested a smaller queue num when __dev_xmit_skb() still enqueue a
>>> skb with a larger queue_mapping after the corresponding qdisc is
>>> reset, and call hns3_nic_net_xmit() with that skb later.
>>
>> Can you be more specific here? Which call path requests a smaller
>> tx queue num? If you mean netif_set_real_num_tx_queues(), clearly
>> we already have a synchronize_net() there.
>
> When the netdevice is in active state, the synchronize_net() seems to
> do the correct work, as below:
>
> CPU 0:   CPU1:
> __dev_queue_xmit()   
> netif_set_real_num_tx_queues()
> rcu_read_lock_bh();
> netdev_core_pick_tx(dev, skb, sb_dev);
> .
> .   dev->real_num_tx_queues = txq;
> .   .
> .   .
> .   synchronize_net();
> .   .
> q->enqueue().
> .   .
> rcu_read_unlock_bh().
> qdisc_reset_all_tx_gt
>
>

 Right.


> but dev->real_num_tx_queues is not RCU-protected, maybe that is a 
> problem
> too.
>
> The problem we hit is as below:
> In hns3_set_channels(), hns3_reset_notify(h, HNAE3_DOWN_CLIENT) is 
> called
> to deactive the netdevice when user requested a smaller queue num, and
> txq->qdisc is already changed to noop_qdisc when calling
> netif_set_real_num_tx_queues(), so the synchronize_net() in the 
> function
> netif_set_real_num_tx_queues() does not help here.

 How could qdisc still be running after deactivating the device?
>>>
>>> qdisc could be running during the device deactivating process.
>>>
>>> The main process of changing channel number is as below:
>>>
>>> 1. dev_deactivate()
>>> 2. hns3 handware related setup
>>> 3. netif_set_real_num_tx_queues()
>>> 4. netif_tx_wake_all_queues()
>>> 5. dev_activate()
>>>
>>> During step 1, qdisc could be running while qdisc is resetting, so
>>> there could be skb left in the old qdisc(which will be restored back to
>>> txq->qdisc during dev_activate()), as below:
>>>
>>> CPU 0:   CPU1:
>>> __dev_queue_xmit():  dev_deactivate_many():
>>> rcu_read_lock_bh();  qdisc_deactivate(qdisc);
>>> q = rcu_dereference_bh(txq->qdisc); .
>>> netdev_core_pick_tx(dev, skb, sb_dev);  .
>>> .
>>> .   
>>> rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
>>> .   .
>>> .   .
>>> .   .
>>> .   .
>>> q->enqueue().
>>
>>
>> Well, like I said, if the deactivated bit were tested before ->enqueue(),
>> there would be no packet queued after qdisc_deactivate().
> 
> Trying to unwind this through git history :/
> 
> Original code had a test_bit in dev_xmit_skb(),
> 
>   if (q->flags & TCQ_F_NOLOCK) {
>   if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
>   __qdisc_drop(skb, &to_free);
>   rc = NET_XMIT_DROP;
>   } else {
>   rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
>   

Re: [PATCH net-next] net: sch_generic: aviod concurrent reset and enqueue op for lockless qdisc

2020-09-04 Thread Yunsheng Lin
On 2020/9/3 15:24, Eric Dumazet wrote:
> 
> 
> On 9/2/20 6:14 PM, Yunsheng Lin wrote:
> 
>>
>> It seems the semantics of some_qdisc_is_busy() are changed: it does not only
>> do the checking, but also does the resetting?
> 
> Yes, obviously, we would have to rename to a better name.
> 
>>
>> Also, qdisc_reset() could be called multiple times for the same qdisc if
>> some_qdisc_is_busy() returns true multiple times?
> 
> This should not matter, qdisc_reset() can be called multiple times,
> as we also call it from qdisc_destroy() anyway.

How about the patch below, which does not need to change the semantics
of some_qdisc_is_busy() and avoids calling qdisc_reset() multiple times?


diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 265a61d..ce9031c 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -1131,24 +1131,7 @@ EXPORT_SYMBOL(dev_activate);

 static void qdisc_deactivate(struct Qdisc *qdisc)
 {
-   bool nolock = qdisc->flags & TCQ_F_NOLOCK;
-
-   if (qdisc->flags & TCQ_F_BUILTIN)
-   return;
-   if (test_bit(__QDISC_STATE_DEACTIVATED, &qdisc->state))
-   return;
-
-   if (nolock)
-   spin_lock_bh(&qdisc->seqlock);
-   spin_lock_bh(qdisc_lock(qdisc));
-
set_bit(__QDISC_STATE_DEACTIVATED, &qdisc->state);
-
-   qdisc_reset(qdisc);
-
-   spin_unlock_bh(qdisc_lock(qdisc));
-   if (nolock)
-   spin_unlock_bh(&qdisc->seqlock);
 }

 static void dev_deactivate_queue(struct net_device *dev,
@@ -1165,6 +1148,33 @@ static void dev_deactivate_queue(struct net_device *dev,
}
 }

+static void dev_reset_qdisc(struct net_device *dev)
+{
+   unsigned int i;
+
+   for (i = 0; i < dev->num_tx_queues; i++) {
+   struct netdev_queue *dev_queue;
+   struct Qdisc *q;
+   bool nolock;
+
+   dev_queue = netdev_get_tx_queue(dev, i);
+   q = dev_queue->qdisc_sleeping;
+   nolock = q->flags & TCQ_F_NOLOCK;
+
+   if (nolock)
+   spin_lock_bh(&q->seqlock);
+
+   spin_lock_bh(qdisc_lock(q));
+
+   qdisc_reset(q);
+
+   spin_unlock_bh(qdisc_lock(q));
+
+   if (nolock)
+   spin_unlock_bh(&q->seqlock);
+   }
+}
+
 static bool some_qdisc_is_busy(struct net_device *dev)
 {
unsigned int i;
@@ -1219,6 +1229,9 @@ void dev_deactivate_many(struct list_head *head)
 */
synchronize_net();

+   list_for_each_entry(dev, head, close_list)
+   dev_reset_qdisc(dev);
+
/* Wait for outstanding qdisc_run calls. */
list_for_each_entry(dev, head, close_list) {
while (some_qdisc_is_busy(dev)) {



> 
> 


[PATCH v2] net: dsa: microchip: look for phy-mode in port nodes

2020-09-04 Thread Helmut Grohne
Documentation/devicetree/bindings/net/dsa/dsa.txt says that the phy-mode
property should be specified on port nodes. However, the microchip
drivers read it from the switch node.

Let the driver use the per-port property and fall back to the old
location with a warning.

Fix in-tree users.

Signed-off-by: Helmut Grohne 
Link: https://lore.kernel.org/netdev/20200617082235.GA1523@laureti-dev/
---
 arch/arm/boot/dts/at91-sama5d2_icp.dts |  2 +-
 drivers/net/dsa/microchip/ksz8795.c| 17 +++-
 drivers/net/dsa/microchip/ksz9477.c| 28 +-
 drivers/net/dsa/microchip/ksz_common.c | 13 +++-
 drivers/net/dsa/microchip/ksz_common.h |  3 ++-
 5 files changed, 45 insertions(+), 18 deletions(-)

Changes since v1:
 * Preserve the reverse christmas tree ordering of local variables.
   Reported by David Miller.

Reason for resending v1:
 * While Andrew Lunn agreed to the semantic change, he found the
   implementation unnecessarily complex. He suggested going without a
   per-port interface attribute, but that happened to not work out. The
   information of which port will become the cpu port is only realized
   in a later initialization step.

There were no further replies, so here goes a v2 with minimal changes.

Helmut

diff --git a/arch/arm/boot/dts/at91-sama5d2_icp.dts 
b/arch/arm/boot/dts/at91-sama5d2_icp.dts
index 8d19925fc09e..6783cf16ff81 100644
--- a/arch/arm/boot/dts/at91-sama5d2_icp.dts
+++ b/arch/arm/boot/dts/at91-sama5d2_icp.dts
@@ -116,7 +116,6 @@
switch0: ksz8563@0 {
compatible = "microchip,ksz8563";
reg = <0>;
-   phy-mode = "mii";
reset-gpios = <&pioA PIN_PD4 GPIO_ACTIVE_LOW>;
 
spi-max-frequency = <50>;
@@ -140,6 +139,7 @@
reg = <2>;
label = "cpu";
ethernet = <&macb0>;
+   phy-mode = "mii";
fixed-link {
speed = <100>;
full-duplex;
diff --git a/drivers/net/dsa/microchip/ksz8795.c 
b/drivers/net/dsa/microchip/ksz8795.c
index 8f1d15ea15d9..cae77eafd533 100644
--- a/drivers/net/dsa/microchip/ksz8795.c
+++ b/drivers/net/dsa/microchip/ksz8795.c
@@ -932,11 +932,18 @@ static void ksz8795_port_setup(struct ksz_device *dev, 
int port, bool cpu_port)
ksz_port_cfg(dev, port, P_PRIO_CTRL, PORT_802_1P_ENABLE, true);
 
if (cpu_port) {
+   if (!p->interface && dev->compat_interface) {
+   dev_warn(dev->dev,
+"Using legacy switch \"phy-mode\" missing on 
port %d node. Please update your device tree.\n",
+port);
+   p->interface = dev->compat_interface;
+   }
+
/* Configure MII interface for proper network communication. */
ksz_read8(dev, REG_PORT_5_CTRL_6, &data8);
data8 &= ~PORT_INTERFACE_TYPE;
data8 &= ~PORT_GMII_1GPS_MODE;
-   switch (dev->interface) {
+   switch (p->interface) {
case PHY_INTERFACE_MODE_MII:
p->phydev.speed = SPEED_100;
break;
@@ -952,11 +959,11 @@ static void ksz8795_port_setup(struct ksz_device *dev, 
int port, bool cpu_port)
default:
data8 &= ~PORT_RGMII_ID_IN_ENABLE;
data8 &= ~PORT_RGMII_ID_OUT_ENABLE;
-   if (dev->interface == PHY_INTERFACE_MODE_RGMII_ID ||
-   dev->interface == PHY_INTERFACE_MODE_RGMII_RXID)
+   if (p->interface == PHY_INTERFACE_MODE_RGMII_ID ||
+   p->interface == PHY_INTERFACE_MODE_RGMII_RXID)
data8 |= PORT_RGMII_ID_IN_ENABLE;
-   if (dev->interface == PHY_INTERFACE_MODE_RGMII_ID ||
-   dev->interface == PHY_INTERFACE_MODE_RGMII_TXID)
+   if (p->interface == PHY_INTERFACE_MODE_RGMII_ID ||
+   p->interface == PHY_INTERFACE_MODE_RGMII_TXID)
data8 |= PORT_RGMII_ID_OUT_ENABLE;
data8 |= PORT_GMII_1GPS_MODE;
data8 |= PORT_INTERFACE_RGMII;
diff --git a/drivers/net/dsa/microchip/ksz9477.c 
b/drivers/net/dsa/microchip/ksz9477.c
index 3cb22d149813..89e8934bc60b 100644
--- a/drivers/net/dsa/microchip/ksz9477.c
+++ b/drivers/net/dsa/microchip/ksz9477.c
@@ -1208,7 +1208,7 @@ static void ksz9477_port_setup(struct ksz_device *dev, 
int port, bool cpu_port)
 
/* configure MAC to 1G & RGMII mode */
ksz_pread8(dev, port, REG_PORT_XMII_CTRL_1, &data8);
-

Re: Exposing device ACL setting through devlink

2020-09-04 Thread Jiri Pirko
Thu, Sep 03, 2020 at 07:59:45PM CEST, tlfal...@linux.ibm.com wrote:
>Hello, I am trying to expose MAC/VLAN ACL and pvid settings for IBM VNIC 
>devices to administrators through devlink (originally through sysfs files, but 
>that was rejected in favor of devlink). Could you give any tips on how you 
>might go about doing this?

Tom, I believe you need to provide more info about what exactly you
need to set up. But from what you wrote, it seems like you are looking
for bridge/tc offload. The infra is already in place and drivers are
implementing it. See mlxsw for example.


[PATCH net v1 1/2] hinic: bump up the timeout of SET_FUNC_STATE cmd

2020-09-04 Thread Luo bin
We free memory regardless of the return value of SET_FUNC_STATE
cmd in hinic_close function to avoid memory leak and this cmd may
timeout when fw is busy with handling other cmds, so we bump up the
timeout of this cmd to ensure it won't return failure.

Fixes: 00e57a6d4ad3 ("net-next/hinic: Add Tx operation")
Signed-off-by: Luo bin 
---
 .../net/ethernet/huawei/hinic/hinic_hw_mgmt.c| 16 +++-
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_mgmt.c 
b/drivers/net/ethernet/huawei/hinic/hinic_hw_mgmt.c
index c6ce5966284c..0d56c6ceccd9 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_hw_mgmt.c
+++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_mgmt.c
@@ -47,6 +47,8 @@
 
 #define MGMT_MSG_TIMEOUT5000
 
+#define SET_FUNC_PORT_MBOX_TIMEOUT 3
+
 #define SET_FUNC_PORT_MGMT_TIMEOUT 25000
 
 #define mgmt_to_pfhwdev(pf_mgmt)\
@@ -361,16 +363,20 @@ int hinic_msg_to_mgmt(struct hinic_pf_to_mgmt *pf_to_mgmt,
return -EINVAL;
}
 
-   if (cmd == HINIC_PORT_CMD_SET_FUNC_STATE)
-   timeout = SET_FUNC_PORT_MGMT_TIMEOUT;
+   if (HINIC_IS_VF(hwif)) {
+   if (cmd == HINIC_PORT_CMD_SET_FUNC_STATE)
+   timeout = SET_FUNC_PORT_MBOX_TIMEOUT;
 
-   if (HINIC_IS_VF(hwif))
return hinic_mbox_to_pf(pf_to_mgmt->hwdev, mod, cmd, buf_in,
-   in_size, buf_out, out_size, 0);
-   else
+   in_size, buf_out, out_size, timeout);
+   } else {
+   if (cmd == HINIC_PORT_CMD_SET_FUNC_STATE)
+   timeout = SET_FUNC_PORT_MGMT_TIMEOUT;
+
return msg_to_mgmt_sync(pf_to_mgmt, mod, cmd, buf_in, in_size,
buf_out, out_size, MGMT_DIRECT_SEND,
MSG_NOT_RESP, timeout);
+   }
 }
 
 static void recv_mgmt_msg_work_handler(struct work_struct *work)
-- 
2.17.1



[PATCH net v1 0/2] hinic: BugFixes

2020-09-04 Thread Luo bin
The bugs fixed in this patchset have been present since the following
commits:
patch #1: Fixes: 00e57a6d4ad3 ("net-next/hinic: Add Tx operation")
patch #2: Fixes: 5e126e7c4e52 ("hinic: add firmware update support")

Luo bin (2):
  hinic: bump up the timeout of SET_FUNC_STATE cmd
  hinic: bump up the timeout of UPDATE_FW cmd

 .../net/ethernet/huawei/hinic/hinic_hw_mgmt.c | 20 ++-
 1 file changed, 15 insertions(+), 5 deletions(-)

-- 
2.17.1



[PATCH net v1 2/2] hinic: bump up the timeout of UPDATE_FW cmd

2020-09-04 Thread Luo bin
Firmware erases the entire flash region which may take several
seconds before flashing, so we bump up the timeout to ensure this
cmd won't return failure.

Fixes: 5e126e7c4e52 ("hinic: add firmware update support")
Signed-off-by: Luo bin 
---
 drivers/net/ethernet/huawei/hinic/hinic_hw_mgmt.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_mgmt.c 
b/drivers/net/ethernet/huawei/hinic/hinic_hw_mgmt.c
index 0d56c6ceccd9..2ebae6cb5db5 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_hw_mgmt.c
+++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_mgmt.c
@@ -51,6 +51,8 @@
 
 #define SET_FUNC_PORT_MGMT_TIMEOUT 25000
 
+#define UPDATE_FW_MGMT_TIMEOUT 2
+
 #define mgmt_to_pfhwdev(pf_mgmt)\
container_of(pf_mgmt, struct hinic_pfhwdev, pf_to_mgmt)
 
@@ -372,6 +374,8 @@ int hinic_msg_to_mgmt(struct hinic_pf_to_mgmt *pf_to_mgmt,
} else {
if (cmd == HINIC_PORT_CMD_SET_FUNC_STATE)
timeout = SET_FUNC_PORT_MGMT_TIMEOUT;
+   else if (cmd == HINIC_PORT_CMD_UPDATE_FW)
+   timeout = UPDATE_FW_MGMT_TIMEOUT;
 
return msg_to_mgmt_sync(pf_to_mgmt, mod, cmd, buf_in, in_size,
buf_out, out_size, MGMT_DIRECT_SEND,
-- 
2.17.1



Re: [EXT] Re: [net-next PATCH 0/2] Introduce mbox tracepoints for Octeontx2

2020-09-04 Thread Jiri Pirko
Fri, Sep 04, 2020 at 07:39:54AM CEST, sgout...@marvell.com wrote:
>
>
>> -Original Message-
>> From: Jakub Kicinski 
>> Sent: Friday, September 4, 2020 12:48 AM
>> To: sundeep.l...@gmail.com
>> Cc: da...@davemloft.net; netdev@vger.kernel.org; Sunil Kovvuri Goutham
>> ; Subbaraya Sundeep Bhatta
>> 
>> Subject: [EXT] Re: [net-next PATCH 0/2] Introduce mbox tracepoints for
>> Octeontx2
>> 
>> External Email
>> 
>> --
>> On Thu,  3 Sep 2020 12:48:16 +0530 sundeep.l...@gmail.com wrote:
>> > From: Subbaraya Sundeep 
>> >
>> > This patchset adds tracepoints support for mailbox.
>> > In Octeontx2, PFs and VFs need to communicate with AF for allocating
>> > and freeing resources. Once all the configuration is done by AF for a
>> > PF/VF then packet I/O can happen on PF/VF queues. When an interface is
>> > brought up many mailbox messages are sent to AF for initializing
>> > queues. Say a VF is brought up then each message is sent to PF and PF
>> > forwards to AF and response also traverses from AF to PF and then VF.
>> > To aid debugging, tracepoints are added at places where messages are
>> > allocated, sent and message interrupts.
>> > Below is the trace of one of the messages from VF to AF and AF
>> > response back to VF:
>> 
>> Could you use the devlink tracepoint? trace_devlink_hwmsg() ?
>
>Thanks for the suggestion.
>In our case the mailbox is central to 3 different drivers and there would be a 
>4th one
>once crypto driver is accepted. We cannot add devlink to all of them inorder 
>to use
>the devlink trace points.

I guess you have 1 pci device, right? Devlink instance is created per
pci device.


>
>Thanks,
>Sunil.


Re: [PATCH net-next 2/3] devlink: Consider other controller while building phys_port_name

2020-09-04 Thread Jiri Pirko
Thu, Sep 03, 2020 at 09:31:23PM CEST, k...@kernel.org wrote:
>On Thu, 3 Sep 2020 07:54:39 +0200 Jiri Pirko wrote:
>> Wed, Sep 02, 2020 at 05:23:58PM CEST, k...@kernel.org wrote:
>> >On Wed, 2 Sep 2020 10:00:11 +0200 Jiri Pirko wrote:  
>>  I didn't quite get the fact that you want to not show controller ID on 
>>  the local
>>  port, initially.
>> >>> Mainly to not_break current users.
>> >> 
>> >> You don't have to take it to the name, unless "external" flag is set.
>> >> 
>> >> But I don't really see the point of showing !external, cause such
>> >> controller number would be always 0. Jakub, why do you think it is
>> >> needed?  
>> >
>> >It may seem reasonable for a smartNIC where there are only two
>> >controllers, and all you really need is that external flag. 
>> >
>> >In a general case when users are trying to figure out the topology
>> >not knowing which controller they are sitting at looks like a serious
>> >limitation.  
>> 
>> I think we misunderstood each other. I never proposed just "external"
>> flag.
>
>Sorry, I was just saying that assuming a single host SmartNIC the
>controller ID is not necessary at all. You never suggested that, I did. 
>Looks like I just confused everyone with that comment :(
>
>Different controller ID for different PFs but the same PCIe link would
>be very wrong. So please clarify - if I have a 2 port smartNIC, with one
>PCIe link to the host, and the embedded controller - what would I see?

Parav?


>
>> What I propose is either:
>> 1) ecnum attribute absent for local
>>ecnum attribute absent set to 0 for external controller X
>>ecnum attribute absent set to 1 for external controller Y
>>...
>> 
>> or:
>> 2) ecnum attribute absent for local, external flag set to false
>>ecnum attribute absent set to 0 for external controller X, external flag 
>> set to true
>>ecnum attribute absent set to 1 for external controller Y, external flag 
>> set to true
>
>I'm saying that I do want to see the controller ID for all ports.
>
>So:
>
>3) local:   { "controller ID": x }
>   remote1: { "controller ID": y, "external": true }
>   remote1: { "controller ID": z, "external": true }
>
>We don't have to put the controller ID in the name for local ports, but
>the attribute should be reported. AFAIU name was your main concern, no?

Okay. Sounds fine. Let's put the controller number there for all ports.
ctrlnum X external true
ctrlnum Y external false

if (!external)
ignore the ctrlnum when generating the name


>
>> >Example - multi-host system and you want to know which controller you
>> >are to run power cycle from the BMC side.
>> >
>> >We won't be able to change that because it'd change the names for you.  


RE: [EXT] Re: [net-next PATCH 0/2] Introduce mbox tracepoints for Octeontx2

2020-09-04 Thread Sunil Kovvuri Goutham



> -Original Message-
> From: Jiri Pirko 
> Sent: Friday, September 4, 2020 2:07 PM
> To: Sunil Kovvuri Goutham 
> Cc: Jakub Kicinski ; sundeep.l...@gmail.com;
> da...@davemloft.net; netdev@vger.kernel.org; Subbaraya Sundeep
> Bhatta 
> Subject: Re: [EXT] Re: [net-next PATCH 0/2] Introduce mbox tracepoints for
> Octeontx2
> 
> Fri, Sep 04, 2020 at 07:39:54AM CEST, sgout...@marvell.com wrote:
> >
> >
> >> -Original Message-
> >> From: Jakub Kicinski 
> >> Sent: Friday, September 4, 2020 12:48 AM
> >> To: sundeep.l...@gmail.com
> >> Cc: da...@davemloft.net; netdev@vger.kernel.org; Sunil Kovvuri
> >> Goutham ; Subbaraya Sundeep Bhatta
> >> 
> >> Subject: [EXT] Re: [net-next PATCH 0/2] Introduce mbox tracepoints
> >> for
> >> Octeontx2
> >>
> >> External Email
> >>
> >> -
> >> - On Thu,  3 Sep 2020 12:48:16 +0530 sundeep.l...@gmail.com wrote:
> >> > From: Subbaraya Sundeep 
> >> >
> >> > This patchset adds tracepoints support for mailbox.
> >> > In Octeontx2, PFs and VFs need to communicate with AF for
> >> > allocating and freeing resources. Once all the configuration is
> >> > done by AF for a PF/VF then packet I/O can happen on PF/VF queues.
> >> > When an interface is brought up many mailbox messages are sent to
> >> > AF for initializing queues. Say a VF is brought up then each
> >> > message is sent to PF and PF forwards to AF and response also traverses
> from AF to PF and then VF.
> >> > To aid debugging, tracepoints are added at places where messages
> >> > are allocated, sent and message interrupts.
> >> > Below is the trace of one of the messages from VF to AF and AF
> >> > response back to VF:
> >>
> >> Could you use the devlink tracepoint? trace_devlink_hwmsg() ?
> >
> >Thanks for the suggestion.
> >In our case the mailbox is central to 3 different drivers and there
> >would be a 4th one once crypto driver is accepted. We cannot add
> >devlink to all of them inorder to use the devlink trace points.
> 
> I guess you have 1 pci device, right? Devlink instance is created per pci
> device.
> 

No, there are 3 drivers registering to 3 PCI device IDs and there can be many
instances of the same devices. So there can be 10's of instances of AF, PF and 
VFs.

Thanks,
Sunil.


Re: [PATCH nf-next v3 3/3] netfilter: Introduce egress hook

2020-09-04 Thread Laura García Liébana
Hi,

On Thu, Sep 3, 2020 at 7:00 AM John Fastabend  wrote:
>
[...]
>
> I don't think it actually improves performance at least I didn't observe
> that. From the code it's not clear why this would be the case either. As
> a nit I would prefer that line removed from the commit message.
>

It hasn't been proven to be untrue either.


[...]
>
> Do you have plans to address the performance degradation? Otherwise
> if I was building some new components its unclear why we would
> choose the slower option over the tc hook. The two suggested
> use cases security policy and DSR sound like new features, any
> reason to not just use existing infrastructure?
>

Unfortunately, tc is not an option as it is required to interact with
nft objects (sets, maps, chains, etc), more complex than just a drop.
Also, when building new features we try to maintain the application
stack as simple as possible, not trying to do ugly integrations.

I understand that you measure performance with a drop, but using this
hook we reduce the datapath consistently for these use cases and
hence, improving traffic performance.

Thank you for your time!


Re: [PATCH net-next RFC v3 01/14] devlink: Add reload action option to devlink reload command

2020-09-04 Thread Jiri Pirko
Thu, Sep 03, 2020 at 09:47:19PM CEST, k...@kernel.org wrote:
>On Thu, 3 Sep 2020 07:57:29 +0200 Jiri Pirko wrote:
>> Wed, Sep 02, 2020 at 05:30:25PM CEST, k...@kernel.org wrote:
>> >On Wed, 2 Sep 2020 11:46:27 +0200 Jiri Pirko wrote:  
>> >> >? Do we need such change there too or keep it as is, each action by 
>> >> >itself
>> >> >and return what was performed ?
>> >> 
>> >> Well, I don't know. User asks for X, X should be performed, not Y or Z.
>> >> So perhaps the return value is not needed.
>> >> Just driver advertises it supports X, Y, Z and the user says:
>> >> 1) do X, driver does X
>> >> 2) do Y, driver does Y
>> >> 3) do Z, driver does Z
>> >> [
>> >> I think this kindof circles back to the original proposal...  
>> >
>> >Why? User does not care if you activate new devlink params when
>> >activating new firmware. Trust me. So why make the user figure out
>> >which of all possible reset option they should select? If there is 
>> >a legitimate use case to limit what is reset - it should be handled
>> >by a separate negative attribute, like --live which says don't reset
>> >anything.  
>> 
>> I see. Okay. Could you please sum-up the interface as you propose it?
>
>What I proposed on v1, pass requested actions as a bitfield, driver may
>perform more actions, we can return performed actions in the response.

Okay. So for example for mlxsw, user might say:
1) I want driver reinit
kernel reports: fw reset and driver reinit was done
2) I want fw reset
kernel reports: fw reset and driver reinit was done
3) I want fw reset and driver reinit
kernel reports: fw reset and driver reinit was done

>
>Then separate attribute to carry constraints for the request, like
>--live.

Hmm, this is a bit unclear how it is supposed to work. The constraints
apply for all? I mean, the actions are requested by a bitfield.
So the user can say:
I want fw reset and driver reinit --live. "--live" applies to both fw
reset and driver reinit? That is odd.

>
>I'd think the supported actions in devlink_ops would be fine as a
>bitfield, too. Combinations are often hard to capture in static data.


Re: [PATCHv2] selftests: rtnetlink: load fou module for kci_test_encap_fou()

2020-09-04 Thread Po-Hsu Lin
Hello David,

do you need more information for this V2 patch?

Thank you
PHLin

On Mon, Aug 17, 2020 at 10:53 AM Po-Hsu Lin  wrote:
>
> The kci_test_encap_fou() test from kci_test_encap() in rtnetlink.sh
> needs the fou module to work. Otherwise it will fail with:
>
>   $ ip netns exec "$testns" ip fou add port  ipproto 47
>   RTNETLINK answers: No such file or directory
>   Error talking to the kernel
>
> Add the CONFIG_NET_FOU into the config file as well. Which needs at
> least to be set as a loadable module.
>
> Signed-off-by: Po-Hsu Lin 
> ---
>  tools/testing/selftests/net/config   | 1 +
>  tools/testing/selftests/net/rtnetlink.sh | 6 ++
>  2 files changed, 7 insertions(+)
>
> diff --git a/tools/testing/selftests/net/config 
> b/tools/testing/selftests/net/config
> index 3b42c06b..96d2763 100644
> --- a/tools/testing/selftests/net/config
> +++ b/tools/testing/selftests/net/config
> @@ -31,3 +31,4 @@ CONFIG_NET_SCH_ETF=m
>  CONFIG_NET_SCH_NETEM=y
>  CONFIG_TEST_BLACKHOLE_DEV=m
>  CONFIG_KALLSYMS=y
> +CONFIG_NET_FOU=m
> diff --git a/tools/testing/selftests/net/rtnetlink.sh 
> b/tools/testing/selftests/net/rtnetlink.sh
> index bdbf4b3..7931b65 100755
> --- a/tools/testing/selftests/net/rtnetlink.sh
> +++ b/tools/testing/selftests/net/rtnetlink.sh
> @@ -521,6 +521,11 @@ kci_test_encap_fou()
> return $ksft_skip
> fi
>
> +   if ! /sbin/modprobe -q -n fou; then
> +   echo "SKIP: module fou is not found"
> +   return $ksft_skip
> +   fi
> +   /sbin/modprobe -q fou
> ip -netns "$testns" fou add port  ipproto 47 2>/dev/null
> if [ $? -ne 0 ];then
> echo "FAIL: can't add fou port , skipping test"
> @@ -541,6 +546,7 @@ kci_test_encap_fou()
> return 1
> fi
>
> +   /sbin/modprobe -q -r fou
> echo "PASS: fou"
>  }
>
> --
> 2.7.4
>


Re: KASAN: use-after-free Read in dump_schedule

2020-09-04 Thread syzbot
syzbot has bisected this issue to:

commit 7b9eba7ba0c1b24df42b70b62d154b284befbccf
Author: Leandro Dorileo 
Date:   Mon Apr 8 17:12:17 2019 +

net/sched: taprio: fix picos_per_byte miscalculation

bisection log:  https://syzkaller.appspot.com/x/bisect.txt?x=15464af990
start commit:   fc3abb53 Merge branch 'for-linus' of git://git.kernel.org/..
git tree:   upstream
final oops: https://syzkaller.appspot.com/x/report.txt?x=17464af990
console output: https://syzkaller.appspot.com/x/log.txt?x=13464af990
kernel config:  https://syzkaller.appspot.com/x/.config?x=e1c560d0f4e121c9
dashboard link: https://syzkaller.appspot.com/bug?extid=621fd33c0b53d15ee8de
syz repro:  https://syzkaller.appspot.com/x/repro.syz?x=1129d0e990
C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=17fb6a2590

Reported-by: syzbot+621fd33c0b53d15ee...@syzkaller.appspotmail.com
Fixes: 7b9eba7ba0c1 ("net/sched: taprio: fix picos_per_byte miscalculation")

For information about bisection process see: https://goo.gl/tpsmEJ#bisection


[PATCH v14 00/10] Enable ptp_kvm for arm64

2020-09-04 Thread Jianyong Wu
Currently, we often use ntp (sync time with remote network clock)
to sync time in VM. But the precision of ntp is subject to network delay
so it's difficult to sync time in a high precision.

kvm virtual ptp clock (ptp_kvm) offers another way to sync time in VM,
as the remote clock locates in the host instead of remote network clock.
It targets to sync time between guest and host in virtualization
environment and in this way, we can keep the time of all the VMs running
in the same host in sync. In general, the delay of communication between
host and guest is quite small, so ptp_kvm can offer time sync precision
on the order of nanoseconds. Please keep in mind that ptp_kvm just
limits itself to being a channel which transmits the remote clock from
host to guest and leaves the time sync jobs to an application, e.g. chrony,
in userspace in the VM.

How ptp_kvm works:
After ptp_kvm initialized, there will be a new device node under
/dev called ptp%d. A guest userspace service, like chrony, can use this
device to get host walltime, sometimes also counter cycle, which depends
on the service it calls. Then this guest userspace service can use those
data to do the time sync for guest.
here is a rough sketch to show how kvm ptp clock works.

||  |--|
|   guest userspace  |  |  host|
|ioctl -> /dev/ptp%d |  |  |
|   ^   ||  |  |
||  |  |
|   |   | guest kernel   |  |  |
|   |   V  (get host walltime/counter cycle)   |
|  ptp_kvm -> hypercall - - - - - - - - - - ->hypercall service|
| <- - - - - - - - - - - - |
||  |--|

1. time sync service in guest userspace calls the ptp device through /dev/ptp%d.
2. ptp_kvm module in guest receives this request, then invokes a hypercall to route
into host kernel to request host walltime/counter cycle.
3. ptp_kvm hypercall service in host responds to the request and sends data back.
4. ptp (not ptp_kvm) in guest copies the data to userspace.

This ptp_kvm implementation focuses on steps 2 and 3: step 2 runs
in the guest, whereas step 3 runs in the host kernel.

change log:
from v13 to v14
(1) rebase code on 5.9-rc3.
(2) add a document to introduce implementation of PTP_KVM on
arm64.
(3) fix comments issue in hypercall.c.
(4) export arm_smccc_1_1_get_conduit using EXPORT_SYMBOL_GPL.
(5) fix make issue on x86 reported by kernel test robot.

from v12 to v13:
(1) rebase code on 5.8-rc1.
(2) this patch set is based on 2 patches, 1/8 and 2/8, from Will Deacon.
(3) remove the change to ptp device code of extend getcrosststamp.
(4) remove the mechanism of letting user choose the counter type in
ptp_kvm for arm64.
(5) add virtual counter option in ptp_kvm service to let user choose
the specific counter explicitly.

from v11 to v12:
(1) rebase code on 5.7-rc6 and rebase 2 patches from Will Deacon
including 1/11 and 2/11. as these patches introduce discover mechanism of
vendor smccc service.
(2) rebase ptp_kvm hypercall service from standard smccc to vendor
smccc and add ptp_kvm to vendor smccc service discover mechanism.
(3) add detail of why we need ptp_kvm and how ptp_kvm works in cover
letter.
from v10 to v11:
(1) rebase code on 5.7-rc2.
(2) remove support for arm32, as kvm support for arm32 will be
removed [1]
(3) add error report in ptp_kvm initialization.

from v9 to v10:
(1) change code base to v5.5.
(2) enable ptp_kvm both for arm32 and arm64.
(3) let user choose which of virtual counter or physical counter
should return when using crosstimestamp mode of ptp_kvm for arm/arm64.
(4) extend input argument for getcrosstimestamp API.

from v8 to v9:
(1) move ptp_kvm.h to driver/ptp/
(2) replace license declaration of ptp_kvm.h the same with other
header files in the same directory.

from v7 to v8:
(1) separate adding clocksource id for arm_arch_counter as a
single patch.
(2) update commit message for patch 4/8.
(3) refine patch 7/8 and patch 8/8 to make them more independent.

from v5 to v6:
(1) apply Mark's patch[4] to get SMCCC conduit.
(2) add mechanism to recognize current clocksource by add
clocksource_id value into struct clocksource instead of method in patch-v5.
(3) rename kvm_arch_ptp_get_clock_fn into
kvm_arch_ptp_get_crosststamp.

from v4 to v5:
(1) remove hvc delay compensation as it should be left to userspace.
(2) check current clocksource in hvc call service.
(3) expose current clocksource by adding it to
system_tim

[PATCH v14 01/10] arm64: Probe for the presence of KVM hypervisor services during boot

2020-09-04 Thread Jianyong Wu
From: Will Deacon 

Although the SMCCC specification provides some limited functionality for
describing the presence of hypervisor and firmware services, this is
generally applicable only to functions designated as "Arm Architecture
Service Functions" and no portable discovery mechanism is provided for
standard hypervisor services, despite having a designated range of
function identifiers reserved by the specification.

In an attempt to avoid the need for additional firmware changes every
time a new function is added, introduce a UID to identify the service
provider as being compatible with KVM. Once this has been established,
additional services can be discovered via a feature bitmap.

Cc: Marc Zyngier 
Signed-off-by: Will Deacon 
Signed-off-by: Jianyong Wu 
---
 arch/arm64/include/asm/hypervisor.h | 11 +
 arch/arm64/kernel/setup.c   | 36 +
 include/linux/arm-smccc.h   | 26 +
 3 files changed, 73 insertions(+)

diff --git a/arch/arm64/include/asm/hypervisor.h 
b/arch/arm64/include/asm/hypervisor.h
index f9cc1d021791..91e4bd890819 100644
--- a/arch/arm64/include/asm/hypervisor.h
+++ b/arch/arm64/include/asm/hypervisor.h
@@ -2,6 +2,17 @@
 #ifndef _ASM_ARM64_HYPERVISOR_H
 #define _ASM_ARM64_HYPERVISOR_H
 
+#include 
 #include 
 
+static inline bool kvm_arm_hyp_service_available(u32 func_id)
+{
+   extern DECLARE_BITMAP(__kvm_arm_hyp_services, ARM_SMCCC_KVM_NUM_FUNCS);
+
+   if (func_id >= ARM_SMCCC_KVM_NUM_FUNCS)
+   return -EINVAL;
+
+   return test_bit(func_id, __kvm_arm_hyp_services);
+}
+
 #endif
diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
index 77c4c9bad1b8..cb4a18fe5ad4 100644
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -7,6 +7,7 @@
  */
 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -276,6 +277,40 @@ arch_initcall(reserve_memblock_reserved_regions);
 
 u64 __cpu_logical_map[NR_CPUS] = { [0 ... NR_CPUS-1] = INVALID_HWID };
 
+DECLARE_BITMAP(__kvm_arm_hyp_services, ARM_SMCCC_KVM_NUM_FUNCS) = { };
+
+static void __init kvm_init_hyp_services(void)
+{
+   int i;
+   struct arm_smccc_res res;
+
+   if (arm_smccc_get_version() == ARM_SMCCC_VERSION_1_0)
+   return;
+
+   arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID, &res);
+   if (res.a0 != ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_0 ||
+   res.a1 != ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_1 ||
+   res.a2 != ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_2 ||
+   res.a3 != ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_3)
+   return;
+
+   memset(&res, 0, sizeof(res));
+   arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID, &res);
+   for (i = 0; i < 32; ++i) {
+   if (res.a0 & (i))
+   set_bit(i + (32 * 0), __kvm_arm_hyp_services);
+   if (res.a1 & (i))
+   set_bit(i + (32 * 1), __kvm_arm_hyp_services);
+   if (res.a2 & (i))
+   set_bit(i + (32 * 2), __kvm_arm_hyp_services);
+   if (res.a3 & (i))
+   set_bit(i + (32 * 3), __kvm_arm_hyp_services);
+   }
+
+   pr_info("KVM hypervisor services detected (0x%08lx 0x%08lx 0x%08lx 
0x%08lx)\n",
+res.a3, res.a2, res.a1, res.a0);
+}
+
 u64 cpu_logical_map(int cpu)
 {
return __cpu_logical_map[cpu];
@@ -354,6 +389,7 @@ void __init __no_sanitize_address setup_arch(char 
**cmdline_p)
else
psci_acpi_init();
 
+   kvm_init_hyp_services();
init_bootcpu_ops();
smp_init_cpus();
smp_build_mpidr_hash();
diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h
index 15c706fb0a37..f7b5dd7dbf9f 100644
--- a/include/linux/arm-smccc.h
+++ b/include/linux/arm-smccc.h
@@ -49,11 +49,14 @@
 #define ARM_SMCCC_OWNER_OEM3
 #define ARM_SMCCC_OWNER_STANDARD   4
 #define ARM_SMCCC_OWNER_STANDARD_HYP   5
+#define ARM_SMCCC_OWNER_VENDOR_HYP 6
 #define ARM_SMCCC_OWNER_TRUSTED_APP48
 #define ARM_SMCCC_OWNER_TRUSTED_APP_END49
 #define ARM_SMCCC_OWNER_TRUSTED_OS 50
 #define ARM_SMCCC_OWNER_TRUSTED_OS_END 63
 
+#define ARM_SMCCC_FUNC_QUERY_CALL_UID  0xff01
+
 #define ARM_SMCCC_QUIRK_NONE   0
 #define ARM_SMCCC_QUIRK_QCOM_A61 /* Save/restore register a6 */
 
@@ -86,6 +89,29 @@
   ARM_SMCCC_SMC_32,\
   0, 0x7fff)
 
+#define ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID  \
+   ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \
+  ARM_SMCCC_SMC_32,\
+  ARM_SMCCC_OWNER_VENDOR_HYP,  \
+  ARM_SMCCC_FUNC_QUERY_CALL_UID)
+
+/* KVM UID value: 28b46fb6-2ec5-11e9-a9ca-4b564d003a74 */
+#define ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_0 0xb66fb428U
+#defi

[PATCH v14 02/10] arm/arm64: KVM: Advertise KVM UID to guests via SMCCC

2020-09-04 Thread Jianyong Wu
From: Will Deacon 

We can advertise ourselves to guests as KVM and provide a basic features
bitmap for discoverability of future hypervisor services.

Cc: Marc Zyngier 
Signed-off-by: Will Deacon 
Signed-off-by: Jianyong Wu 
---
 arch/arm64/kvm/hypercalls.c | 29 +++--
 1 file changed, 19 insertions(+), 10 deletions(-)

diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c
index 550dfa3e53cd..901c60f119c2 100644
--- a/arch/arm64/kvm/hypercalls.c
+++ b/arch/arm64/kvm/hypercalls.c
@@ -12,13 +12,13 @@
 int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
 {
u32 func_id = smccc_get_function(vcpu);
-   long val = SMCCC_RET_NOT_SUPPORTED;
+   u64 val[4] = {SMCCC_RET_NOT_SUPPORTED};
u32 feature;
gpa_t gpa;
 
switch (func_id) {
case ARM_SMCCC_VERSION_FUNC_ID:
-   val = ARM_SMCCC_VERSION_1_1;
+   val[0] = ARM_SMCCC_VERSION_1_1;
break;
case ARM_SMCCC_ARCH_FEATURES_FUNC_ID:
feature = smccc_get_arg1(vcpu);
@@ -28,10 +28,10 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
case KVM_BP_HARDEN_UNKNOWN:
break;
case KVM_BP_HARDEN_WA_NEEDED:
-   val = SMCCC_RET_SUCCESS;
+   val[0] = SMCCC_RET_SUCCESS;
break;
case KVM_BP_HARDEN_NOT_REQUIRED:
-   val = SMCCC_RET_NOT_REQUIRED;
+   val[0] = SMCCC_RET_NOT_REQUIRED;
break;
}
break;
@@ -41,31 +41,40 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
case KVM_SSBD_UNKNOWN:
break;
case KVM_SSBD_KERNEL:
-   val = SMCCC_RET_SUCCESS;
+   val[0] = SMCCC_RET_SUCCESS;
break;
case KVM_SSBD_FORCE_ENABLE:
case KVM_SSBD_MITIGATED:
-   val = SMCCC_RET_NOT_REQUIRED;
+   val[0] = SMCCC_RET_NOT_REQUIRED;
break;
}
break;
case ARM_SMCCC_HV_PV_TIME_FEATURES:
-   val = SMCCC_RET_SUCCESS;
+   val[0] = SMCCC_RET_SUCCESS;
break;
}
break;
case ARM_SMCCC_HV_PV_TIME_FEATURES:
-   val = kvm_hypercall_pv_features(vcpu);
+   val[0] = kvm_hypercall_pv_features(vcpu);
break;
case ARM_SMCCC_HV_PV_TIME_ST:
gpa = kvm_init_stolen_time(vcpu);
if (gpa != GPA_INVALID)
-   val = gpa;
+   val[0] = gpa;
+   break;
+   case ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID:
+   val[0] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_0;
+   val[1] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_1;
+   val[2] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_2;
+   val[3] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_3;
+   break;
+   case ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID:
+   val[0] = BIT(ARM_SMCCC_KVM_FUNC_FEATURES);
break;
default:
return kvm_psci_call(vcpu);
}
 
-   smccc_set_retval(vcpu, val, 0, 0, 0);
+   smccc_set_retval(vcpu, val[0], val[1], val[2], val[3]);
return 1;
 }
-- 
2.17.1



[PATCH v14 07/10] arm64/kvm: Add hypercall service for kvm ptp.

2020-09-04 Thread Jianyong Wu
ptp_kvm will get this service through smccc call.
The service offers the wall time and counter cycle of the host to the guest.
The caller must explicitly specify whether the virtual counter or the
physical counter cycle is returned if it needs a counter cycle.

Signed-off-by: Jianyong Wu 
---
 arch/arm64/kvm/Kconfig   |  6 +
 arch/arm64/kvm/arch_timer.c  |  2 +-
 arch/arm64/kvm/hypercalls.c  | 49 
 include/kvm/arm_arch_timer.h |  1 +
 include/linux/arm-smccc.h| 16 
 5 files changed, 73 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index 318c8f2df245..bbdfacec4813 100644
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -60,6 +60,12 @@ config KVM_ARM_PMU
 config KVM_INDIRECT_VECTORS
def_bool HARDEN_BRANCH_PREDICTOR || RANDOMIZE_BASE
 
+config ARM64_KVM_PTP_HOST
+   bool "KVM PTP clock host service for arm64"
+   default y
+   help
+ virtual kvm ptp clock hypercall service for arm64
+
 endif # KVM
 
 endif # VIRTUALIZATION
diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c
index 32ba6fbc3814..eb85f6701845 100644
--- a/arch/arm64/kvm/arch_timer.c
+++ b/arch/arm64/kvm/arch_timer.c
@@ -81,7 +81,7 @@ u64 timer_get_cval(struct arch_timer_context *ctxt)
}
 }
 
-static u64 timer_get_offset(struct arch_timer_context *ctxt)
+u64 timer_get_offset(struct arch_timer_context *ctxt)
 {
struct kvm_vcpu *vcpu = ctxt->vcpu;
 
diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c
index 901c60f119c2..2628ddc13abd 100644
--- a/arch/arm64/kvm/hypercalls.c
+++ b/arch/arm64/kvm/hypercalls.c
@@ -3,6 +3,7 @@
 
 #include 
 #include 
+#include 
 
 #include 
 
@@ -11,6 +12,10 @@
 
 int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
 {
+#ifdef CONFIG_ARM64_KVM_PTP_HOST
+   struct system_time_snapshot systime_snapshot;
+   u64 cycles = -1;
+#endif
u32 func_id = smccc_get_function(vcpu);
u64 val[4] = {SMCCC_RET_NOT_SUPPORTED};
u32 feature;
@@ -21,6 +26,10 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
val[0] = ARM_SMCCC_VERSION_1_1;
break;
case ARM_SMCCC_ARCH_FEATURES_FUNC_ID:
+   /*
+* Note: keep in mind that feature is u32 and smccc_get_arg1
+* will return u64, so need auto cast here.
+*/
feature = smccc_get_arg1(vcpu);
switch (feature) {
case ARM_SMCCC_ARCH_WORKAROUND_1:
@@ -70,7 +79,47 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
break;
case ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID:
val[0] = BIT(ARM_SMCCC_KVM_FUNC_FEATURES);
+#ifdef CONFIG_ARM64_KVM_PTP_HOST
+   val[0] |= BIT(ARM_SMCCC_KVM_FUNC_KVM_PTP);
+#endif
break;
+#ifdef CONFIG_ARM64_KVM_PTP_HOST
+   /*
+* This serves virtual kvm_ptp.
+* Four values will be passed back.
+* reg0 stores high 32-bit host ktime;
+* reg1 stores low 32-bit host ktime;
+* reg2 stores high 32-bit difference of host cycles and cntvoff;
+* reg3 stores low 32-bit difference of host cycles and cntvoff.
+*/
+   case ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID:
+   /*
+* system time and counter value must captured in the same
+* time to keep consistency and precision.
+*/
+   ktime_get_snapshot(&systime_snapshot);
+   if (systime_snapshot.cs_id != CSID_ARM_ARCH_COUNTER)
+   break;
+   val[0] = systime_snapshot.real;
+   /*
+* which of virtual counter or physical counter being
+* asked for is decided by the r1 value of smccc
+* call. If no invalid r1 value offered, default cycle
+* value(-1) will return.
+*/
+   feature = smccc_get_arg1(vcpu);
+   switch (feature) {
+   case ARM_PTP_VIRT_COUNTER:
+   cycles = systime_snapshot.cycles -
+vcpu_read_sys_reg(vcpu, CNTVOFF_EL2);
+   break;
+   case ARM_PTP_PHY_COUNTER:
+   cycles = systime_snapshot.cycles;
+   break;
+   }
+   val[1] = cycles;
+   break;
+#endif
default:
return kvm_psci_call(vcpu);
}
diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h
index 51c19381108c..5a2b6da9be7a 100644
--- a/include/kvm/arm_arch_timer.h
+++ b/include/kvm/arm_arch_timer.h
@@ -105,5 +105,6 @@ void kvm_arm_timer_write_sysreg(struct kvm_vcpu *vcpu,
 /* Needed for tracing */
 u32 timer_get_ctl(struct arch_timer_context *ctxt);
 u64 timer_get_cval(struct arch_timer_context *ctxt);
+u64 timer_get_offset(struct arch_timer_context *ctxt);
 
 #endif
diff --git a

[PATCH v14 04/10] ptp: Reorganize ptp_kvm module to make it arch-independent.

2020-09-04 Thread Jianyong Wu
Currently, the ptp_kvm module implementation is only for x86 and includes a
large part of arch-specific code.  This patch moves all of that code
into a new arch-related file in the same directory.

Signed-off-by: Jianyong Wu 
---
 drivers/ptp/Makefile|  5 ++
 drivers/ptp/ptp_kvm.h   | 11 +++
 drivers/ptp/{ptp_kvm.c => ptp_kvm_common.c} | 80 +-
 drivers/ptp/ptp_kvm_x86.c   | 89 +
 4 files changed, 126 insertions(+), 59 deletions(-)
 create mode 100644 drivers/ptp/ptp_kvm.h
 rename drivers/ptp/{ptp_kvm.c => ptp_kvm_common.c} (63%)
 create mode 100644 drivers/ptp/ptp_kvm_x86.c

diff --git a/drivers/ptp/Makefile b/drivers/ptp/Makefile
index 7aff75f745dc..192bc5e78a78 100644
--- a/drivers/ptp/Makefile
+++ b/drivers/ptp/Makefile
@@ -3,7 +3,12 @@
 # Makefile for PTP 1588 clock support.
 #
 
+ifeq ($(ARCH), x86_64)
+   ARCH=x86
+endif
+
 ptp-y  := ptp_clock.o ptp_chardev.o ptp_sysfs.o
+ptp_kvm-y  := ptp_kvm_$(ARCH).o ptp_kvm_common.o
 obj-$(CONFIG_PTP_1588_CLOCK)   += ptp.o
 obj-$(CONFIG_PTP_1588_CLOCK_DTE)   += ptp_dte.o
 obj-$(CONFIG_PTP_1588_CLOCK_INES)  += ptp_ines.o
diff --git a/drivers/ptp/ptp_kvm.h b/drivers/ptp/ptp_kvm.h
new file mode 100644
index ..4bf1802bbeb8
--- /dev/null
+++ b/drivers/ptp/ptp_kvm.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Virtual PTP 1588 clock for use with KVM guests
+ *
+ * Copyright (C) 2017 Red Hat Inc.
+ */
+
+int kvm_arch_ptp_init(void);
+int kvm_arch_ptp_get_clock(struct timespec64 *ts);
+int kvm_arch_ptp_get_crosststamp(unsigned long *cycle,
+   struct timespec64 *tspec, void *cs);
diff --git a/drivers/ptp/ptp_kvm.c b/drivers/ptp/ptp_kvm_common.c
similarity index 63%
rename from drivers/ptp/ptp_kvm.c
rename to drivers/ptp/ptp_kvm_common.c
index 658d33fc3195..8d8a9bcd1d22 100644
--- a/drivers/ptp/ptp_kvm.c
+++ b/drivers/ptp/ptp_kvm_common.c
@@ -8,15 +8,16 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
-#include 
-#include 
 #include 
 
 #include 
 
+#include "ptp_kvm.h"
+
 struct kvm_ptp_clock {
struct ptp_clock *ptp_clock;
struct ptp_clock_info caps;
@@ -24,56 +25,29 @@ struct kvm_ptp_clock {
 
 static DEFINE_SPINLOCK(kvm_ptp_lock);
 
-static struct pvclock_vsyscall_time_info *hv_clock;
-
-static struct kvm_clock_pairing clock_pair;
-static phys_addr_t clock_pair_gpa;
-
 static int ptp_kvm_get_time_fn(ktime_t *device_time,
   struct system_counterval_t *system_counter,
   void *ctx)
 {
-   unsigned long ret;
+   unsigned long ret, cycle;
struct timespec64 tspec;
-   unsigned version;
-   int cpu;
-   struct pvclock_vcpu_time_info *src;
+   struct clocksource *cs;
 
spin_lock(&kvm_ptp_lock);
 
preempt_disable_notrace();
-   cpu = smp_processor_id();
-   src = &hv_clock[cpu].pvti;
-
-   do {
-   /*
-* We are using a TSC value read in the hosts
-* kvm_hc_clock_pairing handling.
-* So any changes to tsc_to_system_mul
-* and tsc_shift or any other pvclock
-* data invalidate that measurement.
-*/
-   version = pvclock_read_begin(src);
-
-   ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING,
-clock_pair_gpa,
-KVM_CLOCK_PAIRING_WALLCLOCK);
-   if (ret != 0) {
-   pr_err_ratelimited("clock pairing hypercall ret %lu\n", 
ret);
-   spin_unlock(&kvm_ptp_lock);
-   preempt_enable_notrace();
-   return -EOPNOTSUPP;
-   }
-
-   tspec.tv_sec = clock_pair.sec;
-   tspec.tv_nsec = clock_pair.nsec;
-   ret = __pvclock_read_cycles(src, clock_pair.tsc);
-   } while (pvclock_read_retry(src, version));
+   ret = kvm_arch_ptp_get_crosststamp(&cycle, &tspec, &cs);
+   if (ret != 0) {
+   pr_err_ratelimited("clock pairing hypercall ret %lu\n", ret);
+   spin_unlock(&kvm_ptp_lock);
+   preempt_enable_notrace();
+   return -EOPNOTSUPP;
+   }
 
preempt_enable_notrace();
 
-   system_counter->cycles = ret;
-   system_counter->cs = &kvm_clock;
+   system_counter->cycles = cycle;
+   system_counter->cs = cs;
 
*device_time = timespec64_to_ktime(tspec);
 
@@ -116,17 +90,13 @@ static int ptp_kvm_gettime(struct ptp_clock_info *ptp, 
struct timespec64 *ts)
 
spin_lock(&kvm_ptp_lock);
 
-   ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING,
-clock_pair_gpa,
-KVM_CLOCK_PAIRING_WALLCLOCK);
+   ret = kvm_arch_ptp_get_clock(&tspec);
if (r

[PATCH v14 06/10] clocksource: Add clocksource id for arm arch counter

2020-09-04 Thread Jianyong Wu
Add clocksource id for arm arch counter to let it be identified easily and
elegantly in ptp_kvm implementation for arm.

Signed-off-by: Jianyong Wu 
---
 drivers/clocksource/arm_arch_timer.c | 2 ++
 include/linux/clocksource_ids.h  | 1 +
 2 files changed, 3 insertions(+)

diff --git a/drivers/clocksource/arm_arch_timer.c 
b/drivers/clocksource/arm_arch_timer.c
index 6c3e84180146..d55acffb0b90 100644
--- a/drivers/clocksource/arm_arch_timer.c
+++ b/drivers/clocksource/arm_arch_timer.c
@@ -16,6 +16,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -191,6 +192,7 @@ static u64 arch_counter_read_cc(const struct cyclecounter 
*cc)
 
 static struct clocksource clocksource_counter = {
.name   = "arch_sys_counter",
+   .id = CSID_ARM_ARCH_COUNTER,
.rating = 400,
.read   = arch_counter_read,
.mask   = CLOCKSOURCE_MASK(56),
diff --git a/include/linux/clocksource_ids.h b/include/linux/clocksource_ids.h
index 4d8e19e05328..16775d7d8f8d 100644
--- a/include/linux/clocksource_ids.h
+++ b/include/linux/clocksource_ids.h
@@ -5,6 +5,7 @@
 /* Enum to give clocksources a unique identifier */
 enum clocksource_ids {
CSID_GENERIC= 0,
+   CSID_ARM_ARCH_COUNTER,
CSID_MAX,
 };
 
-- 
2.17.1



[PATCH v14 03/10] smccc: Export smccc conduit get helper.

2020-09-04 Thread Jianyong Wu
Export arm_smccc_1_1_get_conduit so that modules can use the smccc helpers
which rely on it.

Acked-by: Mark Rutland 
Signed-off-by: Jianyong Wu 
---
 drivers/firmware/smccc/smccc.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/firmware/smccc/smccc.c b/drivers/firmware/smccc/smccc.c
index 4e80921ee212..d26d3512b145 100644
--- a/drivers/firmware/smccc/smccc.c
+++ b/drivers/firmware/smccc/smccc.c
@@ -24,6 +24,7 @@ enum arm_smccc_conduit arm_smccc_1_1_get_conduit(void)
 
return smccc_conduit;
 }
+EXPORT_SYMBOL_GPL(arm_smccc_1_1_get_conduit);
 
 u32 arm_smccc_get_version(void)
 {
-- 
2.17.1



[PATCH v14 10/10] arm64: Add kvm capability check extension for ptp_kvm

2020-09-04 Thread Jianyong Wu
Let userspace check if there is kvm ptp service in host.
Before VMs migrate to another host, VMM may check if this
cap is available to determine the next behavior.

Signed-off-by: Jianyong Wu 
Suggested-by: Marc Zyngier 
---
 arch/arm64/kvm/arm.c | 4 
 include/uapi/linux/kvm.h | 1 +
 2 files changed, 5 insertions(+)

diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 691d21e4c717..8e99ad2f0b83 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -178,6 +178,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_ARM_IRQ_LINE_LAYOUT_2:
case KVM_CAP_ARM_NISV_TO_USER:
case KVM_CAP_ARM_INJECT_EXT_DABT:
+
+#ifdef CONFIG_ARM64_KVM_PTP_HOST
+   case KVM_CAP_ARM_PTP_KVM:
+#endif
r = 1;
break;
case KVM_CAP_ARM_SET_DEVICE_ADDR:
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index f6d86033c4fa..dd58ebe0daf5 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1035,6 +1035,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_LAST_CPU 184
 #define KVM_CAP_SMALLER_MAXPHYADDR 185
 #define KVM_CAP_S390_DIAG318 186
+#define KVM_CAP_ARM_PTP_KVM 187
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
-- 
2.17.1



[PATCH v14 08/10] ptp: arm64: Enable ptp_kvm for arm64

2020-09-04 Thread Jianyong Wu
Currently, there is no mechanism to keep time sync between guest and host
in arm64 virtualization environment. Time in guest will drift compared
with host after boot up as they may both use third party time sources
to correct their time respectively. The time deviation will be in order
of milliseconds. But in some scenarios, like in a cloud environment, we ask
for higher time precision.

kvm ptp clock, which chooses the host clock source as a reference
clock to sync time between guest and host, has been adopted by x86,
where it improves the time sync precision from milliseconds to nanoseconds.

This patch enables kvm ptp clock for arm64 and improves clock sync precision
significantly.

Test result comparisons between with kvm ptp clock and without it in arm64
are as follows. This test is derived from the result of the command 'chronyc
sources'. We should pay particular attention to the last sample column, which
shows the offset between the local clock and the source at the last measurement.

no kvm ptp in guest:
MS Name/IP address   Stratum Poll Reach LastRx Last sample

^* dns1.synet.edu.cn  2   6   37713  +1040us[+1581us] +/-   21ms
^* dns1.synet.edu.cn  2   6   37721  +1040us[+1581us] +/-   21ms
^* dns1.synet.edu.cn  2   6   37729  +1040us[+1581us] +/-   21ms
^* dns1.synet.edu.cn  2   6   37737  +1040us[+1581us] +/-   21ms
^* dns1.synet.edu.cn  2   6   37745  +1040us[+1581us] +/-   21ms
^* dns1.synet.edu.cn  2   6   37753  +1040us[+1581us] +/-   21ms
^* dns1.synet.edu.cn  2   6   37761  +1040us[+1581us] +/-   21ms
^* dns1.synet.edu.cn  2   6   377 4   -130us[ +796us] +/-   21ms
^* dns1.synet.edu.cn  2   6   37712   -130us[ +796us] +/-   21ms
^* dns1.synet.edu.cn  2   6   37720   -130us[ +796us] +/-   21ms

in host:
MS Name/IP address   Stratum Poll Reach LastRx Last sample

^* 120.25.115.20  2   7   37772   -470us[ -603us] +/-   18ms
^* 120.25.115.20  2   7   37792   -470us[ -603us] +/-   18ms
^* 120.25.115.20  2   7   377   112   -470us[ -603us] +/-   18ms
^* 120.25.115.20  2   7   377 2   +872ns[-6808ns] +/-   17ms
^* 120.25.115.20  2   7   37722   +872ns[-6808ns] +/-   17ms
^* 120.25.115.20  2   7   37743   +872ns[-6808ns] +/-   17ms
^* 120.25.115.20  2   7   37763   +872ns[-6808ns] +/-   17ms
^* 120.25.115.20  2   7   37783   +872ns[-6808ns] +/-   17ms
^* 120.25.115.20  2   7   377   103   +872ns[-6808ns] +/-   17ms
^* 120.25.115.20  2   7   377   123   +872ns[-6808ns] +/-   17ms

The dns1.synet.edu.cn is the network reference clock for guest and
120.25.115.20 is the network reference clock for host. We can't get the
clock error between guest and host directly, but a rough estimate
is on the order of hundreds of us to ms.

with kvm ptp in guest:
chrony has been disabled in the host to remove the disturbance from the network clock.

MS Name/IP address Stratum Poll Reach LastRx Last sample

* PHC00   3   377 8 -7ns[   +1ns] +/-3ns
* PHC00   3   377 8 +1ns[  +16ns] +/-3ns
* PHC00   3   377 6 -4ns[   -0ns] +/-6ns
* PHC00   3   377 6 -8ns[  -12ns] +/-5ns
* PHC00   3   377 5 +2ns[   +4ns] +/-4ns
* PHC00   3   37713 +2ns[   +4ns] +/-4ns
* PHC00   3   37712 -4ns[   -6ns] +/-4ns
* PHC00   3   37711 -8ns[  -11ns] +/-6ns
* PHC00   3   37710-14ns[  -20ns] +/-4ns
* PHC00   3   377 8 +4ns[   +5ns] +/-4ns

The PHC0 is the ptp clock which chooses the host clock as its source
clock. So we can see that the clock difference between host and guest
is on the order of ns.

Signed-off-by: Jianyong Wu 
---
 drivers/clocksource/arm_arch_timer.c | 24 +
 drivers/ptp/Kconfig  |  2 +-
 drivers/ptp/ptp_kvm_arm64.c  | 53 
 3 files changed, 78 insertions(+), 1 deletion(-)
 create mode 100644 drivers/ptp/ptp_kvm_arm64.c

diff --git a/drivers/clocksource/arm_arch_timer.c 
b/drivers/clocksource/arm_arch_timer.c
index d55acffb0b90..aaf286e90092 100644
--- a/drivers/clocksource/arm_arch_timer.c
+++ b/drivers/clocksource/arm_arch_timer.c
@@ -1650,3 +1650,27 @@ static int __init arch_timer_acpi_init(struct 
acpi_table_header *table)
 }
 TIMER_ACPI_DECLARE(arch_timer, ACPI_SIG_GTDT, arch_timer_acpi_init);
 #endif
+
+#if IS_ENABLED(CONFIG_PTP_1588_CLOCK_KVM)
+#include 
+int kvm_arch_ptp_get_crosststamp(unsigned long *cycle, struct timespec64 *ts,
+ struct clocksource **cs)
+

[PATCH bpf-next] bpf: don't check against device MTU in __bpf_skb_max_len

2020-09-04 Thread Jesper Dangaard Brouer
Multiple BPF-helpers that can manipulate/increase the size of the SKB use
__bpf_skb_max_len() as the max-length. This function limits the size against
the current net_device MTU (skb->dev->mtu).

Often packets get redirected to another net_device, which can have a larger
MTU, and this is the MTU that should count. The MTU limiting at this stage
seems wrong and redundant as the netstack will handle MTU checking
elsewhere.

Redirecting into sockmap by sk_skb programs already skip this MTU check.
Keep what commit 0c6bc6e531a6 ("bpf: fix sk_skb programs without skb->dev
assigned") did, and limit the max_len to SKB_MAX_ALLOC.

Also notice that the max_len MTU check is already skipped for GRO SKBs
(skb_is_gso), in both bpf_skb_adjust_room() and bpf_skb_change_head().
Thus, it is clearly safe to remove this check.

Signed-off-by: Jesper Dangaard Brouer 
---
 net/core/filter.c |3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index 47eef9a0be6a..ec0ed107fa37 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3211,8 +3211,7 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 
off, u32 len_diff,
 
 static u32 __bpf_skb_max_len(const struct sk_buff *skb)
 {
-   return skb->dev ? skb->dev->mtu + skb->dev->hard_header_len :
- SKB_MAX_ALLOC;
+   return SKB_MAX_ALLOC;
 }
 
 BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff,




[PATCH v14 05/10] time: Add mechanism to recognize clocksource in time_get_snapshot

2020-09-04 Thread Jianyong Wu
From: Thomas Gleixner 

System time snapshots are not conveying information about the current
clocksource which was used, but callers like the PTP KVM guest
implementation have the requirement to evaluate the clocksource type to
select the appropriate mechanism.

Introduce a clocksource id field in struct clocksource which is by default
set to CSID_GENERIC (0). Clocksource implementations can set that field to
a value which allows to identify the clocksource.

Store the clocksource id of the current clocksource in the
system_time_snapshot so callers can evaluate which clocksource was used to
take the snapshot and act accordingly.

Signed-off-by: Thomas Gleixner 
Signed-off-by: Jianyong Wu 
---
 include/linux/clocksource.h |  6 ++
 include/linux/clocksource_ids.h | 11 +++
 include/linux/timekeeping.h | 12 +++-
 kernel/time/clocksource.c   |  2 ++
 kernel/time/timekeeping.c   |  1 +
 5 files changed, 27 insertions(+), 5 deletions(-)
 create mode 100644 include/linux/clocksource_ids.h

diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index 86d143db6523..1290d0dce840 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -17,6 +17,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -62,6 +63,10 @@ struct module;
  * 400-499: Perfect
  * The ideal clocksource. A must-use where
  * available.
+ * @id:Defaults to CSID_GENERIC. The id value is 
captured
+ * in certain snapshot functions to allow callers to
+ * validate the clocksource from which the snapshot was
+ * taken.
  * @flags: Flags describing special properties
  * @enable:Optional function to enable the clocksource
  * @disable:   Optional function to disable the clocksource
@@ -100,6 +105,7 @@ struct clocksource {
const char  *name;
struct list_headlist;
int rating;
+   enum clocksource_idsid;
enum vdso_clock_modevdso_clock_mode;
unsigned long   flags;
 
diff --git a/include/linux/clocksource_ids.h b/include/linux/clocksource_ids.h
new file mode 100644
index ..4d8e19e05328
--- /dev/null
+++ b/include/linux/clocksource_ids.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_CLOCKSOURCE_IDS_H
+#define _LINUX_CLOCKSOURCE_IDS_H
+
+/* Enum to give clocksources a unique identifier */
+enum clocksource_ids {
+   CSID_GENERIC= 0,
+   CSID_MAX,
+};
+
+#endif
diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h
index d5471d6fa778..17be3b6f9f37 100644
--- a/include/linux/timekeeping.h
+++ b/include/linux/timekeeping.h
@@ -3,6 +3,7 @@
 #define _LINUX_TIMEKEEPING_H
 
 #include 
+#include 
 
 /* Included from linux/ktime.h */
 
@@ -232,11 +233,12 @@ extern void timekeeping_inject_sleeptime64(const struct 
timespec64 *delta);
  * @cs_was_changed_seq:The sequence number of clocksource change events
  */
 struct system_time_snapshot {
-   u64 cycles;
-   ktime_t real;
-   ktime_t raw;
-   unsigned intclock_was_set_seq;
-   u8  cs_was_changed_seq;
+   u64 cycles;
+   ktime_t real;
+   ktime_t raw;
+   enum clocksource_idscs_id;
+   unsigned intclock_was_set_seq;
+   u8  cs_was_changed_seq;
 };
 
 /**
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 02441ead3c3b..6b38d4993214 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -928,6 +928,8 @@ int __clocksource_register_scale(struct clocksource *cs, 
u32 scale, u32 freq)
 
clocksource_arch_init(cs);
 
+   if (WARN_ON_ONCE((unsigned int)cs->id >= CSID_MAX))
+   cs->id = CSID_GENERIC;
if (cs->vdso_clock_mode < 0 ||
cs->vdso_clock_mode >= VDSO_CLOCKMODE_MAX) {
pr_warn("clocksource %s registered with invalid VDSO mode %d. 
Disabling VDSO support.\n",
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 4c47f388a83f..c15dd42241bb 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -982,6 +982,7 @@ void ktime_get_snapshot(struct system_time_snapshot 
*systime_snapshot)
do {
seq = read_seqcount_begin(&tk_core.seq);
now = tk_clock_read(&tk->tkr_mono);
+   systime_snapshot->cs_id = tk->tkr_mono.clock->id;
systime_snapshot->cs_was_changed_seq = tk->cs_was_changed_seq;
systime_snapshot->clock_was_set_seq = tk->clock_was_set_seq;
base_real = ktime_add(tk->tkr_mono.base,
-- 
2.17.1



[PATCH v14 09/10] doc: add ptp_kvm introduction for arm64 support

2020-09-04 Thread Jianyong Wu
The ptp_kvm implementation depends on a hypercall using SMCCC. So we
introduce a new SMCCC service ID. This doc explains how we define
and use this new ID.

Signed-off-by: Jianyong Wu 
---
 Documentation/virt/kvm/arm/ptp_kvm.rst | 72 ++
 1 file changed, 72 insertions(+)
 create mode 100644 Documentation/virt/kvm/arm/ptp_kvm.rst

diff --git a/Documentation/virt/kvm/arm/ptp_kvm.rst 
b/Documentation/virt/kvm/arm/ptp_kvm.rst
new file mode 100644
index ..455591e2587a
--- /dev/null
+++ b/Documentation/virt/kvm/arm/ptp_kvm.rst
@@ -0,0 +1,72 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+PTP_KVM support for arm64
+=========================
+
+PTP_KVM is used for time sync between guest and host with high precision.
+It needs to get wall time and counter value from host and transfer these data
+to guest via hypercall service. So one more hypercall service should be
+added.
+
+This new SMCCC hypercall will be defined as:
+
+* ARM_SMCCC_HYP_KVM_PTP_FUNC_ID: 0xC601
+
+As we only support 64-bit ptp_kvm clients, we choose the SMC64/HVC64
+calling convention.
+
+ARM_SMCCC_HYP_KVM_PTP_FUNC_ID:
+
+   ==
+   Function ID:(uint32)0xC601
+   Arguments:  (uint32)ARM_PTP_PHY_COUNTER(1) or 
ARM_PTP_VIRT_COUNTER(0)
+   which indicate acquiring physical counter or
+   virtual counter respectively.
+   return value:   (uint64)NOT_SUPPORTED (-1) or two values of wall 
clock
+   time and counter cycle.
+   ==
+
+Why we need PTP_KVM?
+Currently, we often use ntp (sync time with remote network clock) to sync time
+in VM. But the precision of ntp is subject to network delay so it's difficult
+to sync time with high precision.
+
+kvm virtual ptp clock (ptp_kvm) offers another way to sync time in VM, in which
+the reference clock is located in the host instead of a remote network clock. It
+aims to sync time between guest and host in a virtualization environment and,
+in this way, we can also keep the time of all the VMs running on the same host
+in sync. In general, the delay of communication between host and guest is quite
+small, so ptp_kvm can offer time sync precision on the order of nanoseconds.
+Please keep in mind that ptp_kvm just limits itself to being a channel which
+transmits the reference clock from host to guest and leaves the time sync job to
+an application, e.g. chrony, in userspace of the VM.
+
+How PTP_KVM works on arm64?
+After ptp_kvm is initialized, there will be a new device node under /dev called
+ptp%d. A guest userspace service, like chrony, can use this device to get the host
+walltime and, depending on the service it calls, also the counter cycle.
+Then this guest userspace service can use that data to do the time sync for the
+guest.
+Here is a rough sketch to show how kvm ptp clock works.
+
+||  |--|
+|   guest userspace  |  |  host|
+|ioctl -> /dev/ptp%d |  |  |
+|   ^   ||  |  |
+||  |  |
+|   |   | guest kernel   |  |  |
+|   |   V  (get host walltime/counter cycle)   |
+|  ptp_kvm -> hypercall - - - - - - - - - - ->hypercall service|
+| <- - - - - - - - - - - - |
+||  |--|
+
+1. The time sync service in guest userspace calls the ptp device through /dev/ptp%d.
+2. The ptp_kvm module in the guest receives this request, then invokes a hypercall
+   that routes into the host kernel to request host walltime/counter cycle.
+3. The ptp_kvm hypercall service in the host responds to the request and sends
+   the data back.
+4. ptp (not ptp_kvm; ptp_kvm is implemented on top of it and not shown above) in
+   the guest copies the data to userspace.
+
+This ptp_kvm implementation focuses on steps 2 and 3: step 2 runs in the
+guest, while step 3 runs in the host kernel.
-- 
2.17.1



Re: [PATCH net v6 1/6] net: marvell: prestera: Add driver for Prestera family ASIC devices

2020-09-04 Thread Vadym Kochan
Hi Willem,

On Thu, Sep 03, 2020 at 05:22:24PM +0200, Willem de Bruijn wrote:
> On Wed, Sep 2, 2020 at 5:37 PM Vadym Kochan  wrote:
> >
> > Marvell Prestera 98DX326x integrates up to 24 ports of 1GbE with 8
> > ports of 10GbE uplinks or 2 ports of 40Gbps stacking for a largely
> > wireless SMB deployment.
> >
> > The current implementation supports only boards designed for the Marvell
> > Switchdev solution and requires special firmware.
> >
> > The core Prestera switching logic is implemented in prestera_main.c,
> > there is an intermediate hw layer between core logic and firmware. It is
> > implemented in prestera_hw.c, the purpose of it is to encapsulate hw
> > related logic, in future there is a plan to support more devices with
> > different HW related configurations.
> >
> > This patch contains only basic switch initialization and RX/TX support
> > over SDMA mechanism.
> >
> > Currently supported devices have DMA access range <= 32bit and require
> > ZONE_DMA to be enabled, for such cases SDMA driver checks if the skb
> > allocated in proper range supported by the Prestera device.
> >
> > Also meanwhile there is no TX interrupt support in current firmware
> > version so recycling work is scheduled on each xmit.
> >
> > Port's mac address is generated from the switch base mac which may be
> > provided via device-tree (static one or as nvme cell), or randomly
> > generated.
> >
> > Co-developed-by: Andrii Savka 
> > Signed-off-by: Andrii Savka 
> > Co-developed-by: Oleksandr Mazur 
> > Signed-off-by: Oleksandr Mazur 
> > Co-developed-by: Serhiy Boiko 
> > Signed-off-by: Serhiy Boiko 
> > Co-developed-by: Serhiy Pshyk 
> > Signed-off-by: Serhiy Pshyk 
> > Co-developed-by: Taras Chornyi 
> > Signed-off-by: Taras Chornyi 
> > Co-developed-by: Volodymyr Mytnyk 
> > Signed-off-by: Volodymyr Mytnyk 
> > Signed-off-by: Vadym Kochan 
> 
> > +int prestera_hw_port_cap_get(const struct prestera_port *port,
> > +struct prestera_port_caps *caps)
> > +{
> > +   struct prestera_msg_port_attr_resp resp;
> > +   struct prestera_msg_port_attr_req req = {
> > +   .attr = PRESTERA_CMD_PORT_ATTR_CAPABILITY,
> > +   .port = port->hw_id,
> > +   .dev = port->dev_id
> > +   };
> > +   int err;
> > +
> > +   err = prestera_cmd_ret(port->sw, PRESTERA_CMD_TYPE_PORT_ATTR_GET,
> > +  &req.cmd, sizeof(req), &resp.ret, 
> > sizeof(resp));
> 
> Here and elsewhere, why use a pointer to the first field in the struct
> vs the struct itself?
> 
> They are the same address, so it's fine, just a bit confusing as the
> size argument makes clear that the entire struct is to be copied.
> 
Well, initially it was a macro to simplify passing the different kinds of
request structures. But after review I changed it to a function. The
struct prestera_msg_cmd is common to all of the fw requests, which
have to include it at the beginning. Eventually __prestera_cmd_ret() is
called to pass the request buffer to the fw, and struct prestera_msg_cmd is
used to check the response from the fw. I could use 'void *' and typecast it
to struct prestera_msg_cmd, but I'd like to keep type safety.

> > +static int prestera_is_valid_mac_addr(struct prestera_port *port, u8 *addr)
> > +{
> > +   if (!is_valid_ether_addr(addr))
> > +   return -EADDRNOTAVAIL;
> > +
> > +   if (memcmp(port->sw->base_mac, addr, ETH_ALEN - 1))
> 
> Why ETH_ALEN - 1?
> 
This is a restriction on the port mac address: its first 5 bytes must
match the base mac address.

> > +   return -EINVAL;
> > +
> > +   return 0;
> > +}
> > +
> > +static int prestera_port_set_mac_address(struct net_device *dev, void *p)
> > +{
> > +   struct prestera_port *port = netdev_priv(dev);
> > +   struct sockaddr *addr = p;
> > +   int err;
> > +
> > +   err = prestera_is_valid_mac_addr(port, addr->sa_data);
> > +   if (err)
> > +   return err;
> > +
> > +   err = prestera_hw_port_mac_set(port, addr->sa_data);
> > +   if (err)
> > +   return err;
> > +
> > +   memcpy(dev->dev_addr, addr->sa_data, dev->addr_len);
> 
> Is addr_len ever not ETH_ALEN for this device?

I will fix it to use ether_addr_copy.

> 
> > +static int prestera_sdma_buf_init(struct prestera_sdma *sdma,
> > + struct prestera_sdma_buf *buf)
> > +{
> > +   struct device *dma_dev = sdma->sw->dev->dev;
> > +   struct prestera_sdma_desc *desc;
> > +   dma_addr_t dma;
> > +
> > +   desc = dma_pool_alloc(sdma->desc_pool, GFP_DMA | GFP_KERNEL, &dma);
> > +   if (!desc)
> > +   return -ENOMEM;
> > +
> > +   if (dma + sizeof(struct prestera_sdma_desc) > sdma->dma_mask) {
> 
> Can this happen? The DMA API should take care of dev->dma_mask constraints.
> 
I will fix it, thanks.

> > +   dma_pool_free(sdma->desc_pool, desc, dma);
> > +   dev_err(dma_dev, "failed to alloc desc

Re: [PATCH net v6 1/6] net: marvell: prestera: Add driver for Prestera family ASIC devices

2020-09-04 Thread Vadym Kochan
On Thu, Sep 03, 2020 at 06:35:34PM +0300, Andy Shevchenko wrote:
> On Thu, Sep 3, 2020 at 6:23 PM Willem de Bruijn
>  wrote:
> > On Wed, Sep 2, 2020 at 5:37 PM Vadym Kochan  
> > wrote:
> 
> ...
> 
> > > +static int prestera_is_valid_mac_addr(struct prestera_port *port, u8 
> > > *addr)
> > > +{
> > > +   if (!is_valid_ether_addr(addr))
> > > +   return -EADDRNOTAVAIL;
> > > +
> > > +   if (memcmp(port->sw->base_mac, addr, ETH_ALEN - 1))
> >
> > Why ETH_ALEN - 1?
> 
> We even have a lot of helpers specifically for ethernet MACs.
> Starting from [1] till almost the end of the file. Here [2] can be
> used (or its unaligned counterpart).
> 
> [1]: 
> https://elixir.bootlin.com/linux/latest/source/include/linux/etherdevice.h#L67
> [2]: 
> https://elixir.bootlin.com/linux/latest/source/include/linux/etherdevice.h#L67
> 
> > > +   return -EINVAL;
> > > +
> > > +   return 0;
> > > +}
> 
> > > +   memcpy(dev->dev_addr, addr->sa_data, dev->addr_len);
> >
> > Is addr_len ever not ETH_ALEN for this device?
> 
> And if it is ETH_ALEN, here is [3].
> [3]: 
> https://elixir.bootlin.com/linux/latest/source/include/linux/etherdevice.h#L287

Thanks Andy, I missed this one to replace with ether_addr_copy().

> 
> -- 
> With Best Regards,
> Andy Shevchenko


Re: [PATCH net v6 5/6] net: marvell: prestera: Add Switchdev driver implementation

2020-09-04 Thread Vadym Kochan
On Thu, Sep 03, 2020 at 07:18:59PM +0200, Willem de Bruijn wrote:
> On Wed, Sep 2, 2020 at 5:07 PM Vadym Kochan  wrote:
> >
> > The following features are supported:
> >
> > - VLAN-aware bridge offloading
> > - VLAN-unaware bridge offloading
> > - FDB offloading (learning, ageing)
> > - Switchport configuration
> >
> > Currently there are some limitations like:
> >
> > - Only 1 VLAN-aware bridge instance supported
> > - FDB ageing timeout parameter is set globally per device
> >
> > Co-developed-by: Serhiy Boiko 
> > Signed-off-by: Serhiy Boiko 
> > Co-developed-by: Serhiy Pshyk 
> > Signed-off-by: Serhiy Pshyk 
> > Co-developed-by: Taras Chornyi 
> > Signed-off-by: Taras Chornyi 
> > Signed-off-by: Vadym Kochan 
> 
> 
> > +int prestera_switchdev_init(struct prestera_switch *sw)
> > +{
> > +   struct prestera_switchdev *swdev;
> > +   int err;
> > +
> > +   swdev = kzalloc(sizeof(*swdev), GFP_KERNEL);
> > +   if (!swdev)
> > +   return -ENOMEM;
> > +
> > +   sw->swdev = swdev;
> > +   swdev->sw = sw;
> > +
> > +   INIT_LIST_HEAD(&swdev->bridge_list);
> > +
> > +   swdev_wq = alloc_ordered_workqueue("%s_ordered", 0, "prestera_br");
> > +   if (!swdev_wq) {
> > +   err = -ENOMEM;
> > +   goto err_alloc_wq;
> > +   }
> > +
> > +   err = prestera_switchdev_handler_init(swdev);
> > +   if (err)
> > +   goto err_swdev_init;
> > +
> > +   err = prestera_fdb_init(sw);
> > +   if (err)
> > +   goto err_fdb_init;
> > +
> > +   return 0;
> > +
> > +err_fdb_init:
> > +err_swdev_init:
> > +err_alloc_wq:
> > +   kfree(swdev);
> > +
> > +   return err;
> > +}
> > +
> > +void prestera_switchdev_fini(struct prestera_switch *sw)
> > +{
> > +   struct prestera_switchdev *swdev = sw->swdev;
> > +
> > +   prestera_fdb_fini(sw);
> > +   prestera_switchdev_handler_fini(swdev);
> > +   destroy_workqueue(swdev_wq);
> 
> this cleanup is also needed on the error path of prestera_switchdev_init
> 

Thanks! I will fix it.

> > +   kfree(swdev);
> > +}


Re: [PATCH v2 net-next 6/9] bpf: helpers: add bpf_xdp_adjust_mb_header helper

2020-09-04 Thread Lorenzo Bianconi
> Lorenzo Bianconi wrote:
> > Introduce bpf_xdp_adjust_mb_header helper in order to adjust frame
> > headers moving *offset* bytes from/to the second buffer to/from the
> > first one.
> > This helper can be used to move headers when the hw DMA SG is not able
> > to copy all the headers in the first fragment and split header and data
> > pages.
> > 
> > Signed-off-by: Lorenzo Bianconi 
> > ---
> >  include/uapi/linux/bpf.h   | 25 
> >  net/core/filter.c  | 54 ++
> >  tools/include/uapi/linux/bpf.h | 26 
> >  3 files changed, 95 insertions(+), 10 deletions(-)
> > 
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > index 8dda13880957..c4a6d245619c 100644
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > @@ -3571,11 +3571,25 @@ union bpf_attr {
> >   * value.
> >   *
> >   * long bpf_copy_from_user(void *dst, u32 size, const void *user_ptr)
> > - * Description
> > - * Read *size* bytes from user space address *user_ptr* 
> > and store
> > - * the data in *dst*. This is a wrapper of 
> > copy_from_user().
> > - * Return
> > - * 0 on success, or a negative error in case of failure.
> > + * Description
> > + * Read *size* bytes from user space address *user_ptr* and store
> > + * the data in *dst*. This is a wrapper of copy_from_user().
> > + *
> > + * long bpf_xdp_adjust_mb_header(struct xdp_buff *xdp_md, int offset)
> > + * Description
> > + * Adjust frame headers moving *offset* bytes from/to the second
> > + * buffer to/from the first one. This helper can be used to move
> > + * headers when the hw DMA SG does not copy all the headers in
> > + * the first fragment.

+ Eric to the discussion

> 
> This is confusing to read. Does this mean I can "move bytes to the second
> buffer from the first one" or "move bytes from the second buffer to the first
> one" And what are frame headers? I'm sure I can read below code and work
> it out, but users reading the header file should be able to parse this.

Our main goal with this helper is to fix the use-case where we request the hw
to put L2/L3/L4 headers (and all the TCP options) in the first fragment and TCP
data starting from the second fragment (headers split) but for some reasons
the hw puts the TCP options in the second fragment (if we understood correctly
this issue has been introduced by Eric @ NetDevConf 0x14).
bpf_xdp_adjust_mb_header() can fix this use-case by moving bytes from the
second fragment to the first one (offset > 0) or from the first buffer to the
second one (offset < 0).

> 
> Also we want to be able to read all data not just headers. Reading the
> payload of a TCP packet is equally important for many l7 load balancers.
> 

In order to avoid degrading performance, we require that the eBPF sandbox can
read/write only buff0 in an xdp multi-buffer. The xdp program can only
perform some restricted changes to buff (n >= 1) (e.g. what we did in
bpf_xdp_adjust_mb_header()).
You can find the xdp multi-buff design principles here [0][1]

[0] 
https://github.com/xdp-project/xdp-project/blob/master/areas/core/xdp-multi-buffer01-design.org
[1] 
http://people.redhat.com/lbiancon/conference/NetDevConf2020-0x14/add-xdp-on-driver.html
 - XDP multi-buffers section (slide 40)

> > + *
> > + * A call to this helper is susceptible to change the underlying
> > + * packet buffer. Therefore, at load time, all checks on pointers
> > + * previously done by the verifier are invalidated and must be
> > + * performed again, if the helper is used in combination with
> > + * direct packet access.
> > + *
> > + * Return
> > + * 0 on success, or a negative error in case of failure.
> >   */
> >  #define __BPF_FUNC_MAPPER(FN)  \
> > FN(unspec), \
> > @@ -3727,6 +3741,7 @@ union bpf_attr {
> > FN(inode_storage_delete),   \
> > FN(d_path), \
> > FN(copy_from_user), \
> > +   FN(xdp_adjust_mb_header),   \
> > /* */
> >  
> >  /* integer value in 'imm' field of BPF_CALL instruction selects which 
> > helper
> > diff --git a/net/core/filter.c b/net/core/filter.c
> > index 47eef9a0be6a..ae6b10cf062d 100644
> > --- a/net/core/filter.c
> > +++ b/net/core/filter.c
> > @@ -3475,6 +3475,57 @@ static const struct bpf_func_proto 
> > bpf_xdp_adjust_head_proto = {
> > .arg2_type  = ARG_ANYTHING,
> >  };
> >  
> > +BPF_CALL_2(bpf_xdp_adjust_mb_header, struct  xdp_buff *, xdp,
> > +  int, offset)
> > +{
> > +   void *data_hard_end, *data_end;
> > +   struct skb_shared_info *sinfo;
> > +   int frag_offset, frag_len;
> > +   u8 *addr;
> > +
> > +   if (!xdp->mb)
> > +   return -EOPNOTSUPP;
> > +
> > +   sinfo = xdp_get_shared_info_from_buff(xdp);
> > +
> > +   frag_len = skb_frag_size(&sinfo->frags[0])

Re: [PATCH v2 net-next 7/9] bpf: helpers: add multibuffer support

2020-09-04 Thread Lorenzo Bianconi
> On Thu, Sep 03, 2020 at 10:58:51PM +0200, Lorenzo Bianconi wrote:
> > From: Sameeh Jubran 
> > 
> > The implementation is based on this [0] draft by Jesper D. Brouer.
> > 
> > Provided two new helpers:
> > 
> > * bpf_xdp_get_frag_count()
> > * bpf_xdp_get_frags_total_size()
> > 
> > [0] xdp mb design - 
> > https://github.com/xdp-project/xdp-project/blob/master/areas/core/xdp-multi-buffer01-design.org
> > Signed-off-by: Sameeh Jubran 
> > Signed-off-by: Lorenzo Bianconi 
> > ---
> >  include/uapi/linux/bpf.h   | 14 
> >  net/core/filter.c  | 39 ++
> >  tools/include/uapi/linux/bpf.h | 14 
> >  3 files changed, 67 insertions(+)
> > 
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > index c4a6d245619c..53db75095306 100644
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > @@ -3590,6 +3590,18 @@ union bpf_attr {
> >   *
> >   * Return
> >   * 0 on success, or a negative error in case of failure.

[...]

> > +
> 
> I only quickly jumped through series and IMHO this helper should be
> rewritten/optimized in a way that we bail out as early as possible if
> !xdp->mb as the rest of the code on that condition will do nothing and i'm
> not sure if compiler would optimize it.
> 
> 
>   struct skb_shared_info *sinfo;
>   int nfrags, i;
>   int size = 0;
> 
>   if (!xdp->mb)
>   return 0;
> 
>   sinfo = xdp_get_shared_info_from_buff(xdp);
> 
>   nfrags = min(sinfo->nr_frags, MAX_SKB_FRAGS);
> 
>   for (i = 0; i < nfrags; i++)
>   size += skb_frag_size(&sinfo->frags[i]);
> 
>   return size;
> 
> Thoughts?

I agree.

Regards,
Lorenzo

> 
> 
> > +   return size;
> > +}
> > +
> > +const struct bpf_func_proto bpf_xdp_get_frags_total_size_proto = {
> > +   .func   = bpf_xdp_get_frags_total_size,
> > +   .gpl_only   = false,
> > +   .ret_type   = RET_INTEGER,
> > +   .arg1_type  = ARG_PTR_TO_CTX,
> > +};
> > +
> >  BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset)
> >  {
> > void *data_hard_end = xdp_data_hard_end(xdp); /* use xdp->frame_sz */
> > @@ -6889,6 +6924,10 @@ xdp_func_proto(enum bpf_func_id func_id, const 
> > struct bpf_prog *prog)
> > return &bpf_xdp_adjust_tail_proto;
> > case BPF_FUNC_xdp_adjust_mb_header:
> > return &bpf_xdp_adjust_mb_header_proto;
> > +   case BPF_FUNC_xdp_get_frag_count:
> > +   return &bpf_xdp_get_frag_count_proto;
> > +   case BPF_FUNC_xdp_get_frags_total_size:
> > +   return &bpf_xdp_get_frags_total_size_proto;
> > case BPF_FUNC_fib_lookup:
> > return &bpf_xdp_fib_lookup_proto;
> >  #ifdef CONFIG_INET
> > diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
> > index 392d52a2ecef..dd4669096cbb 100644
> > --- a/tools/include/uapi/linux/bpf.h
> > +++ b/tools/include/uapi/linux/bpf.h
> > @@ -3591,6 +3591,18 @@ union bpf_attr {
> >   *
> >   * Return
> >   * 0 on success, or a negative error in case of failure.
> > + *
> > + * int bpf_xdp_get_frag_count(struct xdp_buff *xdp_md)
> > + * Description
> > + * Get the total number of frags for a given packet.
> > + * Return
> > + * The number of frags
> > + *
> > + * int bpf_xdp_get_frags_total_size(struct xdp_buff *xdp_md)
> > + * Description
> > + * Get the total size of frags for a given packet.
> > + * Return
> > + * The total size of frags for a given packet.
> >   */
> >  #define __BPF_FUNC_MAPPER(FN)  \
> > FN(unspec), \
> > @@ -3743,6 +3755,8 @@ union bpf_attr {
> > FN(d_path), \
> > FN(copy_from_user), \
> > FN(xdp_adjust_mb_header),   \
> > +   FN(xdp_get_frag_count), \
> > +   FN(xdp_get_frags_total_size),   \
> > /* */
> >  
> >  /* integer value in 'imm' field of BPF_CALL instruction selects which 
> > helper
> > -- 
> > 2.26.2
> > 


signature.asc
Description: PGP signature


[PATCH net] cxgb4: Fix offset when clearing filter byte counters

2020-09-04 Thread Ganji Aravind
Pass the correct offset to clear the stale filter hit
bytes counter. Otherwise, the counter starts incrementing
from the stale information, instead of 0.

Fixes: 12b276fbf6e0 ("cxgb4: add support to create hash filters")
Signed-off-by: Ganji Aravind 
---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c | 9 ++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c 
b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c
index 650db92cb11c..481498585ead 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c
@@ -1911,13 +1911,16 @@ int cxgb4_del_filter(struct net_device *dev, int 
filter_id,
 static int configure_filter_tcb(struct adapter *adap, unsigned int tid,
struct filter_entry *f)
 {
-   if (f->fs.hitcnts)
+   if (f->fs.hitcnts) {
set_tcb_field(adap, f, tid, TCB_TIMESTAMP_W,
- TCB_TIMESTAMP_V(TCB_TIMESTAMP_M) |
+ TCB_TIMESTAMP_V(TCB_TIMESTAMP_M),
+ TCB_TIMESTAMP_V(0ULL),
+ 1);
+   set_tcb_field(adap, f, tid, TCB_RTT_TS_RECENT_AGE_W,
  TCB_RTT_TS_RECENT_AGE_V(TCB_RTT_TS_RECENT_AGE_M),
- TCB_TIMESTAMP_V(0ULL) |
  TCB_RTT_TS_RECENT_AGE_V(0ULL),
  1);
+   }
 
if (f->fs.newdmac)
set_tcb_tflag(adap, f, tid, TF_CCTRL_ECE_S, 1,
-- 
2.26.2



Re: WARNING: ODEBUG bug in process_one_work (2)

2020-09-04 Thread Thomas Gleixner
On Wed, Sep 02 2020 at 11:18, syzbot wrote:

Cc+: Relevant maintainers

> Hello,
>
> syzbot found the following issue on:
>
> HEAD commit:4d41ead6 Merge tag 'block-5.9-2020-08-28' of git://git.ker..
> git tree:   upstream
> console output: https://syzkaller.appspot.com/x/log.txt?x=1196ce6190
> kernel config:  https://syzkaller.appspot.com/x/.config?x=978db74cb30aa994
> dashboard link: https://syzkaller.appspot.com/bug?extid=91923aae0b157bd6c0c5
> compiler:   gcc (GCC) 10.1.0-syz 20200507
> syz repro:  https://syzkaller.appspot.com/x/repro.syz?x=11b1cbb690
>
> IMPORTANT: if you fix the issue, please add the following tag to the commit:
> Reported-by: syzbot+91923aae0b157bd6c...@syzkaller.appspotmail.com
>
> [ cut here ]
> ODEBUG: free active (active state 0) object type: timer_list hint: 
> xprt_init_autodisconnect+0x0/0x150 include/linux/refcount.h:274
> WARNING: CPU: 1 PID: 8854 at lib/debugobjects.c:485 
> debug_print_object+0x160/0x250 lib/debugobjects.c:485

xprt->timer is still armed at the time when RCU frees xprt

> Kernel panic - not syncing: panic_on_warn set ...
> CPU: 1 PID: 8854 Comm: kworker/1:10 Not tainted 5.9.0-rc2-syzkaller #0
> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS 
> Google 01/01/2011
> Workqueue: events kfree_rcu_work
> Call Trace:
>  __dump_stack lib/dump_stack.c:77 [inline]
>  dump_stack+0x18f/0x20d lib/dump_stack.c:118
>  panic+0x2e3/0x75c kernel/panic.c:231
>  __warn.cold+0x20/0x4a kernel/panic.c:600
>  report_bug+0x1bd/0x210 lib/bug.c:198
>  handle_bug+0x38/0x90 arch/x86/kernel/traps.c:234
>  exc_invalid_op+0x14/0x40 arch/x86/kernel/traps.c:254
>  asm_exc_invalid_op+0x12/0x20 arch/x86/include/asm/idtentry.h:536
> RIP: 0010:debug_print_object+0x160/0x250 lib/debugobjects.c:485
> Code: dd a0 26 94 88 48 89 fa 48 c1 ea 03 80 3c 02 00 0f 85 bf 00 00 00 48 8b 
> 14 dd a0 26 94 88 48 c7 c7 00 1c 94 88 e8 52 38 a6 fd <0f> 0b 83 05 53 4f 13 
> 07 01 48 83 c4 20 5b 5d 41 5c 41 5d c3 48 89
> RSP: 0018:c9000b68fb28 EFLAGS: 00010082
> RAX:  RBX: 0003 RCX: 
> RDX: 8880a216c300 RSI: 815dafc7 RDI: f520016d1f57
> RBP: 0001 R08: 0001 R09: 8880ae720f8b
> R10:  R11: 35383854 R12: 89be2ea0
> R13: 81638450 R14: dead0100 R15: dc00
>  __debug_check_no_obj_freed lib/debugobjects.c:967 [inline]
>  debug_check_no_obj_freed+0x301/0x41c lib/debugobjects.c:998
>  kmem_cache_free_bulk+0x9e/0x190 mm/slab.c:3718
>  kfree_bulk include/linux/slab.h:411 [inline]
>  kfree_rcu_work+0x506/0x8c0 kernel/rcu/tree.c:3150
>  process_one_work+0x94c/0x1670 kernel/workqueue.c:2269
>  worker_thread+0x64c/0x1120 kernel/workqueue.c:2415
>  kthread+0x3b5/0x4a0 kernel/kthread.c:292
>  ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:294
> Kernel Offset: disabled
> Rebooting in 86400 seconds..
>
>
> ---
> This report is generated by a bot. It may contain errors.
> See https://goo.gl/tpsmEJ for more information about syzbot.
> syzbot engineers can be reached at syzkal...@googlegroups.com.
>
> syzbot will keep track of this issue. See:
> https://goo.gl/tpsmEJ#status for how to communicate with syzbot.
> syzbot can test patches for this issue, for details see:
> https://goo.gl/tpsmEJ#testing-patches


Re: [EXT] Re: [net-next PATCH 0/2] Introduce mbox tracepoints for Octeontx2

2020-09-04 Thread Jiri Pirko
Fri, Sep 04, 2020 at 10:49:45AM CEST, sgout...@marvell.com wrote:
>
>
>> -Original Message-
>> From: Jiri Pirko 
>> Sent: Friday, September 4, 2020 2:07 PM
>> To: Sunil Kovvuri Goutham 
>> Cc: Jakub Kicinski ; sundeep.l...@gmail.com;
>> da...@davemloft.net; netdev@vger.kernel.org; Subbaraya Sundeep
>> Bhatta 
>> Subject: Re: [EXT] Re: [net-next PATCH 0/2] Introduce mbox tracepoints for
>> Octeontx2
>> 
>> Fri, Sep 04, 2020 at 07:39:54AM CEST, sgout...@marvell.com wrote:
>> >
>> >
>> >> -Original Message-
>> >> From: Jakub Kicinski 
>> >> Sent: Friday, September 4, 2020 12:48 AM
>> >> To: sundeep.l...@gmail.com
>> >> Cc: da...@davemloft.net; netdev@vger.kernel.org; Sunil Kovvuri
>> >> Goutham ; Subbaraya Sundeep Bhatta
>> >> 
>> >> Subject: [EXT] Re: [net-next PATCH 0/2] Introduce mbox tracepoints
>> >> for
>> >> Octeontx2
>> >>
>> >> External Email
>> >>
>> >> -
>> >> - On Thu,  3 Sep 2020 12:48:16 +0530 sundeep.l...@gmail.com wrote:
>> >> > From: Subbaraya Sundeep 
>> >> >
>> >> > This patchset adds tracepoints support for mailbox.
>> >> > In Octeontx2, PFs and VFs need to communicate with AF for
>> >> > allocating and freeing resources. Once all the configuration is
>> >> > done by AF for a PF/VF then packet I/O can happen on PF/VF queues.
>> >> > When an interface is brought up many mailbox messages are sent to
>> >> > AF for initializing queues. Say a VF is brought up then each
>> >> > message is sent to PF and PF forwards to AF and response also traverses
>> from AF to PF and then VF.
>> >> > To aid debugging, tracepoints are added at places where messages
>> >> > are allocated, sent and message interrupts.
>> >> > Below is the trace of one of the messages from VF to AF and AF
>> >> > response back to VF:
>> >>
>> >> Could you use the devlink tracepoint? trace_devlink_hwmsg() ?
>> >
>> >Thanks for the suggestion.
>> >In our case the mailbox is central to 3 different drivers and there
>> >would be a 4th one once crypto driver is accepted. We cannot add
>> >devlink to all of them inorder to use the devlink trace points.
>> 
>> I guess you have 1 pci device, right? Devlink instance is created per pci
>> device.
>> 
>
>No, there are 3 drivers registering to 3 PCI device IDs and there can be many
>instances of the same devices. So there can be 10's of instances of AF, PF and 
>VFs.

So you can still have per-pci device devlink instance and use the
tracepoint Jakub suggested.


>
>Thanks,
>Sunil.


RE: [EXT] Re: [net-next PATCH 0/2] Introduce mbox tracepoints for Octeontx2

2020-09-04 Thread Sunil Kovvuri Goutham



> -Original Message-
> From: Jiri Pirko 
> Sent: Friday, September 4, 2020 5:41 PM
> To: Sunil Kovvuri Goutham 
> Cc: Jakub Kicinski ; sundeep.l...@gmail.com;
> da...@davemloft.net; netdev@vger.kernel.org; Subbaraya Sundeep
> Bhatta 
> Subject: Re: [EXT] Re: [net-next PATCH 0/2] Introduce mbox tracepoints for
> Octeontx2
> 
> Fri, Sep 04, 2020 at 10:49:45AM CEST, sgout...@marvell.com wrote:
> >
> >
> >> -Original Message-
> >> From: Jiri Pirko 
> >> Sent: Friday, September 4, 2020 2:07 PM
> >> To: Sunil Kovvuri Goutham 
> >> Cc: Jakub Kicinski ; sundeep.l...@gmail.com;
> >> da...@davemloft.net; netdev@vger.kernel.org; Subbaraya Sundeep
> Bhatta
> >> 
> >> Subject: Re: [EXT] Re: [net-next PATCH 0/2] Introduce mbox
> >> tracepoints for
> >> Octeontx2
> >>
> >> Fri, Sep 04, 2020 at 07:39:54AM CEST, sgout...@marvell.com wrote:
> >> >
> >> >
> >> >> -Original Message-
> >> >> From: Jakub Kicinski 
> >> >> Sent: Friday, September 4, 2020 12:48 AM
> >> >> To: sundeep.l...@gmail.com
> >> >> Cc: da...@davemloft.net; netdev@vger.kernel.org; Sunil Kovvuri
> >> >> Goutham ; Subbaraya Sundeep Bhatta
> >> >> 
> >> >> Subject: [EXT] Re: [net-next PATCH 0/2] Introduce mbox tracepoints
> >> >> for
> >> >> Octeontx2
> >> >>
> >> >> External Email
> >> >>
> >> >> --
> >> >> ---
> >> >> - On Thu,  3 Sep 2020 12:48:16 +0530 sundeep.l...@gmail.com wrote:
> >> >> > From: Subbaraya Sundeep 
> >> >> >
> >> >> > This patchset adds tracepoints support for mailbox.
> >> >> > In Octeontx2, PFs and VFs need to communicate with AF for
> >> >> > allocating and freeing resources. Once all the configuration is
> >> >> > done by AF for a PF/VF then packet I/O can happen on PF/VF
> queues.
> >> >> > When an interface is brought up many mailbox messages are sent
> >> >> > to AF for initializing queues. Say a VF is brought up then each
> >> >> > message is sent to PF and PF forwards to AF and response also
> >> >> > traverses
> >> from AF to PF and then VF.
> >> >> > To aid debugging, tracepoints are added at places where messages
> >> >> > are allocated, sent and message interrupts.
> >> >> > Below is the trace of one of the messages from VF to AF and AF
> >> >> > response back to VF:
> >> >>
> >> >> Could you use the devlink tracepoint? trace_devlink_hwmsg() ?
> >> >
> >> >Thanks for the suggestion.
> >> >In our case the mailbox is central to 3 different drivers and there
> >> >would be a 4th one once crypto driver is accepted. We cannot add
> >> >devlink to all of them inorder to use the devlink trace points.
> >>
> >> I guess you have 1 pci device, right? Devlink instance is created per
> >> pci device.
> >>
> >
> >No, there are 3 drivers registering to 3 PCI device IDs and there can
> >be many instances of the same devices. So there can be 10's of instances of
> AF, PF and VFs.
> 
> So you can still have per-pci device devlink instance and use the tracepoint
> Jakub suggested.
> 

Two things
- As I mentioned above, there is a Crypto driver which uses the same mbox APIs
  which is in the process of upstreaming. There also we would need trace points.
  Not sure registering to devlink just for the sake of tracepoint is proper. 

- The devlink trace message is like this

   TRACE_EVENT(devlink_hwmsg,
 . . .
TP_printk("bus_name=%s dev_name=%s driver_name=%s incoming=%d type=%lu 
buf=0x[%*phD] len=%zu",
  __get_str(bus_name), __get_str(dev_name),
  __get_str(driver_name), __entry->incoming, __entry->type,
  (int) __entry->len, __get_dynamic_array(buf), __entry->len)
   );

   Whatever debug message we want as output doesn't fit into this.

Thanks,
Sunil. 


Re: [PATCH v2] net: dsa: microchip: look for phy-mode in port nodes

2020-09-04 Thread Alexandre Belloni
On 04/09/2020 10:14:42+0200, Helmut Grohne wrote:
> Documentation/devicetree/bindings/net/dsa/dsa.txt says that the phy-mode
> property should be specified on port nodes. However, the microchip
> drivers read it from the switch node.
> 
> Let the driver use the per-port property and fall back to the old
> location with a warning.
> 
> Fix in-tree users.
> 
> Signed-off-by: Helmut Grohne 
Acked-by: Alexandre Belloni 

> Link: https://lore.kernel.org/netdev/20200617082235.GA1523@laureti-dev/
> ---
>  arch/arm/boot/dts/at91-sama5d2_icp.dts |  2 +-
>  drivers/net/dsa/microchip/ksz8795.c| 17 +++-
>  drivers/net/dsa/microchip/ksz9477.c| 28 +-
>  drivers/net/dsa/microchip/ksz_common.c | 13 +++-
>  drivers/net/dsa/microchip/ksz_common.h |  3 ++-
>  5 files changed, 45 insertions(+), 18 deletions(-)
> 
> Changes since v1:
>  * Preserve the reverse christmas tree ordering of local variables.
>Reported by David Miller.
> 
> Reason for resending v1:
>  * While Andrew Lunn agreed to the semantic change, he found the
>implementation unnecessarily complex. He suggested going without a
>per-port interface attribute, but that happened to not work out. The
>information of which port will become the cpu port is only realized
>in a later initialization step.
> 
> There were no further replies, so here goes a v2 with minimal changes.
> 
> Helmut
> 
> diff --git a/arch/arm/boot/dts/at91-sama5d2_icp.dts 
> b/arch/arm/boot/dts/at91-sama5d2_icp.dts
> index 8d19925fc09e..6783cf16ff81 100644
> --- a/arch/arm/boot/dts/at91-sama5d2_icp.dts
> +++ b/arch/arm/boot/dts/at91-sama5d2_icp.dts
> @@ -116,7 +116,6 @@
>   switch0: ksz8563@0 {
>   compatible = "microchip,ksz8563";
>   reg = <0>;
> - phy-mode = "mii";
>   reset-gpios = <&pioA PIN_PD4 GPIO_ACTIVE_LOW>;
>  
>   spi-max-frequency = <50>;
> @@ -140,6 +139,7 @@
>   reg = <2>;
>   label = "cpu";
>   ethernet = <&macb0>;
> + phy-mode = "mii";
>   fixed-link {
>   speed = <100>;
>   full-duplex;
> diff --git a/drivers/net/dsa/microchip/ksz8795.c 
> b/drivers/net/dsa/microchip/ksz8795.c
> index 8f1d15ea15d9..cae77eafd533 100644
> --- a/drivers/net/dsa/microchip/ksz8795.c
> +++ b/drivers/net/dsa/microchip/ksz8795.c
> @@ -932,11 +932,18 @@ static void ksz8795_port_setup(struct ksz_device *dev, 
> int port, bool cpu_port)
>   ksz_port_cfg(dev, port, P_PRIO_CTRL, PORT_802_1P_ENABLE, true);
>  
>   if (cpu_port) {
> + if (!p->interface && dev->compat_interface) {
> + dev_warn(dev->dev,
> +  "Using legacy switch \"phy-mode\" missing on 
> port %d node. Please update your device tree.\n",
> +  port);
> + p->interface = dev->compat_interface;
> + }
> +
>   /* Configure MII interface for proper network communication. */
>   ksz_read8(dev, REG_PORT_5_CTRL_6, &data8);
>   data8 &= ~PORT_INTERFACE_TYPE;
>   data8 &= ~PORT_GMII_1GPS_MODE;
> - switch (dev->interface) {
> + switch (p->interface) {
>   case PHY_INTERFACE_MODE_MII:
>   p->phydev.speed = SPEED_100;
>   break;
> @@ -952,11 +959,11 @@ static void ksz8795_port_setup(struct ksz_device *dev, 
> int port, bool cpu_port)
>   default:
>   data8 &= ~PORT_RGMII_ID_IN_ENABLE;
>   data8 &= ~PORT_RGMII_ID_OUT_ENABLE;
> - if (dev->interface == PHY_INTERFACE_MODE_RGMII_ID ||
> - dev->interface == PHY_INTERFACE_MODE_RGMII_RXID)
> + if (p->interface == PHY_INTERFACE_MODE_RGMII_ID ||
> + p->interface == PHY_INTERFACE_MODE_RGMII_RXID)
>   data8 |= PORT_RGMII_ID_IN_ENABLE;
> - if (dev->interface == PHY_INTERFACE_MODE_RGMII_ID ||
> - dev->interface == PHY_INTERFACE_MODE_RGMII_TXID)
> + if (p->interface == PHY_INTERFACE_MODE_RGMII_ID ||
> + p->interface == PHY_INTERFACE_MODE_RGMII_TXID)
>   data8 |= PORT_RGMII_ID_OUT_ENABLE;
>   data8 |= PORT_GMII_1GPS_MODE;
>   data8 |= PORT_INTERFACE_RGMII;
> diff --git a/drivers/net/dsa/microchip/ksz9477.c 
> b/drivers/net/dsa/microchip/ksz9477.c
> index 3cb22d149813..89e8934bc60b 100644
> --- a/drivers/net/dsa/microchip/ksz9477.c
> +++ b/drivers/net/dsa/microchip/ksz9477.c
> @@ -1208,7 +1208,7 @@ static void ksz9

[PATCH net-next] netfilter: ebt_stp: Remove unused macro BPDU_TYPE_TCN

2020-09-04 Thread Wang Hai
BPDU_TYPE_TCN is never used after it was introduced.
So better to remove it.

Reported-by: Hulk Robot 
Signed-off-by: Wang Hai 
---
 net/bridge/netfilter/ebt_stp.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/net/bridge/netfilter/ebt_stp.c b/net/bridge/netfilter/ebt_stp.c
index 0d6d20c9105e..8f68afda5f81 100644
--- a/net/bridge/netfilter/ebt_stp.c
+++ b/net/bridge/netfilter/ebt_stp.c
@@ -15,7 +15,6 @@
 #include 
 
 #define BPDU_TYPE_CONFIG 0
-#define BPDU_TYPE_TCN 0x80
 
 struct stp_header {
u8 dsap;
-- 
2.17.1



[PATCH net-next] caif: Remove duplicate macro SRVL_CTRL_PKT_SIZE

2020-09-04 Thread Wang Hai
Remove SRVL_CTRL_PKT_SIZE which is defined more than once.

Reported-by: Hulk Robot 
Signed-off-by: Wang Hai 
---
 net/caif/cfsrvl.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/net/caif/cfsrvl.c b/net/caif/cfsrvl.c
index d0a4d0ac7045..9cef9496a707 100644
--- a/net/caif/cfsrvl.c
+++ b/net/caif/cfsrvl.c
@@ -21,7 +21,6 @@
 #define SRVL_FLOW_OFF 0x81
 #define SRVL_FLOW_ON  0x80
 #define SRVL_SET_PIN  0x82
-#define SRVL_CTRL_PKT_SIZE 1
 
 #define container_obj(layr) container_of(layr, struct cfsrvl, layer)
 
-- 
2.17.1



[PATCH net-next] NFC: digital: Remove two unused macros

2020-09-04 Thread Wang Hai
DIGITAL_NFC_DEP_REQ_RES_TAILROOM is never used after it was introduced.
DIGITAL_NFC_DEP_REQ_RES_HEADROOM is no longer used since
commit e8e7f4217564 ("NFC: digital: Remove useless call to skb_reserve()").
Remove them.

Reported-by: Hulk Robot 
Signed-off-by: Wang Hai 
---
 net/nfc/digital_dep.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/net/nfc/digital_dep.c b/net/nfc/digital_dep.c
index 304b1a9bb18a..5971fb6f51cc 100644
--- a/net/nfc/digital_dep.c
+++ b/net/nfc/digital_dep.c
@@ -38,9 +38,6 @@
 
 #define DIGITAL_GB_BIT 0x02
 
-#define DIGITAL_NFC_DEP_REQ_RES_HEADROOM   2 /* SoD: [SB (NFC-A)] + LEN */
-#define DIGITAL_NFC_DEP_REQ_RES_TAILROOM   2 /* EoD: 2-byte CRC */
-
 #define DIGITAL_NFC_DEP_PFB_TYPE(pfb) ((pfb) & 0xE0)
 
 #define DIGITAL_NFC_DEP_PFB_TIMEOUT_BIT 0x10
-- 
2.17.1



[PATCH net-next] net/packet: Remove unused macro BLOCK_PRIV

2020-09-04 Thread Wang Hai
BLOCK_PRIV is never used after it was introduced.
So better to remove it.

Reported-by: Hulk Robot 
Signed-off-by: Wang Hai 
---
 net/packet/af_packet.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index da8254e680f9..c430672c6a67 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -177,7 +177,6 @@ static int packet_set_ring(struct sock *sk, union 
tpacket_req_u *req_u,
 #define BLOCK_LEN(x)   ((x)->hdr.bh1.blk_len)
 #define BLOCK_SNUM(x)  ((x)->hdr.bh1.seq_num)
 #define BLOCK_O2PRIV(x)((x)->offset_to_priv)
-#define BLOCK_PRIV(x)  ((void *)((char *)(x) + BLOCK_O2PRIV(x)))
 
 struct packet_sock;
 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
-- 
2.17.1



Re: [PATCH bpf-next 1/2] libbpf: fix another __u64 cast in printf

2020-09-04 Thread Daniel Borkmann

On 9/4/20 6:16 AM, Andrii Nakryiko wrote:

Another issue of __u64 needing either %lu or %llu, depending on the
architecture. Fix with cast to `unsigned long long`.

Fixes: 7e06aad52929 ("libbpf: Add multi-prog section support for struct_ops")
Signed-off-by: Andrii Nakryiko 


Applied, thanks!


[PATCH net-next] rxrpc: Remove unused macro rxrpc_min_rtt_wlen

2020-09-04 Thread Wang Hai
rxrpc_min_rtt_wlen is never used after it was introduced.
So better to remove it.

Reported-by: Hulk Robot 
Signed-off-by: Wang Hai 
---
 net/rxrpc/rtt.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/net/rxrpc/rtt.c b/net/rxrpc/rtt.c
index 928d8b34a3ee..a056c9bcf1d6 100644
--- a/net/rxrpc/rtt.c
+++ b/net/rxrpc/rtt.c
@@ -14,7 +14,6 @@
 #define RXRPC_RTO_MAX  ((unsigned)(120 * HZ))
 #define RXRPC_TIMEOUT_INIT ((unsigned)(1*HZ))  /* RFC6298 2.1 initial RTO 
value*/
 #define rxrpc_jiffies32 ((u32)jiffies) /* As rxrpc_jiffies32 */
-#define rxrpc_min_rtt_wlen 300 /* As sysctl_tcp_min_rtt_wlen */
 
 static u32 rxrpc_rto_min_us(struct rxrpc_peer *peer)
 {
-- 
2.17.1



[PATCH net-next] can: kvaser_pciefd: Remove unused macro KVASER_PCIEFD_KCAN_CTRL_EFRAME

2020-09-04 Thread Wang Hai
KVASER_PCIEFD_KCAN_CTRL_EFRAME is never used after it was introduced.
So better to remove it.

Reported-by: Hulk Robot 
Signed-off-by: Wang Hai 
---
 drivers/net/can/kvaser_pciefd.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/net/can/kvaser_pciefd.c b/drivers/net/can/kvaser_pciefd.c
index 6f766918211a..c0b18ff107c7 100644
--- a/drivers/net/can/kvaser_pciefd.c
+++ b/drivers/net/can/kvaser_pciefd.c
@@ -131,7 +131,6 @@ MODULE_DESCRIPTION("CAN driver for Kvaser CAN/PCIe 
devices");
 
 /* Kvaser KCAN definitions */
 #define KVASER_PCIEFD_KCAN_CTRL_EFLUSH (4 << 29)
-#define KVASER_PCIEFD_KCAN_CTRL_EFRAME (5 << 29)
 
 #define KVASER_PCIEFD_KCAN_CMD_SEQ_SHIFT 16
 /* Request status packet */
-- 
2.17.1



[PATCH net-next] can: peak_canfd: Remove unused macros

2020-09-04 Thread Wang Hai
CANFD_CLK_SEL_DIV_MASK and CANFD_OPTIONS_SET are
never used after they were introduced. Remove them.

Reported-by: Hulk Robot 
Signed-off-by: Wang Hai 
---
 drivers/net/can/peak_canfd/peak_pciefd_main.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/net/can/peak_canfd/peak_pciefd_main.c 
b/drivers/net/can/peak_canfd/peak_pciefd_main.c
index 9469d4421afe..5f0f39d2fa28 100644
--- a/drivers/net/can/peak_canfd/peak_pciefd_main.c
+++ b/drivers/net/can/peak_canfd/peak_pciefd_main.c
@@ -83,7 +83,6 @@ MODULE_LICENSE("GPL v2");
 #define CANFD_MISC_TS_RST  0x0001  /* timestamp cnt rst */
 
 /* CAN-FD channel Clock SELector Source & DIVider */
-#define CANFD_CLK_SEL_DIV_MASK 0x0007
 #define CANFD_CLK_SEL_DIV_60MHZ0x  /* SRC=240MHz 
only */
 #define CANFD_CLK_SEL_DIV_40MHZ0x0001  /* SRC=240MHz 
only */
 #define CANFD_CLK_SEL_DIV_30MHZ0x0002  /* SRC=240MHz 
only */
@@ -116,8 +115,6 @@ MODULE_LICENSE("GPL v2");
 #define CANFD_CTL_IRQ_CL_DEF   16  /* Rx msg max nb per IRQ in Rx DMA */
 #define CANFD_CTL_IRQ_TL_DEF   10  /* Time before IRQ if < CL (x100 µs) */
 
-#define CANFD_OPTIONS_SET  (CANFD_OPTION_ERROR | CANFD_OPTION_BUSLOAD)
-
 /* Tx anticipation window (link logical address should be aligned on 2K
  * boundary)
  */
-- 
2.17.1



[PATCH RFC 2/7] bridge: cfm: Add BRIDGE_CFM to Kconfig.

2020-09-04 Thread Henrik Bjoernlund
This makes it possible to include or exclude the CFM
protocol according to 802.1Q section 12.14.

Signed-off-by: Henrik Bjoernlund  
---
 net/bridge/Kconfig  | 11 +++
 net/bridge/br_device.c  |  3 +++
 net/bridge/br_private.h |  3 +++
 3 files changed, 17 insertions(+)

diff --git a/net/bridge/Kconfig b/net/bridge/Kconfig
index 80879196560c..3c8ded7d3e84 100644
--- a/net/bridge/Kconfig
+++ b/net/bridge/Kconfig
@@ -73,3 +73,14 @@ config BRIDGE_MRP
  Say N to exclude this support and reduce the binary size.
 
  If unsure, say N.
+
+config BRIDGE_CFM
+   bool "CFM protocol"
+   depends on BRIDGE
+   help
+ If you say Y here, then the Ethernet bridge will be able to run CFM
+ protocol according to 802.1Q section 12.14
+
+ Say N to exclude this support and reduce the binary size.
+
+ If unsure, say N.
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index a9232db03108..d12f5626a4b1 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -476,6 +476,9 @@ void br_dev_setup(struct net_device *dev)
INIT_LIST_HEAD(&br->ftype_list);
 #if IS_ENABLED(CONFIG_BRIDGE_MRP)
INIT_LIST_HEAD(&br->mrp_list);
+#endif
+#if IS_ENABLED(CONFIG_BRIDGE_CFM)
+   INIT_LIST_HEAD(&br->mep_list);
 #endif
spin_lock_init(&br->hash_lock);
 
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index e67c6d9e8bea..6294a3e51a33 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -445,6 +445,9 @@ struct net_bridge {
 #if IS_ENABLED(CONFIG_BRIDGE_MRP)
struct list_headmrp_list;
 #endif
+#if IS_ENABLED(CONFIG_BRIDGE_CFM)
+   struct list_headmep_list;
+#endif
 };
 
 struct br_input_skb_cb {
-- 
2.28.0



[PATCH RFC 1/7] net: bridge: extend the process of special frames

2020-09-04 Thread Henrik Bjoernlund
This patch extends the processing of frames in the bridge. Currently MRP
frames need special processing, and the current implementation doesn't
allow a nice way to process different frame types. Therefore try to
improve this by adding a list that contains the frame types that need
special processing. This list is iterated for each input frame, and if
there is a match based on frame type the registered function is called
to decide what to do with the frame. Either it processes the frame, in
which case the bridge doesn't need to do anything, or it doesn't, in
which case the bridge does normal forwarding.

Signed-off-by: Henrik Bjoernlund  
---
 net/bridge/br_device.c  |  1 +
 net/bridge/br_input.c   | 31 ++-
 net/bridge/br_mrp.c | 19 +++
 net/bridge/br_private.h | 18 --
 4 files changed, 58 insertions(+), 11 deletions(-)

diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index 9a2fb4aa1a10..a9232db03108 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -473,6 +473,7 @@ void br_dev_setup(struct net_device *dev)
spin_lock_init(&br->lock);
INIT_LIST_HEAD(&br->port_list);
INIT_HLIST_HEAD(&br->fdb_list);
+   INIT_LIST_HEAD(&br->ftype_list);
 #if IS_ENABLED(CONFIG_BRIDGE_MRP)
INIT_LIST_HEAD(&br->mrp_list);
 #endif
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 59a318b9f646..0f475b21094c 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -254,6 +254,21 @@ static int nf_hook_bridge_pre(struct sk_buff *skb, struct 
sk_buff **pskb)
return RX_HANDLER_CONSUMED;
 }
 
+/* Return 0 if the frame was not processed otherwise 1
+ * note: already called with rcu_read_lock
+ */
+static int br_process_frame_type(struct net_bridge_port *p,
+struct sk_buff *skb)
+{
+   struct br_frame_type *tmp;
+
+   list_for_each_entry_rcu(tmp, &p->br->ftype_list, list) {
+   if (unlikely(tmp->type == skb->protocol))
+   return tmp->func(p, skb);
+   }
+   return 0;
+}
+
 /*
  * Return NULL if skb is handled
  * note: already called with rcu_read_lock
@@ -343,7 +358,7 @@ static rx_handler_result_t br_handle_frame(struct sk_buff 
**pskb)
}
}
 
-   if (unlikely(br_mrp_process(p, skb)))
+   if (unlikely(br_process_frame_type(p, skb)))
return RX_HANDLER_PASS;
 
 forward:
@@ -380,3 +395,17 @@ rx_handler_func_t *br_get_rx_handler(const struct 
net_device *dev)
 
return br_handle_frame;
 }
+
+void br_add_frame(struct net_bridge *br, struct br_frame_type *ft)
+{
+   list_add_rcu(&ft->list, &br->ftype_list);
+}
+
+void br_del_frame(struct net_bridge *br, struct br_frame_type *ft)
+{
+   struct br_frame_type *tmp;
+
+   list_for_each_entry(tmp, &br->ftype_list, list)
+   if (ft == tmp)
+   list_del_rcu(&ft->list);
+}
diff --git a/net/bridge/br_mrp.c b/net/bridge/br_mrp.c
index b36689e6e7cb..0428e1785041 100644
--- a/net/bridge/br_mrp.c
+++ b/net/bridge/br_mrp.c
@@ -6,6 +6,13 @@
 static const u8 mrp_test_dmac[ETH_ALEN] = { 0x1, 0x15, 0x4e, 0x0, 0x0, 0x1 };
 static const u8 mrp_in_test_dmac[ETH_ALEN] = { 0x1, 0x15, 0x4e, 0x0, 0x0, 0x3 
};
 
+static int br_mrp_process(struct net_bridge_port *p, struct sk_buff *skb);
+
+static struct br_frame_type mrp_frame_type __read_mostly = {
+   .type = cpu_to_be16(ETH_P_MRP),
+   .func = br_mrp_process,
+};
+
 static bool br_mrp_is_ring_port(struct net_bridge_port *p_port,
struct net_bridge_port *s_port,
struct net_bridge_port *port)
@@ -445,6 +452,9 @@ static void br_mrp_del_impl(struct net_bridge *br, struct 
br_mrp *mrp)
 
list_del_rcu(&mrp->list);
kfree_rcu(mrp, rcu);
+
+   if (list_empty(&br->mrp_list))
+   br_del_frame(br, &mrp_frame_type);
 }
 
 /* Adds a new MRP instance.
@@ -493,6 +503,9 @@ int br_mrp_add(struct net_bridge *br, struct 
br_mrp_instance *instance)
spin_unlock_bh(&br->lock);
rcu_assign_pointer(mrp->s_port, p);
 
+   if (list_empty(&br->mrp_list))
+   br_add_frame(br, &mrp_frame_type);
+
INIT_DELAYED_WORK(&mrp->test_work, br_mrp_test_work_expired);
INIT_DELAYED_WORK(&mrp->in_test_work, br_mrp_in_test_work_expired);
list_add_tail_rcu(&mrp->list, &br->mrp_list);
@@ -1172,15 +1185,13 @@ static int br_mrp_rcv(struct net_bridge_port *p,
  * normal forwarding.
  * note: already called with rcu_read_lock
  */
-int br_mrp_process(struct net_bridge_port *p, struct sk_buff *skb)
+static int br_mrp_process(struct net_bridge_port *p, struct sk_buff *skb)
 {
/* If there is no MRP instance do normal forwarding */
if (likely(!(p->flags & BR_MRP_AWARE)))
goto out;
 
-   if (unlikely(skb->protocol == htons(ETH_P_MRP)))
-   return br_mrp_rcv(p, skb, p->dev);
-
+   return br_mrp_rcv(p,

[PATCH RFC 3/7] bridge: uapi: cfm: Added EtherType used by the CFM protocol.

2020-09-04 Thread Henrik Bjoernlund
This EtherType is used by all CFM protocol frames transmitted
according to 802.1Q section 12.14.

Signed-off-by: Henrik Bjoernlund  
---
 include/uapi/linux/if_ether.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h
index d6de2b167448..a0b637911d3c 100644
--- a/include/uapi/linux/if_ether.h
+++ b/include/uapi/linux/if_ether.h
@@ -99,6 +99,7 @@
 #define ETH_P_1588 0x88F7  /* IEEE 1588 Timesync */
 #define ETH_P_NCSI 0x88F8  /* NCSI protocol*/
 #define ETH_P_PRP  0x88FB  /* IEC 62439-3 PRP/HSRv0*/
+#define ETH_P_CFM  0x8902  /* Connectivity Fault Management */
 #define ETH_P_FCOE 0x8906  /* Fibre Channel over Ethernet  */
 #define ETH_P_IBOE 0x8915  /* Infiniband over Ethernet */
 #define ETH_P_TDLS 0x890D  /* TDLS */
-- 
2.28.0



[PATCH RFC 0/7] net: bridge: cfm: Add support for Connectivity Fault Management(CFM)

2020-09-04 Thread Henrik Bjoernlund
Connectivity Fault Management (CFM) is defined in 802.1Q section 12.14.

Connectivity Fault Management (CFM) comprises capabilities for
detecting, verifying, and isolating connectivity failures in
Virtual Bridged Networks. These capabilities can be used in
networks operated by multiple independent organizations, each
with restricted management access to each other’s equipment.

CFM functions are partitioned as follows:
— Path discovery
— Fault detection
— Fault verification and isolation
— Fault notification
— Fault recovery

The primary CFM protocol shims are called Maintenance Points (MPs).
A MP can be either a MEP or a MHF.
The MEP:
-It is the Maintenance association End Point
 described in 802.1Q section 19.2.
-It is created on a specific level (1-7) and is assuring
 that no CFM frames are passing through this MEP on lower levels.
-It initiates and terminates/validates CFM frames on its level.
-It can only exist on a port that is related to a bridge.
The MHF:
-It is the Maintenance Domain Intermediate Point
 (MIP) Half Function (MHF) described in 802.1Q section 19.3.
-It is created on a specific level (1-7).
-It is extracting/injecting certain CFM frame on this level.
-It can only exist on a port that is related to a bridge.
-Currently not supported.

The following CFM protocol functions are defined:
-Continuity Check
-Loopback. Currently not supported.
-Linktrace. Currently not supported.

This CFM component supports create/delete of MEP instances and
configuration of the different CFM protocols. Also status information
can be fetched and delivered through notification due to defect status
change.

The user interacts with CFM using the 'cfm' user space client program, the
client talks with the kernel using netlink. The kernel will try to offload
the requests to the HW via switchdev API (not implemented yet).

Any notification emitted by CFM from the kernel can be monitored in user
space by starting 'cfm_server' program.

Currently the 'cfm' and 'cfm_server' programs are standalone and live in the
cfm repository https://github.com/microchip-ung/cfm, but integrating them
into 'iproute2' is being considered.

Reviewed-by: Horatiu Vultur  
Signed-off-by: Henrik Bjoernlund  

Henrik Bjoernlund (7):
  net: bridge: extend the process of special frames
  bridge: cfm: Add BRIDGE_CFM to Kconfig.
  bridge: uapi: cfm: Added EtherType used by the CFM protocol.
  bridge: cfm: Kernel space implementation of CFM.
  bridge: cfm: Netlink Interface.
  bridge: cfm: Netlink Notifications.
  bridge: cfm: Bridge port remove.

 include/uapi/linux/cfm_bridge.h |  75 +++
 include/uapi/linux/if_bridge.h  | 125 +
 include/uapi/linux/if_ether.h   |   1 +
 include/uapi/linux/rtnetlink.h  |   2 +
 net/bridge/Kconfig  |  11 +
 net/bridge/Makefile |   2 +
 net/bridge/br_cfm.c | 936 
 net/bridge/br_cfm_netlink.c | 690 +++
 net/bridge/br_device.c  |   4 +
 net/bridge/br_if.c  |   1 +
 net/bridge/br_input.c   |  31 +-
 net/bridge/br_mrp.c |  19 +-
 net/bridge/br_netlink.c | 126 -
 net/bridge/br_private.h |  82 ++-
 net/bridge/br_private_cfm.h | 242 +
 15 files changed, 2326 insertions(+), 21 deletions(-)
 create mode 100644 include/uapi/linux/cfm_bridge.h
 create mode 100644 net/bridge/br_cfm.c
 create mode 100644 net/bridge/br_cfm_netlink.c
 create mode 100644 net/bridge/br_private_cfm.h

-- 
2.28.0



[PATCH RFC 4/7] bridge: cfm: Kernel space implementation of CFM.

2020-09-04 Thread Henrik Bjoernlund
This is the implementation of the CFM protocol according to
802.1Q section 12.14.

Connectivity Fault Management (CFM) comprises capabilities for
detecting, verifying, and isolating connectivity failures in
Virtual Bridged Networks. These capabilities can be used in
networks operated by multiple independent organizations, each
with restricted management access to each other’s equipment.

CFM functions are partitioned as follows:
- Path discovery
- Fault detection
- Fault verification and isolation
- Fault notification
- Fault recovery

Interface consists of these functions:
br_cfm_mep_create()
br_cfm_mep_delete()
br_cfm_mep_config_set()
br_cfm_mep_status_get()
br_cfm_mep_counters_get()
br_cfm_mep_counters_clear()
br_cfm_cc_config_set()
br_cfm_cc_peer_mep_add()
br_cfm_cc_peer_mep_remove()
br_cfm_cc_rdi_set()
br_cfm_cc_ccm_tx()
br_cfm_cc_status_get()
br_cfm_cc_counters_get()
br_cfm_cc_counters_clear()
br_cfm_cc_peer_status_get()

A MEP instance is created by br_cfm_mep_create()
-It is the Maintenance association End Point
 described in 802.1Q section 19.2.
-It is created on a specific level (1-7) and is assuring
 that no CFM frames are passing through this MEP on lower levels.
-It initiates and validates CFM frames on its level.
-It can only exist on a port that is related to a bridge.
-Attributes given cannot be changed until the instance is
 deleted.

A MEP instance can be deleted by br_cfm_mep_delete().

A created MEP instance has attributes that can be
configured by br_cfm_mep_config_set().

A MEP contains status and counter information that can be
retrieved by br_cfm_mep_status_get() and
br_cfm_mep_counters_get().

The counters of a MEP can be cleared by br_cfm_mep_counters_clear().

The Continuity Check feature of a MEP can be configured by
br_cfm_cc_config_set().
The Continuity Check Receiver state machine can be
enabled and disabled,
according to 802.1Q section 19.2.8.

A MEP can have Peer MEPs added and removed by
br_cfm_cc_peer_mep_add() and br_cfm_cc_peer_mep_remove()
The Continuity Check feature can maintain connectivity
status on each added Peer MEP.

A MEP can be configured to start or stop transmission of CCM frames by
br_cfm_cc_ccm_tx()
The CCM will be transmitted for a selected period in seconds.
Must call this function before timeout to keep transmission alive.

A MEP transmitting CCM can be configured with inserted RDI in PDU by
br_cfm_cc_rdi_set()

A MEP contains Continuity Check status and counter information
that can be retrieved by br_cfm_cc_status_get() and
br_cfm_cc_counters_get().

The Continuity Check counters of a MEP can be cleared
by br_cfm_cc_counters_clear().

A MEP contains Peer MEP Continuity Check status information that
can be retrieved by br_cfm_cc_peer_status_get().

Signed-off-by: Henrik Bjoernlund  
---
 include/uapi/linux/cfm_bridge.h |  75 +++
 net/bridge/Makefile |   2 +
 net/bridge/br_cfm.c | 880 
 net/bridge/br_private.h |  16 +
 net/bridge/br_private_cfm.h | 242 +
 5 files changed, 1215 insertions(+)
 create mode 100644 include/uapi/linux/cfm_bridge.h
 create mode 100644 net/bridge/br_cfm.c
 create mode 100644 net/bridge/br_private_cfm.h

diff --git a/include/uapi/linux/cfm_bridge.h b/include/uapi/linux/cfm_bridge.h
new file mode 100644
index ..389ea1e1f68e
--- /dev/null
+++ b/include/uapi/linux/cfm_bridge.h
@@ -0,0 +1,75 @@
+/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
+
+#ifndef _UAPI_LINUX_CFM_BRIDGE_H_
+#define _UAPI_LINUX_CFM_BRIDGE_H_
+
+#include 
+#include 
+
+#define ETHER_HEADER_LENGTH(6+6+4+2)
+#define CFM_MAID_LENGTH48
+#define CFM_CCM_PDU_LENGTH 75
+#define CFM_PORT_STATUS_TLV_LENGTH 4
+#define CFM_IF_STATUS_TLV_LENGTH   4
+#define CFM_IF_STATUS_TLV_TYPE 4
+#define CFM_PORT_STATUS_TLV_TYPE   2
+#define CFM_ENDE_TLV_TYPE  0
+#define CFM_CCM_MAX_FRAME_LENGTH   (ETHER_HEADER_LENGTH+\
+CFM_CCM_PDU_LENGTH+\
+CFM_PORT_STATUS_TLV_LENGTH+\
+CFM_IF_STATUS_TLV_LENGTH)
+#define CFM_FRAME_PRIO 7
+#define CFM_CCM_OPCODE 1
+#define CFM_CCM_TLV_OFFSET 70
+#define CFM_CCM_PDU_MAID_OFFSET10
+#define CFM_CCM_PDU_MEPID_OFFSET   8
+#define CFM_CCM_PDU_SEQNR_OFFSET   4
+#define CFM_CCM_PDU_TLV_OFFSET 74
+#define CFM_CCM_ITU_RESERVED_SIZE  16
+
+struct br_cfm_common_hdr {
+   __u8 mdlevel_version;
+   __u8 opcode;
+   __u8 flags;
+   __u8 tlv_offset;
+};
+
+struct br_cfm_status_tlv {
+   __u8 type;
+   __be16 length;
+   __u8 value;
+};
+
+enum br_cfm_opcodes {
+   BR_CFM_OPCODE_CCM = 0x1,
+   BR_CFM_OPCODE_LBR = 0x2,
+   BR_CFM_OPCODE_LBM = 0x3,
+   BR_CFM_OPCODE_LTR = 0x4,
+   BR_CFM_OPC

[PATCH RFC 7/7] bridge: cfm: Bridge port remove.

2020-09-04 Thread Henrik Bjoernlund
This is the addition of CFM functionality to delete MEP instances
on a port that is removed from the bridge.
A MEP can only exist on a port that is related to a bridge.

Signed-off-by: Henrik Bjoernlund  
---
 net/bridge/br_cfm.c | 13 +
 net/bridge/br_if.c  |  1 +
 net/bridge/br_private.h |  6 ++
 3 files changed, 20 insertions(+)

diff --git a/net/bridge/br_cfm.c b/net/bridge/br_cfm.c
index b7fed2c1d8ec..c724ce020ce3 100644
--- a/net/bridge/br_cfm.c
+++ b/net/bridge/br_cfm.c
@@ -921,3 +921,16 @@ bool br_cfm_created(struct net_bridge *br)
 {
return !list_empty(&br->mep_list);
 }
+
+/* Deletes the CFM instances on a specific bridge port
+ * note: called under rtnl_lock
+ */
+void br_cfm_port_del(struct net_bridge *br, struct net_bridge_port *port)
+{
+   struct br_cfm_mep *mep;
+
+   list_for_each_entry_rcu(mep, &br->mep_list, head,
+   lockdep_rtnl_is_held())
+   if (mep->create.ifindex == port->dev->ifindex)
+   mep_delete_implementation(br, mep);
+}
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index a0e9a7937412..f7d2f472ae24 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -334,6 +334,7 @@ static void del_nbp(struct net_bridge_port *p)
spin_unlock_bh(&br->lock);
 
br_mrp_port_del(br, p);
+   br_cfm_port_del(br, p);
 
br_ifinfo_notify(RTM_DELLINK, NULL, p);
 
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 53bcbdd21f34..5617255f0c0c 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -1369,6 +1369,7 @@ int br_cfm_parse(struct net_bridge *br, struct 
net_bridge_port *p,
 struct nlattr *attr, int cmd, struct netlink_ext_ack *extack);
 int br_cfm_rx_frame_process(struct net_bridge_port *p, struct sk_buff *skb);
 bool br_cfm_created(struct net_bridge *br);
+void br_cfm_port_del(struct net_bridge *br, struct net_bridge_port *p);
 int br_cfm_config_fill_info(struct sk_buff *skb, struct net_bridge *br);
 int br_cfm_status_fill_info(struct sk_buff *skb,
struct net_bridge *br,
@@ -1393,6 +1394,11 @@ static inline bool br_cfm_created(struct net_bridge *br)
return false;
 }
 
+static inline void br_cfm_port_del(struct net_bridge *br,
+  struct net_bridge_port *p)
+{
+}
+
 static inline int br_cfm_config_fill_info(struct sk_buff *skb, struct 
net_bridge *br)
 {
return -EOPNOTSUPP;
-- 
2.28.0



[PATCH RFC 6/7] bridge: cfm: Netlink Notifications.

2020-09-04 Thread Henrik Bjoernlund
This is the implementation of Netlink notifications out of CFM.

Notifications are initiated whenever a state change happens in CFM.

IFLA_BRIDGE_CFM:
Points to the CFM information.

IFLA_BRIDGE_CFM_MEP_STATUS_INFO:
This indicates that the MEP instance status follows.
IFLA_BRIDGE_CFM_CC_PEER_STATUS_INFO:
This indicates that the peer MEP status follows.

CFM nested attribute has the following attributes in next level.

IFLA_BRIDGE_CFM_MEP_STATUS_INSTANCE:
The MEP instance number of the delivered status.
The type is NLA_U32.
IFLA_BRIDGE_CFM_MEP_STATUS_OPCODE_UNEXP_SEEN:
The MEP instance received CFM PDU with unexpected Opcode.
The type is NLA_U32 (bool).
IFLA_BRIDGE_CFM_MEP_STATUS_VERSION_UNEXP_SEEN:
The MEP instance received CFM PDU with unexpected version.
The type is NLA_U32 (bool).
IFLA_BRIDGE_CFM_MEP_STATUS_RX_LEVEL_LOW_SEEN:
The MEP instance received CCM PDU with MD level lower than
configured level. This frame is discarded.
The type is NLA_U32 (bool).

IFLA_BRIDGE_CFM_CC_PEER_STATUS_INSTANCE:
The MEP instance number of the delivered status.
The type is NLA_U32.
IFLA_BRIDGE_CFM_CC_PEER_STATUS_PEER_MEPID:
The added Peer MEP ID of the delivered status.
The type is NLA_U32.
IFLA_BRIDGE_CFM_CC_PEER_STATUS_CCM_DEFECT:
The CCM defect status.
The type is NLA_U32 (bool).
True means no CCM frame is received for 3.25 intervals
of IFLA_BRIDGE_CFM_CC_CONFIG_EXP_INTERVAL.
IFLA_BRIDGE_CFM_CC_PEER_STATUS_RDI:
The last received CCM PDU RDI.
The type is NLA_U32 (bool).
IFLA_BRIDGE_CFM_CC_PEER_STATUS_PORT_TLV_VALUE:
The last received CCM PDU Port Status TLV value field.
The type is NLA_U8.
IFLA_BRIDGE_CFM_CC_PEER_STATUS_IF_TLV_VALUE:
The last received CCM PDU Interface Status TLV value field.
The type is NLA_U8.
IFLA_BRIDGE_CFM_CC_PEER_STATUS_SEEN:
A CCM frame has been received from Peer MEP.
The type is NLA_U32 (bool).
This is cleared after GETLINK IFLA_BRIDGE_CFM_CC_PEER_STATUS_INFO.
IFLA_BRIDGE_CFM_CC_PEER_STATUS_TLV_SEEN:
A CCM frame with TLV has been received from Peer MEP.
The type is NLA_U32 (bool).
This is cleared after GETLINK IFLA_BRIDGE_CFM_CC_PEER_STATUS_INFO.
IFLA_BRIDGE_CFM_CC_PEER_STATUS_SEQ_UNEXP_SEEN:
A CCM frame with unexpected sequence number has been received
from Peer MEP.
The type is NLA_U32 (bool).
When a sequence number is not one higher than previously received
then it is unexpected.
This is cleared after GETLINK IFLA_BRIDGE_CFM_CC_PEER_STATUS_INFO.

Signed-off-by: Henrik Bjoernlund  
---
 net/bridge/br_cfm.c | 43 +
 net/bridge/br_cfm_netlink.c | 24 +++-
 net/bridge/br_netlink.c | 76 -
 net/bridge/br_private.h | 22 ++-
 4 files changed, 144 insertions(+), 21 deletions(-)

diff --git a/net/bridge/br_cfm.c b/net/bridge/br_cfm.c
index e38cc3e8f262..b7fed2c1d8ec 100644
--- a/net/bridge/br_cfm.c
+++ b/net/bridge/br_cfm.c
@@ -155,6 +155,13 @@ static void ccm_rx_timer_start(struct br_cfm_peer_mep 
*peer_mep)
   usecs_to_jiffies(interval_us / 4));
 }
 
+static void br_cfm_notify(int event, const struct net_bridge_port *port)
+{
+   u32 filter = RTEXT_FILTER_CFM_STATUS;
+
+   return br_info_notify(event, port->br, NULL, filter);
+}
+
 static void cc_peer_enable(struct br_cfm_peer_mep *peer_mep)
 {
memset(&peer_mep->cc_status, 0, sizeof(peer_mep->cc_status));
@@ -299,6 +306,7 @@ static void ccm_tx_work_expired(struct work_struct *work)
 static void ccm_rx_work_expired(struct work_struct *work)
 {
struct br_cfm_peer_mep *peer_mep;
+   struct net_bridge_port *b_port;
struct delayed_work *del_work;
 
del_work = to_delayed_work(work);
@@ -318,6 +326,11 @@ static void ccm_rx_work_expired(struct work_struct *work)
peer_mep->cc_status.ccm_defect = true;
 
/* Change in CCM defect status - notify */
+   rcu_read_lock();
+   b_port = rcu_dereference(peer_mep->mep->b_port);
+   if (b_port)
+   br_cfm_notify(RTM_NEWLINK, b_port);
+   rcu_read_unlock();
}
 }
 
@@ -445,6 +458,7 @@ static int br_cfm_frame_rx(struct net_bridge_port *port, 
struct sk_buff *skb)
peer_mep->cc_status.ccm_defect = false;
 
/* Change in CCM defect status - notify */
+   br_cfm_notify(RTM_NEWLINK, port);
 
/* Start CCM RX timer */
ccm_rx_timer_start(peer_mep);
@@ -874,6 +888,35 @@ int br_cfm_cc_counters_clear(struct net_bridge *br, const 
u32 instance,
return 0;
 }
 
+int br_cfm_mep_count(struct net_bridge *br, u32 *count)
+{
+   struct br_cfm_mep *mep;
+   *count = 0;
+
+   rcu_read_lock();
+   list_for_each_entry_rcu(mep, &br->mep_list, head)
+   * count += 1;
+   rcu_read_

[PATCH RFC 5/7] bridge: cfm: Netlink Interface.

2020-09-04 Thread Henrik Bjoernlund
This is the implementation of CFM netlink configuration
and status information interface.

Add new nested netlink attributes. These attributes are used by the
user space to create/delete/configure CFM instances and get status.
They are also used by the kernel to notify the user space when a change
in any status happens.

SETLINK:
IFLA_BRIDGE_CFM:
Indicate that the following attributes are CFM.

IFLA_BRIDGE_CFM_MEP_CREATE:
This indicates that a MEP instance must be created.
IFLA_BRIDGE_CFM_MEP_DELETE:
This indicates that a MEP instance must be deleted.
IFLA_BRIDGE_CFM_MEP_CONFIG:
This indicates that a MEP instance must be configured.
IFLA_BRIDGE_CFM_CC_CONFIG:
This indicates that a MEP instance Continuity Check (CC)
functionality must be configured.
IFLA_BRIDGE_CFM_CC_PEER_MEP_ADD:
This indicates that a CC Peer MEP must be added.
IFLA_BRIDGE_CFM_CC_PEER_MEP_REMOVE:
This indicates that a CC Peer MEP must be removed.
IFLA_BRIDGE_CFM_CC_CCM_TX:
This indicates that the CC transmitted CCM PDU must be configured.
IFLA_BRIDGE_CFM_CC_RDI:
This indicates that the CC transmitted CCM PDU RDI must be
configured.

GETLINK:
Request filter RTEXT_FILTER_CFM_CONFIG:
Indicating that CFM configuration information must be delivered.

IFLA_BRIDGE_CFM:
Points to the CFM information.

IFLA_BRIDGE_CFM_MEP_CREATE_INFO:
This indicates that MEP instance create parameters follow.
IFLA_BRIDGE_CFM_MEP_CONFIG_INFO:
This indicates that MEP instance config parameters follow.
IFLA_BRIDGE_CFM_CC_CONFIG_INFO:
This indicates that MEP instance CC functionality
parameters follow.
IFLA_BRIDGE_CFM_CC_RDI_INFO:
This indicates that CC transmitted CCM PDU RDI
parameters follow.
IFLA_BRIDGE_CFM_CC_CCM_TX_INFO:
This indicates that CC transmitted CCM PDU parameters
follow.
IFLA_BRIDGE_CFM_CC_PEER_MEP_INFO:
This indicates that the added peer MEP IDs follow.

Request filter RTEXT_FILTER_CFM_STATUS:
Indicating that CFM status information must be delivered.

IFLA_BRIDGE_CFM:
Points to the CFM information.

IFLA_BRIDGE_CFM_MEP_STATUS_INFO:
This indicates that the MEP instance status follows.
IFLA_BRIDGE_CFM_CC_PEER_STATUS_INFO:
This indicates that the peer MEP status follows.

CFM nested attribute has the following attributes in next level.

SETLINK and GETLINK RTEXT_FILTER_CFM_CONFIG:
IFLA_BRIDGE_CFM_MEP_CREATE_INSTANCE:
The created MEP instance number.
The type is u32.
IFLA_BRIDGE_CFM_MEP_CREATE_DOMAIN:
The created MEP domain.
The type is u32 (br_cfm_domain).
It must be BR_CFM_PORT.
This means that CFM frames are transmitted and received
directly on the port - untagged. Not in a VLAN.
IFLA_BRIDGE_CFM_MEP_CREATE_DIRECTION:
The created MEP direction.
The type is u32 (br_cfm_mep_direction).
It must be BR_CFM_MEP_DIRECTION_DOWN.
This means that CFM frames are transmitted and received on
the port. Not in the bridge.
IFLA_BRIDGE_CFM_MEP_CREATE_IFINDEX:
The created MEP residence port ifindex.
The type is u32 (ifindex).

IFLA_BRIDGE_CFM_MEP_DELETE_INSTANCE:
The deleted MEP instance number.
The type is u32.

IFLA_BRIDGE_CFM_MEP_CONFIG_INSTANCE:
The configured MEP instance number.
The type is u32.
IFLA_BRIDGE_CFM_MEP_CONFIG_UNICAST_MAC:
The configured MEP unicast MAC address.
The type is 6*u8 (array).
This is used as SMAC in all transmitted CFM frames.
IFLA_BRIDGE_CFM_MEP_CONFIG_MDLEVEL:
The configured MEP unicast MD level.
The type is u32.
It must be in the range 1-7.
No CFM frames are passing through this MEP on lower levels.
IFLA_BRIDGE_CFM_MEP_CONFIG_MEPID:
The configured MEP ID.
The type is u32.
It must be in the range 0-0x1FFF.
This MEP ID is inserted in any transmitted CCM frame.

IFLA_BRIDGE_CFM_CC_CONFIG_INSTANCE:
The configured MEP instance number.
The type is u32.
IFLA_BRIDGE_CFM_CC_CONFIG_ENABLE:
The Continuity Check (CC) functionality is enabled or disabled.
The type is u32 (bool).
IFLA_BRIDGE_CFM_CC_CONFIG_EXP_INTERVAL:
The CC expected receive interval of CCM frames.
The type is u32 (br_cfm_ccm_interval).
This is also the transmission interval of CCM frames when enabled.
IFLA_BRIDGE_CFM_CC_CONFIG_EXP_MAID:
The CC expected receive MAID in CCM frames.
The type is CFM_MAID_LENGTH*u8.
This MAID is also inserted in transmitted CCM frames.

IFLA_BRIDGE_CFM_CC_PEER_MEP_INSTANCE:
The configured MEP instance number.
The type is u32.
IFLA_BRIDGE_CFM_CC_PEER_MEPID:

Re: [PATCH net-next] net/packet: Remove unused macro BLOCK_PRIV

2020-09-04 Thread Willem de Bruijn
On Fri, Sep 4, 2020 at 3:09 PM Wang Hai  wrote:
>
> BPDU_TYPE_TCN is never used after it was introduced.
> So better to remove it.

This comment does not cover the patch contents. Otherwise the patch
looks good to me.

> Reported-by: Hulk Robot 
> Signed-off-by: Wang Hai 
> ---
>  net/packet/af_packet.c | 1 -
>  1 file changed, 1 deletion(-)
>
> diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
> index da8254e680f9..c430672c6a67 100644
> --- a/net/packet/af_packet.c
> +++ b/net/packet/af_packet.c
> @@ -177,7 +177,6 @@ static int packet_set_ring(struct sock *sk, union 
> tpacket_req_u *req_u,
>  #define BLOCK_LEN(x)   ((x)->hdr.bh1.blk_len)
>  #define BLOCK_SNUM(x)  ((x)->hdr.bh1.seq_num)
>  #define BLOCK_O2PRIV(x)((x)->offset_to_priv)
> -#define BLOCK_PRIV(x)  ((void *)((char *)(x) + BLOCK_O2PRIV(x)))
>
>  struct packet_sock;
>  static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
> --
> 2.17.1
>


[PATCH] net/packet: fix overflow in tpacket_rcv

2020-09-04 Thread Stefan Nuernberger
From: Or Cohen 

Using tp_reserve to calculate netoff can overflow as
tp_reserve is unsigned int and netoff is unsigned short.

This may lead to macoff receiving a smaller value than
sizeof(struct virtio_net_hdr), and if po->has_vnet_hdr
is set, an out-of-bounds write will occur when
calling virtio_net_hdr_from_skb.

The bug is fixed by converting netoff to unsigned int
and checking if it exceeds USHRT_MAX.

This addresses CVE-2020-14386

Fixes: 8913336a7e8d ("packet: add PACKET_RESERVE sockopt")
Signed-off-by: Or Cohen 
Signed-off-by: Eric Dumazet 

[ snu: backported to 4.9, changed tp_drops counting/locking ]

Signed-off-by: Stefan Nuernberger 
CC: David Woodhouse 
CC: Amit Shah 
CC: sta...@vger.kernel.org
---
 net/packet/af_packet.c | 9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index fb643945e424..b5b79f501541 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2161,7 +2161,8 @@ static int tpacket_rcv(struct sk_buff *skb, struct 
net_device *dev,
int skb_len = skb->len;
unsigned int snaplen, res;
unsigned long status = TP_STATUS_USER;
-   unsigned short macoff, netoff, hdrlen;
+   unsigned short macoff, hdrlen;
+   unsigned int netoff;
struct sk_buff *copy_skb = NULL;
struct timespec ts;
__u32 ts_status;
@@ -2223,6 +2224,12 @@ static int tpacket_rcv(struct sk_buff *skb, struct 
net_device *dev,
}
macoff = netoff - maclen;
}
+   if (netoff > USHRT_MAX) {
+   spin_lock(&sk->sk_receive_queue.lock);
+   po->stats.stats1.tp_drops++;
+   spin_unlock(&sk->sk_receive_queue.lock);
+   goto drop_n_restore;
+   }
if (po->tp_version <= TPACKET_V2) {
if (macoff + snaplen > po->rx_ring.frame_size) {
if (po->copy_thresh &&
-- 
2.28.0




Amazon Development Center Germany GmbH
Krausenstr. 38
10117 Berlin
Geschaeftsfuehrung: Christian Schlaeger, Jonathan Weiss
Eingetragen am Amtsgericht Charlottenburg unter HRB 149173 B
Sitz: Berlin
Ust-ID: DE 289 237 879





Re: [PATCH net-next 0/3] net: phy: Support enabling clocks prior to bus probe

2020-09-04 Thread Andrew Lunn
> Just a bunch of questions.
> 
> Actually, why is it necessary to have a full MDIO bus scan already during
> probing peripherals?

That is the Linux bus model. It does not matter what sort of bus it
is, PCI, USB, MDIO, etc. When the bus driver is loaded, the bus is
enumerated and drivers probe for each device found on the bus.

> I'd say that it is not necessary to have a PHY getting found before it is
> needed to setup the complete interface.

It is like saying, we don't need to probe the keyboard until the first
time the "Press Enter" prompt is given?

 Andrew


Re: [PATCH v2 net-next 6/9] bpf: helpers: add bpf_xdp_adjust_mb_header helper

2020-09-04 Thread Jesper Dangaard Brouer
On Fri, 4 Sep 2020 09:50:31 +0200
Lorenzo Bianconi  wrote:

> > On Thu, Sep 03, 2020 at 10:58:50PM +0200, Lorenzo Bianconi wrote:  
> > > +BPF_CALL_2(bpf_xdp_adjust_mb_header, struct  xdp_buff *, xdp,
> > > +int, offset)
> > > +{
> > > + void *data_hard_end, *data_end;
> > > + struct skb_shared_info *sinfo;
> > > + int frag_offset, frag_len;
> > > + u8 *addr;
> > > +
> > > + if (!xdp->mb)
> > > + return -EOPNOTSUPP;
> > > +
> > > + sinfo = xdp_get_shared_info_from_buff(xdp);
> > > +
> > > + frag_len = skb_frag_size(&sinfo->frags[0]);
> > > + if (offset > frag_len)
> > > + return -EINVAL;
> > > +
> > > + frag_offset = skb_frag_off(&sinfo->frags[0]);
> > > + data_end = xdp->data_end + offset;
> > > +
> > > + if (offset < 0 && (-offset > frag_offset ||
> > > +data_end < xdp->data + ETH_HLEN))
> > > + return -EINVAL;
> > > +
> > > + data_hard_end = xdp_data_hard_end(xdp); /* use xdp->frame_sz */
> > > + if (data_end > data_hard_end)
> > > + return -EINVAL;
> > > +
> > > + addr = page_address(skb_frag_page(&sinfo->frags[0])) + frag_offset;
> > > + if (offset > 0) {
> > > + memcpy(xdp->data_end, addr, offset);
> > > + } else {
> > > + memcpy(addr + offset, xdp->data_end + offset, -offset);
> > > + memset(xdp->data_end + offset, 0, -offset);
> > > + }
> > > +
> > > + skb_frag_size_sub(&sinfo->frags[0], offset);
> > > + skb_frag_off_add(&sinfo->frags[0], offset);
> > > + xdp->data_end = data_end;
> > > +
> > > + return 0;
> > > +}  
> > 
> > wait a sec. Are you saying that multi buffer XDP actually should be skb 
> > based?
> > If that's what mvneta driver is doing that's fine, but that is not a
> > reasonable requirement to put on all other drivers.  
> 
> I did not get what you mean here. The xdp multi-buffer layout uses
> the skb_shared_info at the end of the first buffer to link subsequent
> frames [0] and we rely on skb_frag* utilities to set/read offset and
> length of subsequent buffers.

Yes, for now the same layout as "skb_shared_info" is reused, but I
think we should think of this as "xdp_shared_info" instead, as how it
is used for XDP is going to diverge from SKBs.  We already discussed (in
conf call) that we could store the total len of "frags" here, to
simplify the other helper.

Using the skb_frag_* helper functions is misleading, and will make it
more difficult to diverge from how SKBs handle frags.  What about
introducing xdp_frag_* wrappers? (what do others think?)


> 
> [0] 
> http://people.redhat.com/lbiancon/conference/NetDevConf2020-0x14/add-xdp-on-driver.html
> - XDP multi-buffers section (slide 40)

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer



Re: [PATCH v2] net: dsa: microchip: look for phy-mode in port nodes

2020-09-04 Thread Andrew Lunn
> + dev_warn(dev->dev,
> +  "Using legacy switch \"phy-mode\" missing on 
> port %d node. Please update your device tree.\n",

That message seems mangled.

> + if (!p->interface) {
> + if (dev->compat_interface) {
> + dev_warn(dev->dev,
> +  "Using legacy switch 
> \"phy-mode\" missing on port %d node. Please update your device tree.\n",
> +  i);

Same warning again.

 Andrew


Re: [PATCH v2 2/3] drivers: crypto: add support for OCTEONTX2 CPT engine

2020-09-04 Thread Herbert Xu
On Fri, Sep 04, 2020 at 01:45:38PM +, Srujana Challa wrote:
>
> This block of code is used for LMT store operations. The LMT store operation
> is specific to our platform, and this uses the "ldeor" instruction(which is
> actually an LSE atomic instruction available on v8.1 CPUs) targeting the
> IO address.
> We add it in the driver since we want LMT store to work even if LSE_ATOMICS
> is disabled.

You have exactly the same macro in your net driver.  Move it into
a header file in arch/arm64/include/asm and also add one under
include/asm-generic so we can compile-test.

Thanks,
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


[PATCH bpf-next 5/6] ice, xsk: finish napi loop if AF_XDP Rx queue is full

2020-09-04 Thread Björn Töpel
From: Björn Töpel 

Make the AF_XDP zero-copy path aware that the reason for redirect
failure was due to full Rx queue. If so, exit the napi loop as soon as
possible (exit the softirq processing), so that the userspace AF_XDP
process can hopefully empty the Rx queue. This mainly helps the "one
core scenario", where the userland process and Rx softirq processing
is on the same core.

Note that the early exit can only be performed if the "need wakeup"
feature is enabled, because otherwise there is no notification
mechanism available from the kernel side.

This requires that the driver starts using the newly introduced
xdp_do_redirect_ext() and xsk_do_redirect_rx_full() functions.

Signed-off-by: Björn Töpel 
---
 drivers/net/ethernet/intel/ice/ice_xsk.c | 23 ---
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c 
b/drivers/net/ethernet/intel/ice/ice_xsk.c
index 797886524054..f698d0199b0a 100644
--- a/drivers/net/ethernet/intel/ice/ice_xsk.c
+++ b/drivers/net/ethernet/intel/ice/ice_xsk.c
@@ -502,13 +502,15 @@ ice_construct_skb_zc(struct ice_ring *rx_ring, struct 
ice_rx_buf *rx_buf)
  * ice_run_xdp_zc - Executes an XDP program in zero-copy path
  * @rx_ring: Rx ring
  * @xdp: xdp_buff used as input to the XDP program
+ * @early_exit: true means that the napi loop should exit early
  *
  * Returns any of ICE_XDP_{PASS, CONSUMED, TX, REDIR}
  */
 static int
-ice_run_xdp_zc(struct ice_ring *rx_ring, struct xdp_buff *xdp)
+ice_run_xdp_zc(struct ice_ring *rx_ring, struct xdp_buff *xdp, bool 
*early_exit)
 {
int err, result = ICE_XDP_PASS;
+   enum bpf_map_type map_type;
struct bpf_prog *xdp_prog;
struct ice_ring *xdp_ring;
u32 act;
@@ -529,8 +531,13 @@ ice_run_xdp_zc(struct ice_ring *rx_ring, struct xdp_buff 
*xdp)
result = ice_xmit_xdp_buff(xdp, xdp_ring);
break;
case XDP_REDIRECT:
-   err = xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog);
-   result = !err ? ICE_XDP_REDIR : ICE_XDP_CONSUMED;
+   err = xdp_do_redirect_ext(rx_ring->netdev, xdp, xdp_prog, 
&map_type);
+   if (err) {
+   *early_exit = xsk_do_redirect_rx_full(err, map_type);
+   result = ICE_XDP_CONSUMED;
+   } else {
+   result = ICE_XDP_REDIR;
+   }
break;
default:
bpf_warn_invalid_xdp_action(act);
@@ -558,8 +565,8 @@ int ice_clean_rx_irq_zc(struct ice_ring *rx_ring, int 
budget)
 {
unsigned int total_rx_bytes = 0, total_rx_packets = 0;
u16 cleaned_count = ICE_DESC_UNUSED(rx_ring);
+   bool early_exit = false, failure = false;
unsigned int xdp_xmit = 0;
-   bool failure = false;
 
while (likely(total_rx_packets < (unsigned int)budget)) {
union ice_32b_rx_flex_desc *rx_desc;
@@ -597,7 +604,7 @@ int ice_clean_rx_irq_zc(struct ice_ring *rx_ring, int 
budget)
rx_buf->xdp->data_end = rx_buf->xdp->data + size;
xsk_buff_dma_sync_for_cpu(rx_buf->xdp, rx_ring->xsk_pool);
 
-   xdp_res = ice_run_xdp_zc(rx_ring, rx_buf->xdp);
+   xdp_res = ice_run_xdp_zc(rx_ring, rx_buf->xdp, &early_exit);
if (xdp_res) {
if (xdp_res & (ICE_XDP_TX | ICE_XDP_REDIR))
xdp_xmit |= xdp_res;
@@ -610,6 +617,8 @@ int ice_clean_rx_irq_zc(struct ice_ring *rx_ring, int 
budget)
cleaned_count++;
 
ice_bump_ntc(rx_ring);
+   if (early_exit)
+   break;
continue;
}
 
@@ -646,12 +655,12 @@ int ice_clean_rx_irq_zc(struct ice_ring *rx_ring, int 
budget)
ice_update_rx_ring_stats(rx_ring, total_rx_packets, total_rx_bytes);
 
if (xsk_uses_need_wakeup(rx_ring->xsk_pool)) {
-   if (failure || rx_ring->next_to_clean == rx_ring->next_to_use)
+   if (early_exit || failure || rx_ring->next_to_clean == 
rx_ring->next_to_use)
xsk_set_rx_need_wakeup(rx_ring->xsk_pool);
else
xsk_clear_rx_need_wakeup(rx_ring->xsk_pool);
 
-   return (int)total_rx_packets;
+   return early_exit ? 0 : (int)total_rx_packets;
}
 
return failure ? budget : (int)total_rx_packets;
-- 
2.25.1



[PATCH bpf-next 1/6] xsk: improve xdp_do_redirect() error codes

2020-09-04 Thread Björn Töpel
From: Björn Töpel 

The error codes returned by xdp_do_redirect() when redirecting a frame
to an AF_XDP socket have not been very useful. A driver could not
distinguish between different errors. Prior to this change the following
codes were used:

Socket not bound or incorrect queue/netdev: EINVAL
XDP frame/AF_XDP buffer size mismatch: ENOSPC
Could not allocate buffer (copy mode): ENOSPC
AF_XDP Rx buffer full: ENOSPC

After this change:

Socket not bound or incorrect queue/netdev: EINVAL
XDP frame/AF_XDP buffer size mismatch: ENOSPC
Could not allocate buffer (copy mode): ENOMEM
AF_XDP Rx buffer full: ENOBUFS

An AF_XDP zero-copy driver can now potentially determine if the
failure was due to a full Rx buffer, and if so stop processing more
frames, yielding to the userland AF_XDP application.

Signed-off-by: Björn Töpel 
---
 net/xdp/xsk.c   | 2 +-
 net/xdp/xsk_queue.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 3895697f8540..db38560c4af7 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -197,7 +197,7 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff 
*xdp, u32 len,
xsk_xdp = xsk_buff_alloc(xs->pool);
if (!xsk_xdp) {
xs->rx_dropped++;
-   return -ENOSPC;
+   return -ENOMEM;
}
 
xsk_copy_xdp(xsk_xdp, xdp, len);
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index 2d883f631c85..b76966cf122e 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -305,7 +305,7 @@ static inline int xskq_prod_reserve_desc(struct xsk_queue 
*q,
u32 idx;
 
if (xskq_prod_is_full(q))
-   return -ENOSPC;
+   return -ENOBUFS;
 
/* A, matches D */
idx = q->cached_prod++ & q->ring_mask;
-- 
2.25.1



[PATCH bpf-next 6/6] ixgbe, xsk: finish napi loop if AF_XDP Rx queue is full

2020-09-04 Thread Björn Töpel
From: Björn Töpel 

Make the AF_XDP zero-copy path aware that the reason for redirect
failure was due to full Rx queue. If so, exit the napi loop as soon as
possible (exit the softirq processing), so that the userspace AF_XDP
process can hopefully empty the Rx queue. This mainly helps the "one
core scenario", where the userland process and Rx softirq processing
is on the same core.

Note that the early exit can only be performed if the "need wakeup"
feature is enabled, because otherwise there is no notification
mechanism available from the kernel side.

This requires that the driver starts using the newly introduced
xdp_do_redirect_ext() and xsk_do_redirect_rx_full() functions.

Signed-off-by: Björn Töpel 
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c | 23 ++--
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
index 3771857cf887..a4aebfd986b3 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
@@ -93,9 +93,11 @@ int ixgbe_xsk_pool_setup(struct ixgbe_adapter *adapter,
 
 static int ixgbe_run_xdp_zc(struct ixgbe_adapter *adapter,
struct ixgbe_ring *rx_ring,
-   struct xdp_buff *xdp)
+   struct xdp_buff *xdp,
+   bool *early_exit)
 {
int err, result = IXGBE_XDP_PASS;
+   enum bpf_map_type map_type;
struct bpf_prog *xdp_prog;
struct xdp_frame *xdpf;
u32 act;
@@ -116,8 +118,13 @@ static int ixgbe_run_xdp_zc(struct ixgbe_adapter *adapter,
result = ixgbe_xmit_xdp_ring(adapter, xdpf);
break;
case XDP_REDIRECT:
-   err = xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog);
-   result = !err ? IXGBE_XDP_REDIR : IXGBE_XDP_CONSUMED;
+   err = xdp_do_redirect_ext(rx_ring->netdev, xdp, xdp_prog, 
&map_type);
+   if (err) {
+   *early_exit = xsk_do_redirect_rx_full(err, map_type);
+   result = IXGBE_XDP_CONSUMED;
+   } else {
+   result = IXGBE_XDP_REDIR;
+   }
break;
default:
bpf_warn_invalid_xdp_action(act);
@@ -235,8 +242,8 @@ int ixgbe_clean_rx_irq_zc(struct ixgbe_q_vector *q_vector,
unsigned int total_rx_bytes = 0, total_rx_packets = 0;
struct ixgbe_adapter *adapter = q_vector->adapter;
u16 cleaned_count = ixgbe_desc_unused(rx_ring);
+   bool early_exit = false, failure = false;
unsigned int xdp_res, xdp_xmit = 0;
-   bool failure = false;
struct sk_buff *skb;
 
while (likely(total_rx_packets < budget)) {
@@ -288,7 +295,7 @@ int ixgbe_clean_rx_irq_zc(struct ixgbe_q_vector *q_vector,
 
bi->xdp->data_end = bi->xdp->data + size;
xsk_buff_dma_sync_for_cpu(bi->xdp, rx_ring->xsk_pool);
-   xdp_res = ixgbe_run_xdp_zc(adapter, rx_ring, bi->xdp);
+   xdp_res = ixgbe_run_xdp_zc(adapter, rx_ring, bi->xdp, 
&early_exit);
 
if (xdp_res) {
if (xdp_res & (IXGBE_XDP_TX | IXGBE_XDP_REDIR))
@@ -302,6 +309,8 @@ int ixgbe_clean_rx_irq_zc(struct ixgbe_q_vector *q_vector,
 
cleaned_count++;
ixgbe_inc_ntc(rx_ring);
+   if (early_exit)
+   break;
continue;
}
 
@@ -346,12 +355,12 @@ int ixgbe_clean_rx_irq_zc(struct ixgbe_q_vector *q_vector,
q_vector->rx.total_bytes += total_rx_bytes;
 
if (xsk_uses_need_wakeup(rx_ring->xsk_pool)) {
-   if (failure || rx_ring->next_to_clean == rx_ring->next_to_use)
+   if (early_exit || failure || rx_ring->next_to_clean == 
rx_ring->next_to_use)
xsk_set_rx_need_wakeup(rx_ring->xsk_pool);
else
xsk_clear_rx_need_wakeup(rx_ring->xsk_pool);
 
-   return (int)total_rx_packets;
+   return early_exit ? 0 : (int)total_rx_packets;
}
return failure ? budget : (int)total_rx_packets;
 }
-- 
2.25.1



[PATCH bpf-next 2/6] xdp: introduce xdp_do_redirect_ext() function

2020-09-04 Thread Björn Töpel
From: Björn Töpel 

Introduce the xdp_do_redirect_ext() which returns additional
information to the caller. For now, it is the type of map that the
packet was redirected to.

This enables the driver to have more fine-grained control, e.g. if the
redirect fails due to a full AF_XDP Rx queue (error code ENOBUFS and map
is XSKMAP), a zero-copy enabled driver should yield to userland as
soon as possible.

Signed-off-by: Björn Töpel 
---
 include/linux/filter.h |  2 ++
 net/core/filter.c  | 16 ++--
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 995625950cc1..0060c2c8abc3 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -942,6 +942,8 @@ static inline int xdp_ok_fwd_dev(const struct net_device 
*fwd,
  */
 int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
struct xdp_buff *xdp, struct bpf_prog *prog);
+int xdp_do_redirect_ext(struct net_device *dev, struct xdp_buff *xdp,
+   struct bpf_prog *xdp_prog, enum bpf_map_type *map_type);
 int xdp_do_redirect(struct net_device *dev,
struct xdp_buff *xdp,
struct bpf_prog *prog);
diff --git a/net/core/filter.c b/net/core/filter.c
index 47eef9a0be6a..ce6098210a23 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3596,8 +3596,8 @@ void bpf_clear_redirect_map(struct bpf_map *map)
}
 }
 
-int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
-   struct bpf_prog *xdp_prog)
+int xdp_do_redirect_ext(struct net_device *dev, struct xdp_buff *xdp,
+   struct bpf_prog *xdp_prog, enum bpf_map_type *map_type)
 {
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
struct bpf_map *map = READ_ONCE(ri->map);
@@ -3609,6 +3609,8 @@ int xdp_do_redirect(struct net_device *dev, struct 
xdp_buff *xdp,
ri->tgt_value = NULL;
WRITE_ONCE(ri->map, NULL);
 
+   *map_type = BPF_MAP_TYPE_UNSPEC;
+
if (unlikely(!map)) {
fwd = dev_get_by_index_rcu(dev_net(dev), index);
if (unlikely(!fwd)) {
@@ -3618,6 +3620,7 @@ int xdp_do_redirect(struct net_device *dev, struct 
xdp_buff *xdp,
 
err = dev_xdp_enqueue(fwd, xdp, dev);
} else {
+   *map_type = map->map_type;
err = __bpf_tx_xdp_map(dev, fwd, map, xdp);
}
 
@@ -3630,6 +3633,15 @@ int xdp_do_redirect(struct net_device *dev, struct 
xdp_buff *xdp,
_trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err);
return err;
 }
+EXPORT_SYMBOL_GPL(xdp_do_redirect_ext);
+
+int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
+   struct bpf_prog *xdp_prog)
+{
+   enum bpf_map_type dummy;
+
+   return xdp_do_redirect_ext(dev, xdp, xdp_prog, &dummy);
+}
 EXPORT_SYMBOL_GPL(xdp_do_redirect);
 
 static int xdp_do_generic_redirect_map(struct net_device *dev,
-- 
2.25.1



[PATCH bpf-next 4/6] i40e, xsk: finish napi loop if AF_XDP Rx queue is full

2020-09-04 Thread Björn Töpel
From: Björn Töpel 

Make the AF_XDP zero-copy path aware that the reason for redirect
failure was due to full Rx queue. If so, exit the napi loop as soon as
possible (exit the softirq processing), so that the userspace AF_XDP
process can hopefully empty the Rx queue. This mainly helps the "one
core scenario", where the userland process and Rx softirq processing
is on the same core.

Note that the early exit can only be performed if the "need wakeup"
feature is enabled, because otherwise there is no notification
mechanism available from the kernel side.

This requires that the driver starts using the newly introduced
xdp_do_redirect_ext() and xsk_do_redirect_rx_full() functions.

Signed-off-by: Björn Töpel 
---
 drivers/net/ethernet/intel/i40e/i40e_xsk.c | 23 +++---
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c 
b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
index 2a1153d8957b..3ac803ee8d51 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
@@ -142,13 +142,15 @@ int i40e_xsk_pool_setup(struct i40e_vsi *vsi, struct 
xsk_buff_pool *pool,
  * i40e_run_xdp_zc - Executes an XDP program on an xdp_buff
  * @rx_ring: Rx ring
  * @xdp: xdp_buff used as input to the XDP program
+ * @early_exit: true means that the napi loop should exit early
  *
  * Returns any of I40E_XDP_{PASS, CONSUMED, TX, REDIR}
  **/
-static int i40e_run_xdp_zc(struct i40e_ring *rx_ring, struct xdp_buff *xdp)
+static int i40e_run_xdp_zc(struct i40e_ring *rx_ring, struct xdp_buff *xdp, 
bool *early_exit)
 {
int err, result = I40E_XDP_PASS;
struct i40e_ring *xdp_ring;
+   enum bpf_map_type map_type;
struct bpf_prog *xdp_prog;
u32 act;
 
@@ -167,8 +169,13 @@ static int i40e_run_xdp_zc(struct i40e_ring *rx_ring, 
struct xdp_buff *xdp)
result = i40e_xmit_xdp_tx_ring(xdp, xdp_ring);
break;
case XDP_REDIRECT:
-   err = xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog);
-   result = !err ? I40E_XDP_REDIR : I40E_XDP_CONSUMED;
+   err = xdp_do_redirect_ext(rx_ring->netdev, xdp, xdp_prog, 
&map_type);
+   if (err) {
+   *early_exit = xsk_do_redirect_rx_full(err, map_type);
+   result = I40E_XDP_CONSUMED;
+   } else {
+   result = I40E_XDP_REDIR;
+   }
break;
default:
bpf_warn_invalid_xdp_action(act);
@@ -268,8 +275,8 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int 
budget)
 {
unsigned int total_rx_bytes = 0, total_rx_packets = 0;
u16 cleaned_count = I40E_DESC_UNUSED(rx_ring);
+   bool early_exit = false, failure = false;
unsigned int xdp_res, xdp_xmit = 0;
-   bool failure = false;
struct sk_buff *skb;
 
while (likely(total_rx_packets < (unsigned int)budget)) {
@@ -316,7 +323,7 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int 
budget)
(*bi)->data_end = (*bi)->data + size;
xsk_buff_dma_sync_for_cpu(*bi, rx_ring->xsk_pool);
 
-   xdp_res = i40e_run_xdp_zc(rx_ring, *bi);
+   xdp_res = i40e_run_xdp_zc(rx_ring, *bi, &early_exit);
if (xdp_res) {
if (xdp_res & (I40E_XDP_TX | I40E_XDP_REDIR))
xdp_xmit |= xdp_res;
@@ -329,6 +336,8 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int 
budget)
 
cleaned_count++;
i40e_inc_ntc(rx_ring);
+   if (early_exit)
+   break;
continue;
}
 
@@ -363,12 +372,12 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int 
budget)
i40e_update_rx_stats(rx_ring, total_rx_bytes, total_rx_packets);
 
if (xsk_uses_need_wakeup(rx_ring->xsk_pool)) {
-   if (failure || rx_ring->next_to_clean == rx_ring->next_to_use)
+   if (early_exit || failure || rx_ring->next_to_clean == 
rx_ring->next_to_use)
xsk_set_rx_need_wakeup(rx_ring->xsk_pool);
else
xsk_clear_rx_need_wakeup(rx_ring->xsk_pool);
 
-   return (int)total_rx_packets;
+   return early_exit ? 0 : (int)total_rx_packets;
}
return failure ? budget : (int)total_rx_packets;
 }
-- 
2.25.1



[PATCH bpf-next 3/6] xsk: introduce xsk_do_redirect_rx_full() helper

2020-09-04 Thread Björn Töpel
From: Björn Töpel 

The xsk_do_redirect_rx_full() helper can be used to check if a failure
of xdp_do_redirect() was due to the AF_XDP socket having a full Rx ring.

Signed-off-by: Björn Töpel 
---
 include/net/xdp_sock_drv.h | 9 +
 1 file changed, 9 insertions(+)

diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h
index 5b1ee8a9976d..34c58b5fbc28 100644
--- a/include/net/xdp_sock_drv.h
+++ b/include/net/xdp_sock_drv.h
@@ -116,6 +116,11 @@ static inline void xsk_buff_raw_dma_sync_for_device(struct 
xsk_buff_pool *pool,
xp_dma_sync_for_device(pool, dma, size);
 }
 
+static inline bool xsk_do_redirect_rx_full(int err, enum bpf_map_type map_type)
+{
+   return err == -ENOBUFS && map_type == BPF_MAP_TYPE_XSKMAP;
+}
+
 #else
 
 static inline void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries)
@@ -235,6 +240,10 @@ static inline void xsk_buff_raw_dma_sync_for_device(struct 
xsk_buff_pool *pool,
 {
 }
 
+static inline bool xsk_do_redirect_rx_full(int err, enum bpf_map_type map_type)
+{
+   return false;
+}
 #endif /* CONFIG_XDP_SOCKETS */
 
 #endif /* _LINUX_XDP_SOCK_DRV_H */
-- 
2.25.1



[PATCH bpf-next 0/6] xsk: exit NAPI loop when AF_XDP Rx ring is full

2020-09-04 Thread Björn Töpel
This series addresses a problem that arises when AF_XDP zero-copy is
enabled, and the kernel softirq Rx processing and userland process is
running on the same core.

In contrast to the two-core case, when the userland process/Rx softirq
shares one core, it is very important that the kernel is not doing
unnecessary work, but instead lets the userland process run. This has
not been the case.

For the Intel drivers, when the XDP_REDIRECT fails due to a full Rx
ring, the NAPI loop will simply drop the packet and continue
processing the next packet. The XDP_REDIRECT operation will then fail
again, since userland has not been able to empty the full Rx ring.

The fix for this is letting the NAPI loop exit early, if the AF_XDP
socket Rx ring is full.

The outline is as follows: the first patch cleans up the error codes
returned by xdp_do_redirect(), so that a driver can figure out when
the Rx ring is full (ENOBUFS). Patch two adds an extended
xdp_do_redirect() variant that returns what kind of map was used
in the XDP_REDIRECT action. The third patch adds an AF_XDP driver
helper to figure out if the Rx ring was full. Finally, the last three
patches implement the "early exit" support for Intel.

On my machine the "one core scenario Rx drop" performance went from
~65Kpps to 21Mpps. In other words, from "not usable" to
"usable". YMMV.

I prefer to route this series via bpf-next, since it includes core
changes, and not only driver changes.


Have a nice weekend!
Björn

Björn Töpel (6):
  xsk: improve xdp_do_redirect() error codes
  xdp: introduce xdp_do_redirect_ext() function
  xsk: introduce xsk_do_redirect_rx_full() helper
  i40e, xsk: finish napi loop if AF_XDP Rx queue is full
  ice, xsk: finish napi loop if AF_XDP Rx queue is full
  ixgbe, xsk: finish napi loop if AF_XDP Rx queue is full

 drivers/net/ethernet/intel/i40e/i40e_xsk.c   | 23 ++--
 drivers/net/ethernet/intel/ice/ice_xsk.c | 23 ++--
 drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c | 23 ++--
 include/linux/filter.h   |  2 ++
 include/net/xdp_sock_drv.h   |  9 
 net/core/filter.c| 16 --
 net/xdp/xsk.c|  2 +-
 net/xdp/xsk_queue.h  |  2 +-
 8 files changed, 75 insertions(+), 25 deletions(-)


base-commit: 8eb629585d2231e90112148009e2a11b0979ca38
-- 
2.25.1



Re: [PATCH net-next 0/3] net: phy: Support enabling clocks prior to bus probe

2020-09-04 Thread Andrew Lunn
On Thu, Sep 03, 2020 at 09:04:11PM -0700, Florian Fainelli wrote:
> 
> 
> On 9/2/2020 9:39 PM, Florian Fainelli wrote:
> > Hi all,
> > 
> > This patch series takes care of enabling the Ethernet PHY clocks in
> > DT-based systems (we have no way to do it for ACPI, and ACPI would
> > likely keep all of this hardware enabled anyway).
> > 
> > Please test on your respective platforms, mine still seems to have
> > a race condition that I am tracking down as it looks like we are not
> > waiting long enough post clock enable.
> > 
> > The check on the clock reference count is necessary to avoid an
> > artificial bump of the clock reference count and to support the unbind
> > -> bind of the PHY driver. We could solve it in different ways.
> > 
> > Comments and test results welcome!
> 
> Andrew, while we figure out a proper way to support this with the Linux
> device driver model, would you be opposed in a single patch to
> drivers/net/mdio/mdio-bcm-unimac.c which takes care of enabling the PHY's
> clock during bus->reset just for the sake of getting those systems to work,
> and later on we move over to the pre-probe mechanism?
> 
> That would allow me to continue working with upstream kernels on these
> systems without carrying a big pile of patches.

We do have quite a need for the proper solution. I wouldn't want you
dropping the proper solution because you have a hack in place.

Please add a comment: "HORRIBLE TEMPORARY HACK", to give you
motivation to remove it again :-)

   Andrew


Re: [PATCH bpf-next 0/6] xsk: exit NAPI loop when AF_XDP Rx ring is full

2020-09-04 Thread Björn Töpel

On 2020-09-04 15:53, Björn Töpel wrote:
> This series addresses a problem that arises when AF_XDP zero-copy is
> enabled, and the kernel softirq Rx processing and userland process
> is running on the same core.


[...]




@Maxim I'm not well versed in Mellanox drivers. Would this be relevant 
to mlx5 as well?



Cheers,
Björn


Re: [PATCH] net: fec: Fix PHY init after phy_reset_after_clk_enable()

2020-09-04 Thread Andrew Lunn
On Fri, Sep 04, 2020 at 12:45:44AM +0200, Marek Vasut wrote:
> On 9/4/20 12:08 AM, Andrew Lunn wrote:
> >>> b4 am 20200903043947.3272453-1-f.faine...@gmail.com
> >>
> >> That might be a fix for the long run, but I doubt there's any chance to
> >> backport it all to stable, is there ?
> > 
> > No. For stable we need something simpler.
> 
> Like this patch ?

Yes.

But I would like to see a Tested-By: or similar from Richard
Leitner. Why does the current code work for his system? Does your
change break it?

   Andrew


Re: [PATCH net-next 0/3] net: phy: Support enabling clocks prior to bus probe

2020-09-04 Thread Adam Rudziński

W dniu 2020-09-04 o 15:45, Andrew Lunn pisze:

Just a bunch of questions.

Actually, why is it necessary to have a full MDIO bus scan already during
probing peripherals?

That is the Linux bus model. It does not matter what sort of bus it
is, PCI, USB, MDIO, etc. When the bus driver is loaded, the bus is
enumerated and drivers probe for each device found on the bus.


OK. But is it always expected to find all the devices on the bus in the 
first run? Does the bus model ever allow to just add any more devices? 
Kind of, "hotplug". :)



I'd say that it is not necessary to have a PHY getting found before it is
needed to setup the complete interface.

It is like saying, we don't need to probe the keyboard until the first
time the "Press Enter" prompt is given?


I'm not sure what you mean. It's like saying that we don't need to care 
if we even have the keyboard until we are interested in any interaction 
with it. (This might be reading a key, an autotest, ..., or not using, 
but avoiding a conflict - depends on application.)


Best regards,
Adam



RE: [PATCH v2 2/3] drivers: crypto: add support for OCTEONTX2 CPT engine

2020-09-04 Thread Srujana Challa


> Subject: Re: [PATCH v2 2/3] drivers: crypto: add support for OCTEONTX2 CPT
> engine
> 
> On Fri, Aug 07, 2020 at 07:39:19PM +0530, Srujana Challa wrote:
> >
> > +#if defined(CONFIG_ARM64)
> > +static inline long otx2_lmt_flush(void *ioreg)
> > +{
> > +   long result = 0;
> > +
> > +   __asm__ volatile(".cpu  generic+lse\n"
> > +"ldeor xzr, %0, [%1]\n"
> > +: "=r" (result)
> > +: "r" (ioreg) : "memory");
> > +
> > +   return result;
> > +}
> > +
> > +#else
> > +#define otx2_lmt_flush(addr) ({ 0; })
> > +#endif
> 
> This is not acceptable.  Please work out a way with the ARM folks
> to fix this without adding assembly code in a driver.
> 
This block of code is used for LMT store operations. The LMT store operation
is specific to our platform, and this uses the "ldeor" instruction(which is
actually an LSE atomic instruction available on v8.1 CPUs) targeting the
IO address.
We add it in the driver since we want LMT store to work even if LSE_ATOMICS
is disabled.

Thanks,
Srujana

> Thanks,
> --
> Email: Herbert Xu 
> Home Page: https://urldefense.proofpoint.com/v2/url?u=http-
> 3A__gondor.apana.org.au_-
> 7Eherbert_&d=DwIBAg&c=nKjWec2b6R0mOyPaz7xtfQ&r=Fj4OoD5hcKFpANhTW
> dwQzjT1Jpf7veC5263T47JVpnc&m=Xm4oQ3dI4peur80298SnMa5gz-
> 1rdAxVE1rwHkmHvc0&s=7S5Z2Mpq-
> th_W_KeJSQIOSo274CMg5UI0Tc9mkUkypg&e=
> PGP Key: https://urldefense.proofpoint.com/v2/url?u=http-
> 3A__gondor.apana.org.au_-
> 7Eherbert_pubkey.txt&d=DwIBAg&c=nKjWec2b6R0mOyPaz7xtfQ&r=Fj4OoD5hcK
> FpANhTWdwQzjT1Jpf7veC5263T47JVpnc&m=Xm4oQ3dI4peur80298SnMa5gz-
> 1rdAxVE1rwHkmHvc0&s=yf6R1d7GDuz4Wmq_7Z7GoPuIkewZfs0x8h6xXvf3b2o&e
> =


Re: [PATCH net v6 1/6] net: marvell: prestera: Add driver for Prestera family ASIC devices

2020-09-04 Thread Andrew Lunn
> > > +static int prestera_is_valid_mac_addr(struct prestera_port *port, u8 
> > > *addr)
> > > +{
> > > +   if (!is_valid_ether_addr(addr))
> > > +   return -EADDRNOTAVAIL;
> > > +
> > > +   if (memcmp(port->sw->base_mac, addr, ETH_ALEN - 1))
> > 
> > Why ETH_ALEN - 1?
> > 
> This is the restriction of the port mac address, it must have base mac
> address part at first 5 bytes.

You probably want to put a comment here about that.

And this is particularly user unfriendly. Is this a hardware issue? Or
firmware? Is this likely to change in the future?

  Andrew


Re: [PATCH net-next] net: dsa: bcm_sf2: Ensure that MDIO diversion is used

2020-09-04 Thread Andrew Lunn
On Thu, Sep 03, 2020 at 09:00:13PM -0700, Florian Fainelli wrote:
> 
> 
> On 9/3/2020 3:03 PM, Andrew Lunn wrote:
> > > The firmware provides the Device Tree but here is the relevant section for
> > > you pasted below. The problematic device is a particular revision of the
> > > silicon (D0) which got later fixed (E0) however the Device Tree was 
> > > created
> > > after the fixed platform, not the problematic one. Both revisions of the
> > > silicon are in production.
> > > 
> > > There should have been an internal MDIO bus created for that chip revision
> > > such that we could have correctly parented phy@0 (bcm53125 below) as child
> > > node of the internal MDIO bus, but you have to realize that this was done
> > > back in 2014 when DSA was barely revived as an active subsystem. The
> > > BCM53125 node should have been converted to an actual switch node at
> > > some point, I use a mdio_boardinfo overlay downstream to support the 
> > > switch
> > > as a proper b53/DSA switch, anyway.
> > 
> > I was expecting something like that. I think this patch needs a
> > comment in the code explaining it is a workaround for a DT blob which
> > cannot be changed. Maybe also make it conditional on the board
> > compatible string?
> 
> It is already targeted at the Broadcom pseudo PHY address (30) which is the
> one that needs diversion, I will update the patch description accordingly
> though.

O.K, looking at the patch, it was not clear to me it was already
restricted. 

Andrew


RE: [EXT] Re: [PATCH v2 2/3] drivers: crypto: add support for OCTEONTX2 CPT engine

2020-09-04 Thread Srujana Challa
> Subject: [EXT] Re: [PATCH v2 2/3] drivers: crypto: add support for OCTEONTX2
> CPT engine
> 
> External Email
> 
> --
> On Fri, Sep 04, 2020 at 01:45:38PM +, Srujana Challa wrote:
> >
> > This block of code is used for LMT store operations. The LMT store operation
> > is specific to our platform, and this uses the "ldeor" instruction(which is
> > actually an LSE atomic instruction available on v8.1 CPUs) targeting the
> > IO address.
> > We add it in the driver since we want LMT store to work even if LSE_ATOMICS
> > is disabled.
> 
> You have exactly the same macro in your net driver.  Move it into
> a header file in arch/arm64/include/asm and also add one under
> include/asm-generic so we can compile-test.
>
Since LMT store is specific to our platform, it cannot be generalized to all ARM64.

> Thanks,
> --
> Email: Herbert Xu 
> Home Page: https://urldefense.proofpoint.com/v2/url?u=http-
> 3A__gondor.apana.org.au_-
> 7Eherbert_&d=DwIBAg&c=nKjWec2b6R0mOyPaz7xtfQ&r=Fj4OoD5hcKFpANhTW
> dwQzjT1Jpf7veC5263T47JVpnc&m=awJiW_TrzVYeiwrZnqhly73OXSVuKm8XrM7oN
> Nd7Iiw&s=XZLFIH9uZTPOhsn-5jAzt7GXzT0iLbCYru55UjgkbqA&e=
> PGP Key: https://urldefense.proofpoint.com/v2/url?u=http-
> 3A__gondor.apana.org.au_-
> 7Eherbert_pubkey.txt&d=DwIBAg&c=nKjWec2b6R0mOyPaz7xtfQ&r=Fj4OoD5hcK
> FpANhTWdwQzjT1Jpf7veC5263T47JVpnc&m=awJiW_TrzVYeiwrZnqhly73OXSVuK
> m8XrM7oNNd7Iiw&s=pBxsW4mXz4mOiEMFDfIJfgC1Ngfvm2egNK0ak5oX7ms&e
> =


Re: [PATCH] net/packet: fix overflow in tpacket_rcv

2020-09-04 Thread Greg Kroah-Hartman
On Fri, Sep 04, 2020 at 03:30:52PM +0200, Stefan Nuernberger wrote:
> From: Or Cohen 
> 
> Using tp_reserve to calculate netoff can overflow as
> tp_reserve is unsigned int and netoff is unsigned short.
> 
> This may lead to macoff receiving a smaller value than
> sizeof(struct virtio_net_hdr), and if po->has_vnet_hdr
> is set, an out-of-bounds write will occur when
> calling virtio_net_hdr_from_skb.
> 
> The bug is fixed by converting netoff to unsigned int
> and checking if it exceeds USHRT_MAX.
> 
> This addresses CVE-2020-14386
> 
> Fixes: 8913336a7e8d ("packet: add PACKET_RESERVE sockopt")
> Signed-off-by: Or Cohen 
> Signed-off-by: Eric Dumazet 
> 
> [ snu: backported to 4.9, changed tp_drops counting/locking ]
> 
> Signed-off-by: Stefan Nuernberger 
> CC: David Woodhouse 
> CC: Amit Shah 
> CC: sta...@vger.kernel.org
> ---
>  net/packet/af_packet.c | 9 -
>  1 file changed, 8 insertions(+), 1 deletion(-)

What is the git commit id of this patch in Linus's tree?

thanks,

greg k-h


Re: [EXT] Re: [PATCH v2 2/3] drivers: crypto: add support for OCTEONTX2 CPT engine

2020-09-04 Thread Herbert Xu
On Fri, Sep 04, 2020 at 02:14:34PM +, Srujana Challa wrote:
>
> Since LMT store is our platform specific, it cannot be generalized to all 
> ARM64.

I'm not asking you to generalise it to all of ARM64.  I'm asking
you to move this into a header file under arch/arm64 that can then
be shared by both your crypto driver and your network driver so
you don't duplicate this everywhere.

Cheers,
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


[PATCH net-next v3 0/6] net-next: dsa: mt7530: add support for MT7531

2020-09-04 Thread Landen Chao
This patch series adds support for MT7531.

MT7531 is the next generation of MT7530 which could be found on Mediatek
router platforms such as MT7622 or MT7629.

It is also a 7-ports switch with 5 giga embedded phys, 2 cpu ports, and
the same MAC logic of MT7530. Cpu port 6 only supports SGMII interface.
Cpu port 5 supports either RGMII or SGMII in different HW SKU, but cannot
be muxed to PHY of port 0/4 like mt7530. Due to support for SGMII
interface, pll, and pad setting are different from MT7530.

MT7531 SGMII interface can be configured in following mode:
- 'SGMII AN mode' with in-band negotiation capability
which is compatible with PHY_INTERFACE_MODE_SGMII.
- 'SGMII force mode' without in-band negotiation
which is compatible with 10B/8B encoding of
PHY_INTERFACE_MODE_1000BASEX with fixed full-duplex and fixed pause.
- 2.5 times faster clocked 'SGMII force mode' without in-band negotiation
which is compatible with 10B/8B encoding of
PHY_INTERFACE_MODE_2500BASEX with fixed full-duplex and fixed pause.

v2 -> v3
- Keep the same setup logic of mt7530/mt7621 because this series of
  patches is for adding mt7531 hardware.
- Do not adjust rgmii delay when vendor phy driver presents in order to
  prevent double adjustment by suggestion of Andrew Lunn.
- Remove redundant 'Example 4' from dt-bindings by suggestion of
  Rob Herring.
- Fix typo.

v1 -> v2
- change phylink_validate callback function to support full-duplex
  gigabit only to match hardware capability.
- add description of SGMII interface.
- configure mt7531 cpu port in fastest speed by default.
- parse SGMII control word for in-band negotiation mode.
- configure RGMII delay based on phy.rst.
- Rename the definition in the header file to avoid potential conflicts.
- Add wrapper function for mdio read/write to support both C22 and C45.
- correct fixed-link speed of 2500base-x in dts.
- add MT7531 port mirror setting.

Landen Chao (6):
  net: dsa: mt7530: Refine message in Kconfig
  net: dsa: mt7530: Extend device data ready for adding a new hardware
  dt-bindings: net: dsa: add new MT7531 binding to support MT7531
  net: dsa: mt7530: Add the support of MT7531 switch
  arm64: dts: mt7622: add mt7531 dsa to mt7622-rfb1 board
  arm64: dts: mt7622: add mt7531 dsa to bananapi-bpi-r64 board

 .../devicetree/bindings/net/dsa/mt7530.txt|   10 +-
 .../dts/mediatek/mt7622-bananapi-bpi-r64.dts  |   44 +
 arch/arm64/boot/dts/mediatek/mt7622-rfb1.dts  |   57 +-
 drivers/net/dsa/Kconfig   |6 +-
 drivers/net/dsa/mt7530.c  | 1194 +++--
 drivers/net/dsa/mt7530.h  |  259 +++-
 6 files changed, 1463 insertions(+), 107 deletions(-)

-- 
2.17.1


Re: [PATCHv10 bpf-next 2/5] xdp: add a new helper for dev map multicast support

2020-09-04 Thread Daniel Borkmann

On 9/3/20 12:26 PM, Hangbin Liu wrote:
[...]

diff --git a/include/net/xdp.h b/include/net/xdp.h
index 3814fb631d52..8453d477bb22 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -132,6 +132,7 @@ void xdp_warn(const char *msg, const char *func, const int 
line);
  #define XDP_WARN(msg) xdp_warn(msg, __func__, __LINE__)
  
  struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp);

+struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf);
  
  static inline

  void xdp_convert_frame_to_buff(struct xdp_frame *frame, struct xdp_buff *xdp)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 8dda13880957..e897c4a04061 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -3576,6 +3576,27 @@ union bpf_attr {
   *the data in *dst*. This is a wrapper of copy_from_user().
   *Return
   *0 on success, or a negative error in case of failure.
+ *
+ * long bpf_redirect_map_multi(struct bpf_map *map, struct bpf_map *ex_map, 
u64 flags)
+ * Description
+ * This is a multicast implementation for XDP redirect. It will
+ * redirect the packet to ALL the interfaces in *map*, but
+ * exclude the interfaces in *ex_map*.
+ *
+ * The frowarding *map* could be either BPF_MAP_TYPE_DEVMAP or


nit: typo


+ * BPF_MAP_TYPE_DEVMAP_HASH. But the *ex_map* must be
+ * BPF_MAP_TYPE_DEVMAP_HASH to get better performance.
+ *
+ * Currently the *flags* only supports *BPF_F_EXCLUDE_INGRESS*,
+ * which additionally excludes the current ingress device.
+ *
+ * See also bpf_redirect_map() as a unicast implementation,
+ * which supports redirecting packet to a specific ifindex
+ * in the map. As both helpers use struct bpf_redirect_info
+ * to store the redirect info, we will use a a NULL tgt_value
+ * to distinguish multicast and unicast redirecting.
+ * Return
+ * **XDP_REDIRECT** on success, or **XDP_ABORTED** on error.
   */
  #define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
@@ -3727,6 +3748,7 @@ union bpf_attr {
FN(inode_storage_delete),   \
FN(d_path), \
FN(copy_from_user), \
+   FN(redirect_map_multi), \
/* */
  
  /* integer value in 'imm' field of BPF_CALL instruction selects which helper

@@ -3898,6 +3920,11 @@ enum bpf_lwt_encap_mode {
BPF_LWT_ENCAP_IP,
  };
  
+/* BPF_FUNC_redirect_map_multi flags. */

+enum {
+   BPF_F_EXCLUDE_INGRESS   = (1ULL << 0),
+};
+
  #define __bpf_md_ptr(type, name)  \
  union {   \
type name;  \
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 2b5ca93c17de..04950e96282c 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -511,6 +511,130 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct 
xdp_buff *xdp,
return __xdp_enqueue(dev, xdp, dev_rx);
  }
  
+/* Use direct call in fast path instead of map->ops->map_get_next_key() */

+static int devmap_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+
+   switch (map->map_type) {
+   case BPF_MAP_TYPE_DEVMAP:
+   return dev_map_get_next_key(map, key, next_key);
+   case BPF_MAP_TYPE_DEVMAP_HASH:
+   return dev_map_hash_get_next_key(map, key, next_key);
+   default:
+   break;
+   }
+
+   return -ENOENT;
+}
+
+bool dev_in_exclude_map(struct bpf_dtab_netdev *obj, struct bpf_map *map,
+   int exclude_ifindex)
+{
+   if (obj->dev->ifindex == exclude_ifindex)
+   return true;
+
+   if (!map)
+   return false;
+
+   return __dev_map_hash_lookup_elem(map, obj->dev->ifindex) != NULL;
+}
+
+static struct bpf_dtab_netdev *devmap_get_next_obj(struct xdp_buff *xdp, 
struct bpf_map *map,
+  struct bpf_map *ex_map, u32 
*key,
+  u32 *next_key, int 
ex_ifindex)
+{
+   struct bpf_dtab_netdev *obj;
+   struct net_device *dev;
+   u32 *tmp_key = key;
+   int err;
+
+   err = devmap_get_next_key(map, tmp_key, next_key);
+   if (err)
+   return NULL;
+
+   for (;;) {
+   switch (map->map_type) {
+   case BPF_MAP_TYPE_DEVMAP:
+   obj = __dev_map_lookup_elem(map, *next_key);
+   break;
+   case BPF_MAP_TYPE_DEVMAP_HASH:
+   obj = __dev_map_hash_lookup_elem(map, *next_key);
+   break;
+   default:
+   break;
+   }
+
+   if (!obj || dev_in_exclude_map(obj, ex_map, ex_ifindex))
+   goto find_next;
+
+   dev = obj->dev;
+
+   i

[PATCH net-next v3 1/6] net: dsa: mt7530: Refine message in Kconfig

2020-09-04 Thread Landen Chao
Refine the message in Kconfig by fixing a typo and adding explicit MT7621 support.

Signed-off-by: Landen Chao 
Signed-off-by: Sean Wang 
Reviewed-by: Florian Fainelli 
---
 drivers/net/dsa/Kconfig | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/dsa/Kconfig b/drivers/net/dsa/Kconfig
index 468b3c4273c5..06d68a848774 100644
--- a/drivers/net/dsa/Kconfig
+++ b/drivers/net/dsa/Kconfig
@@ -33,12 +33,12 @@ config NET_DSA_LANTIQ_GSWIP
  the xrx200 / VR9 SoC.
 
 config NET_DSA_MT7530
-   tristate "Mediatek MT7530 Ethernet switch support"
+   tristate "MediaTek MT7530 and MT7621 Ethernet switch support"
depends on NET_DSA
select NET_DSA_TAG_MTK
help
- This enables support for the Mediatek MT7530 Ethernet switch
- chip.
+ This enables support for the MediaTek MT7530 and MT7621 Ethernet
+ switch chip.
 
 config NET_DSA_MV88E6060
tristate "Marvell 88E6060 ethernet switch chip support"
-- 
2.17.1


[PATCH net-next v3 3/6] dt-bindings: net: dsa: add new MT7531 binding to support MT7531

2020-09-04 Thread Landen Chao
Add devicetree binding to support the compatible mt7531 switch as used
in the MediaTek MT7531 switch.

Signed-off-by: Sean Wang 
Signed-off-by: Landen Chao 
---
 Documentation/devicetree/bindings/net/dsa/mt7530.txt | 10 +++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/Documentation/devicetree/bindings/net/dsa/mt7530.txt 
b/Documentation/devicetree/bindings/net/dsa/mt7530.txt
index c5ed5d25f642..e22cd56d83fe 100644
--- a/Documentation/devicetree/bindings/net/dsa/mt7530.txt
+++ b/Documentation/devicetree/bindings/net/dsa/mt7530.txt
@@ -5,6 +5,7 @@ Required properties:
 
 - compatible: may be compatible = "mediatek,mt7530"
or compatible = "mediatek,mt7621"
+   or compatible = "mediatek,mt7531"
 - #address-cells: Must be 1.
 - #size-cells: Must be 0.
 - mediatek,mcm: Boolean; if defined, indicates that either MT7530 is the part
@@ -32,10 +33,13 @@ Required properties for the child nodes within ports 
container:
 
 - reg: Port address described must be 6 for CPU port and from 0 to 5 for
user ports.
-- phy-mode: String, must be either "trgmii" or "rgmii" for port labeled
-"cpu".
+- phy-mode: String, the following values are acceptable for port labeled "cpu"
+   If compatible mediatek,mt7530 or mediatek,mt7621 is set,
+   must be either "trgmii" or "rgmii"
+   If compatible mediatek,mt7531 is set,
+   must be either "sgmii", "1000base-x" or "2500base-x"
 
-Port 5 of the switch is muxed between:
+Port 5 of mt7530 and mt7621 switch is muxed between:
 1. GMAC5: GMAC5 can interface with another external MAC or PHY.
 2. PHY of port 0 or port 4: PHY interfaces with an external MAC like 2nd GMAC
of the SOC. Used in many setups where port 0/4 becomes the WAN port.
-- 
2.17.1


Re: [PATCH] net/packet: fix overflow in tpacket_rcv

2020-09-04 Thread Nuernberger, Stefan
On Fri, 2020-09-04 at 16:16 +0200, Greg Kroah-Hartman wrote:
> On Fri, Sep 04, 2020 at 03:30:52PM +0200, Stefan Nuernberger wrote:
> > 
> > From: Or Cohen 
> > 
> > Using tp_reserve to calculate netoff can overflow as
> > tp_reserve is unsigned int and netoff is unsigned short.
> > 
> > This may lead to macoff receiving a smaller value than
> > sizeof(struct virtio_net_hdr), and if po->has_vnet_hdr
> > is set, an out-of-bounds write will occur when
> > calling virtio_net_hdr_from_skb.
> > 
> > The bug is fixed by converting netoff to unsigned int
> > and checking if it exceeds USHRT_MAX.
> > 
> > This addresses CVE-2020-14386
> > 
> > Fixes: 8913336a7e8d ("packet: add PACKET_RESERVE sockopt")
> > Signed-off-by: Or Cohen 
> > Signed-off-by: Eric Dumazet 
> > 
> > [ snu: backported to 4.9, changed tp_drops counting/locking ]
> > 
> > Signed-off-by: Stefan Nuernberger 
> > CC: David Woodhouse 
> > CC: Amit Shah 
> > CC: sta...@vger.kernel.org
> > ---
> >  net/packet/af_packet.c | 9 -
> >  1 file changed, 8 insertions(+), 1 deletion(-)
> What is the git commit id of this patch in Linus's tree?
> 

Sorry, this isn't merged on Linus' tree yet. It's a heads up that the
backport isn't straightforward.

Best,
Stefan

> thanks,
> 
> greg k-h



Amazon Development Center Germany GmbH
Krausenstr. 38
10117 Berlin
Geschaeftsfuehrung: Christian Schlaeger, Jonathan Weiss
Eingetragen am Amtsgericht Charlottenburg unter HRB 149173 B
Sitz: Berlin
Ust-ID: DE 289 237 879




[PATCH net-next v3 6/6] arm64: dts: mt7622: add mt7531 dsa to bananapi-bpi-r64 board

2020-09-04 Thread Landen Chao
Add mt7531 dsa to bananapi-bpi-r64 board for 5 giga Ethernet ports support.

Signed-off-by: Landen Chao 
---
 .../dts/mediatek/mt7622-bananapi-bpi-r64.dts  | 44 +++
 1 file changed, 44 insertions(+)

diff --git a/arch/arm64/boot/dts/mediatek/mt7622-bananapi-bpi-r64.dts 
b/arch/arm64/boot/dts/mediatek/mt7622-bananapi-bpi-r64.dts
index d174ad214857..c57b2571165f 100644
--- a/arch/arm64/boot/dts/mediatek/mt7622-bananapi-bpi-r64.dts
+++ b/arch/arm64/boot/dts/mediatek/mt7622-bananapi-bpi-r64.dts
@@ -143,6 +143,50 @@
mdio: mdio-bus {
#address-cells = <1>;
#size-cells = <0>;
+
+   switch@0 {
+   compatible = "mediatek,mt7531";
+   reg = <0>;
+   reset-gpios = <&pio 54 0>;
+
+   ports {
+   #address-cells = <1>;
+   #size-cells = <0>;
+
+   port@0 {
+   reg = <0>;
+   label = "wan";
+   };
+
+   port@1 {
+   reg = <1>;
+   label = "lan0";
+   };
+
+   port@2 {
+   reg = <2>;
+   label = "lan1";
+   };
+
+   port@3 {
+   reg = <3>;
+   label = "lan2";
+   };
+
+   port@4 {
+   reg = <4>;
+   label = "lan3";
+   };
+
+   port@6 {
+   reg = <6>;
+   label = "cpu";
+   ethernet = <&gmac0>;
+   phy-mode = "2500base-x";
+   };
+   };
+   };
+
};
 };
 
-- 
2.17.1


[PATCH net-next v3 4/6] net: dsa: mt7530: Add the support of MT7531 switch

2020-09-04 Thread Landen Chao
Add new support for MT7531:

MT7531 is the next generation of MT7530. It is also a 7-port switch with
5 embedded gigabit PHYs, 2 CPU ports, and the same MAC logic as MT7530. CPU
port 6 only supports the SGMII interface. CPU port 5 supports either RGMII
or SGMII depending on the HW SKU, but cannot be muxed to the PHY of port 0/4
as on MT7530. Because of the SGMII interface support, the PLL and pad settings
differ from MT7530. This patch adds the different initial settings and the
SGMII phylink handlers for MT7531.

The MT7531 SGMII interface can be configured in the following modes:
- 'SGMII AN mode' with in-band negotiation capability
which is compatible with PHY_INTERFACE_MODE_SGMII.
- 'SGMII force mode' without in-band negotiation
which is compatible with 10B/8B encoding of
PHY_INTERFACE_MODE_1000BASEX with fixed full-duplex and fixed pause.
- 2.5 times faster clocked 'SGMII force mode' without in-band negotiation
which is compatible with 10B/8B encoding of
PHY_INTERFACE_MODE_2500BASEX with fixed full-duplex and fixed pause.

Signed-off-by: Landen Chao 
Signed-off-by: Sean Wang 
---
 drivers/net/dsa/Kconfig  |   6 +-
 drivers/net/dsa/mt7530.c | 918 ++-
 drivers/net/dsa/mt7530.h | 222 ++
 3 files changed, 1119 insertions(+), 27 deletions(-)

diff --git a/drivers/net/dsa/Kconfig b/drivers/net/dsa/Kconfig
index 06d68a848774..2451f61a38e4 100644
--- a/drivers/net/dsa/Kconfig
+++ b/drivers/net/dsa/Kconfig
@@ -33,12 +33,12 @@ config NET_DSA_LANTIQ_GSWIP
  the xrx200 / VR9 SoC.
 
 config NET_DSA_MT7530
-   tristate "MediaTek MT7530 and MT7621 Ethernet switch support"
+   tristate "MediaTek MT753x and MT7621 Ethernet switch support"
depends on NET_DSA
select NET_DSA_TAG_MTK
help
- This enables support for the MediaTek MT7530 and MT7621 Ethernet
- switch chip.
+ This enables support for the MediaTek MT7530, MT7531, and MT7621
+ Ethernet switch chips.
 
 config NET_DSA_MV88E6060
tristate "Marvell 88E6060 ethernet switch chip support"
diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c
index 15c934f10ddd..53b713c9efb0 100644
--- a/drivers/net/dsa/mt7530.c
+++ b/drivers/net/dsa/mt7530.c
@@ -233,6 +233,12 @@ mt7530_write(struct mt7530_priv *priv, u32 reg, u32 val)
mutex_unlock(&bus->mdio_lock);
 }
 
+static u32
+_mt7530_unlocked_read(struct mt7530_dummy_poll *p)
+{
+   return mt7530_mii_read(p->priv, p->reg);
+}
+
 static u32
 _mt7530_read(struct mt7530_dummy_poll *p)
 {
@@ -483,6 +489,108 @@ mt7530_pad_clk_setup(struct dsa_switch *ds, phy_interface_t interface)
return 0;
 }
 
+static bool mt7531_dual_sgmii_supported(struct mt7530_priv *priv)
+{
+   u32 val;
+
+   val = mt7530_read(priv, MT7531_TOP_SIG_SR);
+
+   return (val & PAD_DUAL_SGMII_EN) != 0;
+}
+
+static int
+mt7531_pad_setup(struct dsa_switch *ds, phy_interface_t interface)
+{
+   struct mt7530_priv *priv = ds->priv;
+   u32 val;
+   u32 top_sig;
+   u32 hwstrap;
+   u32 xtal;
+
+   if (mt7531_dual_sgmii_supported(priv))
+   return 0;
+
+   val = mt7530_read(priv, MT7531_CREV);
+   top_sig = mt7530_read(priv, MT7531_TOP_SIG_SR);
+   hwstrap = mt7530_read(priv, MT7531_HWTRAP);
+   if ((val & CHIP_REV_M) > 0)
+   xtal = (top_sig & PAD_MCM_SMI_EN) ? HWTRAP_XTAL_FSEL_40MHZ :
+   HWTRAP_XTAL_FSEL_25MHZ;
+   else
+   xtal = hwstrap & HWTRAP_XTAL_FSEL_MASK;
+
+   /* Step 1 : Disable MT7531 COREPLL */
+   val = mt7530_read(priv, MT7531_PLLGP_EN);
+   val &= ~EN_COREPLL;
+   mt7530_write(priv, MT7531_PLLGP_EN, val);
+
+   /* Step 2: switch to XTAL output */
+   val = mt7530_read(priv, MT7531_PLLGP_EN);
+   val |= SW_CLKSW;
+   mt7530_write(priv, MT7531_PLLGP_EN, val);
+
+   val = mt7530_read(priv, MT7531_PLLGP_CR0);
+   val &= ~RG_COREPLL_EN;
+   mt7530_write(priv, MT7531_PLLGP_CR0, val);
+
+   /* Step 3: disable PLLGP and enable program PLLGP */
+   val = mt7530_read(priv, MT7531_PLLGP_EN);
+   val |= SW_PLLGP;
+   mt7530_write(priv, MT7531_PLLGP_EN, val);
+
+   /* Step 4: program COREPLL output frequency to 500MHz */
+   val = mt7530_read(priv, MT7531_PLLGP_CR0);
+   val &= ~RG_COREPLL_POSDIV_M;
+   val |= 2 << RG_COREPLL_POSDIV_S;
+   mt7530_write(priv, MT7531_PLLGP_CR0, val);
+   usleep_range(25, 35);
+
+   switch (xtal) {
+   case HWTRAP_XTAL_FSEL_25MHZ:
+   val = mt7530_read(priv, MT7531_PLLGP_CR0);
+   val &= ~RG_COREPLL_SDM_PCW_M;
+   val |= 0x14 << RG_COREPLL_SDM_PCW_S;
+   mt7530_write(priv, MT7531_PLLGP_CR0, val);
+   break;
+   case HWTRAP_XTAL_FSEL_40MHZ:
+   val = mt7530_read(priv, MT7531_PLLGP_CR0);
+   val &= ~RG_COREPLL_SDM_PCW_M;
+   val |= 0x19 << RG_COREPLL_SDM_PCW_S;
+ 

[PATCH net-next v3 2/6] net: dsa: mt7530: Extend device data ready for adding a new hardware

2020-09-04 Thread Landen Chao
Add a structure holding the operations required for each device, such as
device initialization, PHY port read or write, a check of whether a PHY
interface is supported on a given port, and MAC port setup for either the
bus pad or a specific PHY interface.

This patch prepares for adding the new MT7531 hardware while keeping the
setup logic of the existing hardware unchanged.

Signed-off-by: Landen Chao 
Signed-off-by: Sean Wang 
---
 drivers/net/dsa/mt7530.c | 272 +--
 drivers/net/dsa/mt7530.h |  37 +-
 2 files changed, 240 insertions(+), 69 deletions(-)

diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c
index 238417db26f9..15c934f10ddd 100644
--- a/drivers/net/dsa/mt7530.c
+++ b/drivers/net/dsa/mt7530.c
@@ -372,8 +372,9 @@ mt7530_fdb_write(struct mt7530_priv *priv, u16 vid,
mt7530_write(priv, MT7530_ATA1 + (i * 4), reg[i]);
 }
 
+/* Setup TX circuit including relevant PAD and driving */
 static int
-mt7530_pad_clk_setup(struct dsa_switch *ds, int mode)
+mt7530_pad_clk_setup(struct dsa_switch *ds, phy_interface_t interface)
 {
struct mt7530_priv *priv = ds->priv;
u32 ncpo1, ssc_delta, trgint, i, xtal;
@@ -387,7 +388,7 @@ mt7530_pad_clk_setup(struct dsa_switch *ds, int mode)
return -EINVAL;
}
 
-   switch (mode) {
+   switch (interface) {
case PHY_INTERFACE_MODE_RGMII:
trgint = 0;
/* PLL frequency: 125MHz */
@@ -409,7 +410,8 @@ mt7530_pad_clk_setup(struct dsa_switch *ds, int mode)
}
break;
default:
-   dev_err(priv->dev, "xMII mode %d not supported\n", mode);
+   dev_err(priv->dev, "xMII interface %d not supported\n",
+   interface);
return -EINVAL;
}
 
@@ -1349,12 +1351,11 @@ mt7530_setup(struct dsa_switch *ds)
return 0;
 }
 
-static void mt7530_phylink_mac_config(struct dsa_switch *ds, int port,
- unsigned int mode,
- const struct phylink_link_state *state)
+static bool
+mt7530_phy_mode_supported(struct dsa_switch *ds, int port,
+ const struct phylink_link_state *state)
 {
struct mt7530_priv *priv = ds->priv;
-   u32 mcr_cur, mcr_new;
 
switch (port) {
case 0: /* Internal phy */
@@ -1363,33 +1364,114 @@ static void mt7530_phylink_mac_config(struct dsa_switch *ds, int port,
case 3:
case 4:
if (state->interface != PHY_INTERFACE_MODE_GMII)
-   return;
+   goto unsupported;
break;
case 5: /* 2nd cpu port with phy of port 0 or 4 / external phy */
-   if (priv->p5_interface == state->interface)
-   break;
if (!phy_interface_mode_is_rgmii(state->interface) &&
state->interface != PHY_INTERFACE_MODE_MII &&
state->interface != PHY_INTERFACE_MODE_GMII)
-   return;
+   goto unsupported;
+   break;
+   case 6: /* 1st cpu port */
+   if (state->interface != PHY_INTERFACE_MODE_RGMII &&
+   state->interface != PHY_INTERFACE_MODE_TRGMII)
+   goto unsupported;
+   break;
+   default:
+   dev_err(priv->dev, "%s: unsupported port: %i\n", __func__,
+   port);
+   goto unsupported;
+   }
+
+   return true;
+
+unsupported:
+   return false;
+}
+
+static bool
+mt753x_phy_mode_supported(struct dsa_switch *ds, int port,
+ const struct phylink_link_state *state)
+{
+   struct mt7530_priv *priv = ds->priv;
+
+   return priv->info->phy_mode_supported(ds, port, state);
+}
+
+static int
+mt753x_pad_setup(struct dsa_switch *ds, const struct phylink_link_state *state)
+{
+   struct mt7530_priv *priv = ds->priv;
+
+   return priv->info->pad_setup(ds, state->interface);
+}
+
+static int
+mt7530_mac_config(struct dsa_switch *ds, int port, unsigned int mode,
+ phy_interface_t interface)
+{
+   struct mt7530_priv *priv = ds->priv;
+
+   /* Only need to setup port5. */
+   if (port != 5)
+   return 0;
+
+   mt7530_setup_port5(priv->ds, interface);
+
+   return 0;
+}
+
+static int
+mt753x_mac_config(struct dsa_switch *ds, int port, unsigned int mode,
+ const struct phylink_link_state *state)
+{
+   struct mt7530_priv *priv = ds->priv;
+
+   return priv->info->mac_port_config(ds, port, mode, state->interface);
+}
+
+static void
+mt753x_phylink_mac_config(struct dsa_switch *ds, int port, unsigned int mode,
+ const struct phylink_link_state *state)
+{
+   struct mt7530_priv *priv = ds->priv;
+   u32 mcr_cur, mcr_new;
+
+   if (!mt753x_phy_mode_supported(ds, port, state))
+ 

  1   2   3   >