Re: [Intel-wired-lan] [PATCH iwl-next v8 11/11] igc: add support to get frame preemption statistics via ethtool
On Wed, Mar 05, 2025 at 08:00:26AM -0500, Faizal Rahim wrote:
> +/* Received out of order packets with SMD-C */
> +#define IGC_PRMEXCPRCNT_OOO_SMDC			0x000000FF
> +/* Received out of order packets with SMD-C and wrong Frame CNT */
> +#define IGC_PRMEXCPRCNT_OOO_FRAME_CNT			0x0000FF00
> +/* Received out of order packets with SMD-C and wrong Frag CNT */
> +#define IGC_PRMEXCPRCNT_OOO_FRAG_CNT			0x00FF0000
> +/* Received packets with SMD-S and wrong Frag CNT and Frame CNT */
> +#define IGC_PRMEXCPRCNT_MISS_FRAME_FRAG_CNT		0xFF000000
>
> +/**
> + * igc_ethtool_get_frame_ass_error - Get the frame assembly error count.
> + * @reg_value: Register value for IGC_PRMEXCPRCNT
> + * Return: The count of frame assembly errors.
> + */
> +static u64 igc_ethtool_get_frame_ass_error(u32 reg_value)
> +{
> +	u32 ooo_frame_cnt, ooo_frag_cnt; /* Out of order statistics */
> +	u32 miss_frame_frag_cnt;
> +
> +	ooo_frame_cnt = FIELD_GET(IGC_PRMEXCPRCNT_OOO_FRAME_CNT, reg_value);
> +	ooo_frag_cnt = FIELD_GET(IGC_PRMEXCPRCNT_OOO_FRAG_CNT, reg_value);
> +	miss_frame_frag_cnt = FIELD_GET(IGC_PRMEXCPRCNT_MISS_FRAME_FRAG_CNT,
> +					reg_value);
> +
> +	return ooo_frame_cnt + ooo_frag_cnt + miss_frame_frag_cnt;
> +}

These counters are quite small (8 bits each). What is their behavior once they reach 255? Saturate? Truncate? Do they clear on read?
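To make the reviewer's "8 bits each" point concrete, here is a minimal userspace sketch of what the quoted igc_ethtool_get_frame_ass_error() computes. It assumes the IGC_PRMEXCPRCNT register packs four 8-bit counters, one per byte lane, and `field_get()` is a simplified stand-in for the kernel's FIELD_GET() macro, not the real implementation:

```c
#include <assert.h>
#include <stdint.h>

/* Simplified stand-in for the kernel's FIELD_GET(): extract the field
 * selected by a contiguous mask by shifting out the mask's trailing
 * zero bits.  The mask must be nonzero. */
static uint32_t field_get(uint32_t mask, uint32_t reg)
{
    unsigned int shift = 0;

    while (!((mask >> shift) & 1))
        shift++;
    return (reg & mask) >> shift;
}

/* Sum the three assembly-error counters, assuming the byte-lane layout
 * of the IGC_PRMEXCPRCNT masks quoted above.  Each field is only 8 bits
 * wide, so each contributes at most 255 -- the reviewer's question is
 * what the hardware does past that point. */
static uint64_t frame_ass_error(uint32_t reg)
{
    uint32_t ooo_frame_cnt = field_get(0x0000FF00, reg);
    uint32_t ooo_frag_cnt  = field_get(0x00FF0000, reg);
    uint32_t miss_cnt      = field_get(0xFF000000, reg);

    return (uint64_t)ooo_frame_cnt + ooo_frag_cnt + miss_cnt;
}
```

With this layout the sum can never exceed 3 * 255 = 765 per register read, so if the hardware counters saturate rather than clear on read, the accumulated ethtool statistic undercounts.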
Re: [Intel-wired-lan] [PATCH iwl-next v8 08/11] igc: add support to set tx-min-frag-size
On Wed, Mar 05, 2025 at 08:00:23AM -0500, Faizal Rahim wrote:
> Add support to set tx-min-frag-size via set_mm callback in igc.
> Increase the max limit of tx-min-frag-size in ethtool from 252 to 256
> since i225/6 value range is 64, 128, 192 and 256.
>
> Co-developed-by: Vinicius Costa Gomes
> Signed-off-by: Vinicius Costa Gomes
> Signed-off-by: Faizal Rahim
> ---
>  drivers/net/ethernet/intel/igc/igc.h         |  1 +
>  drivers/net/ethernet/intel/igc/igc_defines.h |  1 +
>  drivers/net/ethernet/intel/igc/igc_ethtool.c |  5 +++
>  drivers/net/ethernet/intel/igc/igc_tsn.c     | 37 ++--
>  drivers/net/ethernet/intel/igc/igc_tsn.h     |  2 +-
>  net/ethtool/mm.c                             |  2 +-
>  6 files changed, 43 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/net/ethernet/intel/igc/igc.h b/drivers/net/ethernet/intel/igc/igc.h
> index d9ecb7cf80c9..4dfd133b4d6f 100644
> --- a/drivers/net/ethernet/intel/igc/igc.h
> +++ b/drivers/net/ethernet/intel/igc/igc.h
> @@ -42,6 +42,7 @@ void igc_ethtool_set_ops(struct net_device *);
>
>  struct igc_fpe_t {
>  	struct ethtool_mmsv mmsv;
> +	u32 tx_min_frag_size;
>  };
>
>  enum igc_mac_filter_type {
> diff --git a/drivers/net/ethernet/intel/igc/igc_defines.h b/drivers/net/ethernet/intel/igc/igc_defines.h
> index 22db1de02964..038ee89f1e08 100644
> --- a/drivers/net/ethernet/intel/igc/igc_defines.h
> +++ b/drivers/net/ethernet/intel/igc/igc_defines.h
> @@ -551,6 +551,7 @@
>  #define IGC_TQAVCTRL_PREEMPT_ENA	0x0002
>  #define IGC_TQAVCTRL_ENHANCED_QAV	0x0008
>  #define IGC_TQAVCTRL_FUTSCDDIS		0x0080
> +#define IGC_TQAVCTRL_MIN_FRAG_MASK	0xC000
>
>  #define IGC_TXQCTL_QUEUE_MODE_LAUNCHT	0x0001
>  #define IGC_TXQCTL_STRICT_CYCLE		0x0002
> diff --git a/drivers/net/ethernet/intel/igc/igc_ethtool.c b/drivers/net/ethernet/intel/igc/igc_ethtool.c
> index b64d5c6c1d20..529654ccd83f 100644
> --- a/drivers/net/ethernet/intel/igc/igc_ethtool.c
> +++ b/drivers/net/ethernet/intel/igc/igc_ethtool.c
> @@ -1789,6 +1789,11 @@ static int igc_ethtool_set_mm(struct net_device *netdev,
>  	struct igc_adapter *adapter = netdev_priv(netdev);
>  	struct igc_fpe_t *fpe = &adapter->fpe;
>
> +	fpe->tx_min_frag_size = igc_fpe_get_supported_frag_size(cmd->tx_min_frag_size);
> +	if (fpe->tx_min_frag_size != cmd->tx_min_frag_size)
> +		NL_SET_ERR_MSG_MOD(extack,
> +				   "tx-min-frag-size value set is unsupported. Rounded up to supported value (64, 128, 192, 256)");
> +
>  	if (fpe->mmsv.pmac_enabled != cmd->pmac_enabled) {
>  		if (cmd->pmac_enabled)
>  			static_branch_inc(&igc_fpe_enabled);
> diff --git a/drivers/net/ethernet/intel/igc/igc_tsn.c b/drivers/net/ethernet/intel/igc/igc_tsn.c
> index 0a2c747fde2d..2ec5909bf8b0 100644
> --- a/drivers/net/ethernet/intel/igc/igc_tsn.c
> +++ b/drivers/net/ethernet/intel/igc/igc_tsn.c
> @@ -6,6 +6,12 @@
>  #include "igc_hw.h"
>  #include "igc_tsn.h"
>
> +#define MIN_MULTPLIER_TX_MIN_FRAG	0
> +#define MAX_MULTPLIER_TX_MIN_FRAG	3
> +/* Frag size is based on the Section 8.12.2 of the SW User Manual */
> +#define TX_MIN_FRAG_SIZE		64
> +#define TX_MAX_FRAG_SIZE	(TX_MIN_FRAG_SIZE * (MAX_MULTPLIER_TX_MIN_FRAG + 1))
> +
>  DEFINE_STATIC_KEY_FALSE(igc_fpe_enabled);
>
>  static int igc_fpe_init_smd_frame(struct igc_ring *ring,
> @@ -128,6 +134,7 @@ static const struct ethtool_mmsv_ops igc_mmsv_ops = {
>
>  void igc_fpe_init(struct igc_adapter *adapter)
>  {
> +	adapter->fpe.tx_min_frag_size = TX_MIN_FRAG_SIZE;
>  	ethtool_mmsv_init(&adapter->fpe.mmsv, adapter->netdev, &igc_mmsv_ops);
>  }
>
> @@ -278,7 +285,7 @@ static int igc_tsn_disable_offload(struct igc_adapter *adapter)
>  	tqavctrl = rd32(IGC_TQAVCTRL);
>  	tqavctrl &= ~(IGC_TQAVCTRL_TRANSMIT_MODE_TSN |
>  		      IGC_TQAVCTRL_ENHANCED_QAV | IGC_TQAVCTRL_FUTSCDDIS |
> -		      IGC_TQAVCTRL_PREEMPT_ENA);
> +		      IGC_TQAVCTRL_PREEMPT_ENA | IGC_TQAVCTRL_MIN_FRAG_MASK);
>
>  	wr32(IGC_TQAVCTRL, tqavctrl);
>
> @@ -324,12 +331,34 @@ static void igc_tsn_set_retx_qbvfullthreshold(struct igc_adapter *adapter)
>  	wr32(IGC_RETX_CTL, retxctl);
>  }
>
> +static u8 igc_fpe_get_frag_size_mult(const struct igc_fpe_t *fpe)
> +{
> +	u8 mult = (fpe->tx_min_frag_size / TX_MIN_FRAG_SIZE) - 1;
> +
> +	return clamp_t(u8, mult, MIN_MULTPLIER_TX_MIN_FRAG,
> +		       MAX_MULTPLIER_TX_MIN_FRAG);
> +}
> +
> +u32 igc_fpe_get_supported_frag_size(u32 frag_size)
> +{
> +	const u32 supported_sizes[] = {64, 128, 192, 256};
> +
> +	/* Find the smallest supported size that is >= frag_size */
> +	for (int i = 0; i < ARRAY_SIZE(supported_sizes); i++) {
> +		if (frag_size <= supported_sizes[i])
> +			return supported_sizes[i];
> +	}
> +
> +	return TX_MAX_FRAG_SIZE;
> +}
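The rounding and multiplier logic in the quoted patch can be exercised standalone. This is a userspace sketch mirroring igc_fpe_get_supported_frag_size() and igc_fpe_get_frag_size_mult() (the 64/128/192/256 steps come from the patch; the helper names and the open-coded clamp are illustrative, not driver code):

```c
#include <assert.h>
#include <stdint.h>

#define TX_MIN_FRAG_SIZE 64
#define TX_MAX_FRAG_SIZE 256

/* Round an arbitrary request up to the nearest size the i225/i226
 * hardware supports (64, 128, 192, 256); anything above 256 is
 * capped at the maximum. */
static uint32_t fpe_supported_frag_size(uint32_t frag_size)
{
    static const uint32_t supported[] = { 64, 128, 192, 256 };
    unsigned int i;

    for (i = 0; i < sizeof(supported) / sizeof(supported[0]); i++)
        if (frag_size <= supported[i])
            return supported[i];
    return TX_MAX_FRAG_SIZE;
}

/* Two-bit multiplier derived from an already-rounded size:
 * (size / 64) - 1, clamped to the 0..3 range of the register field. */
static uint8_t frag_size_mult(uint32_t supported_size)
{
    uint8_t mult = supported_size / TX_MIN_FRAG_SIZE - 1;

    return mult > 3 ? 3 : mult;
}
```

Because set_mm stores the rounded value in fpe->tx_min_frag_size before the multiplier is computed, the clamp in the second helper only matters as a safety net.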
[Intel-wired-lan] [PATCH] i40e: fix MMIO write access to an invalid page in i40e_clear_hw
In i40e_clear_hw(), when the read of the I40E_GLPCI_CNF2 register returns 0, an integer underflow in the num_{pf,vf}_int variables can occur, leading to MMIO write access to an invalid page. To fix this, change the type of the unsigned integer variables num_{pf,vf}_int to signed integers. Additionally, in the for-loops where the integer underflow occurs, change the type of the loop variable i to a signed integer.

Signed-off-by: Kyungwook Boo
Signed-off-by: Loktionov, Aleksandr
Signed-off-by: Przemek Kitszel
Link: https://lore.kernel.org/lkml/ffc91764-1142-4ba2-91b6-8c773f6f7...@gmail.com/T/
---
 drivers/net/ethernet/intel/i40e/i40e_common.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_common.c b/drivers/net/ethernet/intel/i40e/i40e_common.c
index 370b4bddee44..9a73cb94dc5e 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_common.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_common.c
@@ -817,8 +817,8 @@ int i40e_pf_reset(struct i40e_hw *hw)
 void i40e_clear_hw(struct i40e_hw *hw)
 {
 	u32 num_queues, base_queue;
-	u32 num_pf_int;
-	u32 num_vf_int;
+	s32 num_pf_int;
+	s32 num_vf_int;
 	u32 num_vfs;
 	u32 i, j;
 	u32 val;
@@ -848,18 +848,18 @@ void i40e_clear_hw(struct i40e_hw *hw)
 	/* stop all the interrupts */
 	wr32(hw, I40E_PFINT_ICR0_ENA, 0);
 	val = 0x3 << I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT;
-	for (i = 0; i < num_pf_int - 2; i++)
+	for (s32 i = 0; i < num_pf_int - 2; i++)
 		wr32(hw, I40E_PFINT_DYN_CTLN(i), val);
 
 	/* Set the FIRSTQ_INDX field to 0x7FF in PFINT_LNKLSTx */
 	val = eol << I40E_PFINT_LNKLST0_FIRSTQ_INDX_SHIFT;
 	wr32(hw, I40E_PFINT_LNKLST0, val);
-	for (i = 0; i < num_pf_int - 2; i++)
+	for (s32 i = 0; i < num_pf_int - 2; i++)
 		wr32(hw, I40E_PFINT_LNKLSTN(i), val);
 	val = eol << I40E_VPINT_LNKLST0_FIRSTQ_INDX_SHIFT;
 	for (i = 0; i < num_vfs; i++)
 		wr32(hw, I40E_VPINT_LNKLST0(i), val);
-	for (i = 0; i < num_vf_int - 2; i++)
+	for (s32 i = 0; i < num_vf_int - 2; i++)
 		wr32(hw, I40E_VPINT_LNKLSTN(i), val);
 
 	/* warn the HW of the coming Tx disables */
-- 
2.25.1
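The underflow this patch fixes is easy to reproduce in plain C. The sketch below (helper names are illustrative, not driver code) contrasts the unsigned loop bound, where `0u - 2` wraps to 0xFFFFFFFE and the loop would issue roughly four billion MMIO writes, with the signed bound from the fix, where the loop body never runs:

```c
#include <assert.h>
#include <stdint.h>

/* Model of the buggy loop: with a u32 count of 0, the bound
 * num_int - 2 wraps to 0xFFFFFFFE.  The demo caps the iteration
 * count so the wrapped loop doesn't spin for billions of rounds. */
static uint64_t writes_unsigned(uint32_t num_int)
{
    uint64_t n = 0;
    uint32_t i;

    for (i = 0; i < num_int - 2; i++) {
        n++;               /* stands in for wr32(hw, REG(i), val) */
        if (n > 10)        /* cap for the demo only */
            break;
    }
    return n;
}

/* Model of the fixed loop: with s32, 0 - 2 is -2 and the
 * comparison 0 < -2 is false, so nothing is written. */
static uint64_t writes_signed(int32_t num_int)
{
    uint64_t n = 0;
    int32_t i;

    for (i = 0; i < num_int - 2; i++)
        n++;
    return n;
}
```

Both versions behave identically for sane counts (e.g. 5 interrupts yields 3 writes); they only diverge when the register reads back as 0 or 1.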
Re: [Intel-wired-lan] [PATCH] i40e: Disable i40e PCIe AER on system reboot
On 12/26/2024 7:54 PM, Yue Zhao wrote: Disable PCIe AER on the i40e device on system reboot on a limited list of Dell PowerEdge systems. This prevents a fatal PCIe AER event on the i40e device during the ACPI _PTS (prepare to sleep) method for S5 on those systems. The _PTS is invoked by acpi_enter_sleep_state_prep() as part of the kernel's reboot sequence as a result of commit 38f34dba806a ("PM: ACPI: reboot: Reinstate S5 for reboot"). Hi Yue, We've contacted Dell to try to root cause the issue and find the proper fix. It would help if we could provide more information about the problem and circumstances. Have you reported the issue to Dell? If so, could you provide that to me (here or privately) so that we can pass that along to help the investigation? Thank you, Tony We first noticed this abnormal reboot issue in tg3 device, and there is a similar patch about disable PCIe AER to fix hardware error during reboot. The hardware error in tg3 device has gone after we apply this patch below. https://lore.kernel.org/lkml/20241129203640.54492-1-lszub...@redhat.com/T/ So we try to disable PCIe AER on the i40e device in the similar way. 
hardware crash dmesg log: ACPI: PM: Preparing to enter system sleep state S5 {1}[Hardware Error]: Hardware error from APEI Generic Hardware Error Source: 5 {1}[Hardware Error]: event severity: fatal {1}[Hardware Error]: Error 0, type: fatal {1}[Hardware Error]: section_type: PCIe error {1}[Hardware Error]: port_type: 0, PCIe end point {1}[Hardware Error]: version: 3.0 {1}[Hardware Error]: command: 0x0006, status: 0x0010 {1}[Hardware Error]: device_id: :05:00.1 {1}[Hardware Error]: slot: 0 {1}[Hardware Error]: secondary_bus: 0x00 {1}[Hardware Error]: vendor_id: 0x8086, device_id: 0x1572 {1}[Hardware Error]: class_code: 02 {1}[Hardware Error]: aer_uncor_status: 0x0010, aer_uncor_mask: 0x00018000 {1}[Hardware Error]: aer_uncor_severity: 0x000ef030 {1}[Hardware Error]: TLP Header: 4001 000f 90028090 Kernel panic - not syncing: Fatal hardware error! Hardware name: Dell Inc. PowerEdge C4140/08Y2GR, BIOS 2.21.1 12/12/2023 Call Trace: dump_stack_lvl+0x48/0x70 dump_stack+0x10/0x20 panic+0x1b4/0x3a0 __ghes_panic+0x6c/0x70 ghes_in_nmi_queue_one_entry.constprop.0+0x1ee/0x2c0 ghes_notify_nmi+0x5e/0xe0 nmi_handle+0x62/0x160 default_do_nmi+0x4c/0x150 exc_nmi+0x140/0x1f0 end_repeat_nmi+0x16/0x67 RIP: 0010:intel_idle_irq+0x70/0xf0 cpuidle_enter_state+0x91/0x6f0 cpuidle_enter+0x2e/0x50 call_cpuidle+0x23/0x60 cpuidle_idle_call+0x11d/0x190 do_idle+0x82/0xf0 cpu_startup_entry+0x2a/0x30 rest_init+0xc2/0xf0 arch_call_rest_init+0xe/0x30 start_kernel+0x34f/0x440 x86_64_start_reservations+0x18/0x30 x86_64_start_kernel+0xbf/0x110 secondary_startup_64_no_verify+0x18f/0x19b Fixes: 38f34dba806a ("PM: ACPI: reboot: Reinstate S5 for reboot") Signed-off-by: Yue Zhao --- drivers/net/ethernet/intel/i40e/i40e_main.c | 64 + 1 file changed, 64 insertions(+) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 0e1d9e2fbf38..80e66e4e90f7 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ 
-8,6 +8,7 @@ #include #include #include +#include /* Local includes */ #include "i40e.h" @@ -16608,6 +16609,56 @@ static void i40e_pci_error_resume(struct pci_dev *pdev) i40e_io_resume(pf); } +/* Systems where ACPI _PTS (Prepare To Sleep) S5 will result in a fatal + * PCIe AER event on the i40e device if the i40e device is not, or cannot + * be, powered down. + */ +static const struct dmi_system_id i40e_restart_aer_quirk_table[] = { + { + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge C4140"), + }, + }, + { + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge R440"), + }, + }, + { + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge R540"), + }, + }, + { + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge R640"), + }, + }, + { + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge R650"), + }, + }, + { + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge R740"), + }, + }, +
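The quirk table above gates the AER workaround on specific Dell PowerEdge models via the kernel's DMI matching. A minimal userspace model of that mechanism is sketched below; note this is illustrative only — the real kernel API is dmi_check_system() with struct dmi_system_id, and DMI_MATCH performs substring matching rather than the exact comparison used here:

```c
#include <assert.h>
#include <string.h>

/* Illustrative stand-in for struct dmi_system_id: one entry per
 * affected platform, identified by DMI vendor + product strings. */
struct quirk_entry {
    const char *sys_vendor;
    const char *product_name;
};

static const struct quirk_entry aer_quirk_table[] = {
    { "Dell Inc.", "PowerEdge C4140" },
    { "Dell Inc.", "PowerEdge R440" },
    { "Dell Inc.", "PowerEdge R740" },
};

/* Stand-in for dmi_check_system(): returns 1 if the running system's
 * DMI strings match any table entry, 0 otherwise. */
static int quirk_matches(const char *vendor, const char *product)
{
    unsigned int i;

    for (i = 0; i < sizeof(aer_quirk_table) / sizeof(aer_quirk_table[0]); i++) {
        if (strcmp(vendor, aer_quirk_table[i].sys_vendor) == 0 &&
            strcmp(product, aer_quirk_table[i].product_name) == 0)
            return 1;
    }
    return 0;
}
```

The design trade-off Tony's reply raises is visible here: every newly affected model needs another table entry, which is why root-causing the firmware behavior with Dell is preferable to growing the quirk list.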
Re: [Intel-wired-lan] [PATCH] i40e: Disable i40e PCIe AER on system reboot
Hi Tony, Our DELL servers are all out of warranty, so I cannot provide more useful information from the communication with the vendor side. Is there any possible fix via upgrading firmware or other components? Thanks, Best Regards Yue On Thu, Mar 6, 2025 at 8:47 AM Tony Nguyen wrote: > On 12/26/2024 7:54 PM, Yue Zhao wrote: > > Disable PCIe AER on the i40e device on system reboot on a limited > > list of Dell PowerEdge systems. This prevents a fatal PCIe AER event > > on the i40e device during the ACPI _PTS (prepare to sleep) method for > > S5 on those systems. The _PTS is invoked by acpi_enter_sleep_state_prep() > > as part of the kernel's reboot sequence as a result of commit > > 38f34dba806a ("PM: ACPI: reboot: Reinstate S5 for reboot"). > > Hi Yue, > > We've contacted Dell to try to root cause the issue and find the proper > fix. It would help if we could provide more information about the > problem and circumstances. Have you reported the issue to Dell? If so, > could you provide that to me (here or privately) so that we can pass > that along to help the investigation? > > Thank you, > Tony > > > We first noticed this abnormal reboot issue in tg3 device, and there > > is a similar patch about disable PCIe AER to fix hardware error during > > reboot. The hardware error in tg3 device has gone after we apply this > > patch below. > > > > > https://lore.kernel.org/lkml/20241129203640.54492-1-lszub...@redhat.com/T/ > > > > So we try to disable PCIe AER on the i40e device in the similar way. 
> > > > hardware crash dmesg log: > > > > ACPI: PM: Preparing to enter system sleep state S5 > > {1}[Hardware Error]: Hardware error from APEI Generic Hardware Error > Source: 5 > > {1}[Hardware Error]: event severity: fatal > > {1}[Hardware Error]: Error 0, type: fatal > > {1}[Hardware Error]: section_type: PCIe error > > {1}[Hardware Error]: port_type: 0, PCIe end point > > {1}[Hardware Error]: version: 3.0 > > {1}[Hardware Error]: command: 0x0006, status: 0x0010 > > {1}[Hardware Error]: device_id: :05:00.1 > > {1}[Hardware Error]: slot: 0 > > {1}[Hardware Error]: secondary_bus: 0x00 > > {1}[Hardware Error]: vendor_id: 0x8086, device_id: 0x1572 > > {1}[Hardware Error]: class_code: 02 > > {1}[Hardware Error]: aer_uncor_status: 0x0010, aer_uncor_mask: > 0x00018000 > > {1}[Hardware Error]: aer_uncor_severity: 0x000ef030 > > {1}[Hardware Error]: TLP Header: 4001 000f 90028090 > > Kernel panic - not syncing: Fatal hardware error! > > Hardware name: Dell Inc. PowerEdge C4140/08Y2GR, BIOS 2.21.1 12/12/2023 > > Call Trace: > > > > dump_stack_lvl+0x48/0x70 > > dump_stack+0x10/0x20 > > panic+0x1b4/0x3a0 > > __ghes_panic+0x6c/0x70 > > ghes_in_nmi_queue_one_entry.constprop.0+0x1ee/0x2c0 > > ghes_notify_nmi+0x5e/0xe0 > > nmi_handle+0x62/0x160 > > default_do_nmi+0x4c/0x150 > > exc_nmi+0x140/0x1f0 > > end_repeat_nmi+0x16/0x67 > > RIP: 0010:intel_idle_irq+0x70/0xf0 > > > > > > cpuidle_enter_state+0x91/0x6f0 > > cpuidle_enter+0x2e/0x50 > > call_cpuidle+0x23/0x60 > > cpuidle_idle_call+0x11d/0x190 > > do_idle+0x82/0xf0 > > cpu_startup_entry+0x2a/0x30 > > rest_init+0xc2/0xf0 > > arch_call_rest_init+0xe/0x30 > > start_kernel+0x34f/0x440 > > x86_64_start_reservations+0x18/0x30 > > x86_64_start_kernel+0xbf/0x110 > > secondary_startup_64_no_verify+0x18f/0x19b > > > > > > Fixes: 38f34dba806a ("PM: ACPI: reboot: Reinstate S5 for reboot") > > Signed-off-by: Yue Zhao > > --- > > drivers/net/ethernet/intel/i40e/i40e_main.c | 64 + > > 1 file changed, 64 insertions(+) > > > > diff --git 
a/drivers/net/ethernet/intel/i40e/i40e_main.c > b/drivers/net/ethernet/intel/i40e/i40e_main.c > > index 0e1d9e2fbf38..80e66e4e90f7 100644 > > --- a/drivers/net/ethernet/intel/i40e/i40e_main.c > > +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c > > @@ -8,6 +8,7 @@ > > #include > > #include > > #include > > +#include > > > > /* Local includes */ > > #include "i40e.h" > > @@ -16608,6 +16609,56 @@ static void i40e_pci_error_resume(struct > pci_dev *pdev) > > i40e_io_resume(pf); > > } > > > > +/* Systems where ACPI _PTS (Prepare To Sleep) S5 will result in a fatal > > + * PCIe AER event on the i40e device if the i40e device is not, or > cannot > > + * be, powered down. > > + */ > > +static const struct dmi_system_id i40e_restart_aer_quirk_table[] = { > > + { > > + .matches = { > > + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), > > + DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge C4140"), > > + }, > > + }, > > + { > > + .matches = { > > + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), > > + DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge R440"), > > + }, > > + }, > > + { > > + .matches = { > > + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), > > + D
Re: [Intel-wired-lan] [PATCH iwl-next v8 01/11] net: stmmac: move frag_size handling out of spin_lock
On Wed, 5 Mar 2025 08:00:16 -0500 Faizal Rahim wrote: > The upcoming patch will extract verification logic into a new module, > MMSV (MAC Merge Software Verification). MMSV will handle most FPE fields, > except frag_size. It introduces its own lock (mmsv->lock), replacing > fpe_cfg->lock. > > Since frag_size handling remains in the driver, the existing rtnl_lock() > is sufficient. Move frag_size handling out of spin_lock_irq_save() to keep > the upcoming patch a pure refactoring without behavior changes. > > Signed-off-by: Faizal Rahim > --- > drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c | 11 ++- > 1 file changed, 6 insertions(+), 5 deletions(-) > > diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c > b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c > index 918a32f8fda8..cfe5aea24549 100644 > --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c > +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c > @@ -1216,6 +1216,10 @@ static int stmmac_get_mm(struct net_device *ndev, > if (!stmmac_fpe_supported(priv)) > return -EOPNOTSUPP; > > + state->rx_min_frag_size = ETH_ZLEN; > + frag_size = stmmac_fpe_get_add_frag_size(priv); > + state->tx_min_frag_size = ethtool_mm_frag_size_add_to_min(frag_size); > + > spin_lock_irqsave(&priv->fpe_cfg.lock, flags); > > state->max_verify_time = STMMAC_FPE_MM_MAX_VERIFY_TIME_MS; > @@ -1224,7 +1228,6 @@ static int stmmac_get_mm(struct net_device *ndev, > state->verify_time = priv->fpe_cfg.verify_time; > state->tx_enabled = priv->fpe_cfg.tx_enabled; > state->verify_status = priv->fpe_cfg.status; > - state->rx_min_frag_size = ETH_ZLEN; > > /* FPE active if common tx_enabled and >* (verification success or disabled(forced)) > @@ -1236,9 +1239,6 @@ static int stmmac_get_mm(struct net_device *ndev, > else > state->tx_active = false; > > - frag_size = stmmac_fpe_get_add_frag_size(priv); > - state->tx_min_frag_size = ethtool_mm_frag_size_add_to_min(frag_size); > - > 
spin_unlock_irqrestore(&priv->fpe_cfg.lock, flags); > > return 0; > @@ -1258,6 +1258,8 @@ static int stmmac_set_mm(struct net_device *ndev, > struct ethtool_mm_cfg *cfg, > if (err) > return err; > > + stmmac_fpe_set_add_frag_size(priv, frag_size); > + > /* Wait for the verification that's currently in progress to finish */ > timer_shutdown_sync(&fpe_cfg->verify_timer); > > @@ -1271,7 +1273,6 @@ static int stmmac_set_mm(struct net_device *ndev, > struct ethtool_mm_cfg *cfg, > if (!cfg->verify_enabled) > fpe_cfg->status = ETHTOOL_MM_VERIFY_STATUS_DISABLED; > > - stmmac_fpe_set_add_frag_size(priv, frag_size); > stmmac_fpe_apply(priv); > > spin_unlock_irqrestore(&fpe_cfg->lock, flags); Reviewed-by: Furong Xu <0x1...@gmail.com>
Re: [Intel-wired-lan] MMIO write access to an invalid page in i40e_clear_hw()
> -Original Message- > From: Intel-wired-lan On Behalf Of > Przemek Kitszel > Sent: Wednesday, March 5, 2025 11:27 AM > To: Kyungwook Boo > Cc: intel-wired-...@lists.osuosl.org; linux-ker...@vger.kernel.org; Nguyen, > Anthony L > Subject: Re: [Intel-wired-lan] MMIO write access to an invalid page in > i40e_clear_hw() > > On 3/3/25 11:19, Kyungwook Boo wrote: > > Hello, > > > > It seems that there are invalid page MMIO write access in > > i40e_clear_hw() > > Hi, > > is this something that actually occurred, or just a theoretical bug? > (depending on that we will apply it to different tree) > > please send a proper patch anyway, as it looks legit to don't go bananas when > HW gives you 0 > > (and CC netdev instead of generic kernel ML, perhaps that's the reason this > mail was tagged as spam for me) > > > due to an integer underflow from num_pf_int(also num_vf_int seems > possible). > > > > The following is a sample code in i40e_clear_hw(): > > > > val = rd32(hw, I40E_GLPCI_CNF2); // (1) num_pf_int = > > FIELD_GET(I40E_GLPCI_CNF2_MSI_X_PF_N_MASK, val); // (2) num_vf_int = > > FIELD_GET(I40E_GLPCI_CNF2_MSI_X_VF_N_MASK, val); ... > > for (i = 0; i < num_pf_int - 2; i++) // (3) > > wr32(hw, I40E_PFINT_DYN_CTLN(i), val); // (4) ... > > for (i = 0; i < num_pf_int - 2; i++)// (5) > > wr32(hw, I40E_PFINT_LNKLSTN(i), val); ... > > for (i = 0; i < num_vf_int - 2; i++)// (6) > > wr32(hw, I40E_VPINT_LNKLSTN(i), val); > > > > An example scenario for num_pf_int: > > (1) val = 0 (if MMIO read value was 0) > > (2) num_pf_int = 0 (also zero after bit field extraction from val) > > (3) An integer underflow occurs (num_pf_int - 2 == 0xfffe) > > (4) Out-of-bounds MMIO write access if access address exceeds the > > expected range. > > > > From above example scenario, the maximum access offset value can be > > around > > 0x4000347f8(=172G) which seems like this underflow is not > > intended(also there are masking operations like (2) for num_pf_int), so I > report this issue. 
> > > > I think similar issue also could happen at (5) and (6). > > > > The following is the patch method I propose: > > > > diff --git a/drivers/net/ethernet/intel/i40e/i40e_common.c > > b/drivers/net/ethernet/intel/i40e/i40e_common.c > > index 370b4bddee44..97ef79be39b3 100644 > > --- a/drivers/net/ethernet/intel/i40e/i40e_common.c > > +++ b/drivers/net/ethernet/intel/i40e/i40e_common.c > > @@ -848,19 +848,25 @@ void i40e_clear_hw(struct i40e_hw *hw) > > /* stop all the interrupts */ > > wr32(hw, I40E_PFINT_ICR0_ENA, 0); > > val = 0x3 << I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT; > > - for (i = 0; i < num_pf_int - 2; i++) > > - wr32(hw, I40E_PFINT_DYN_CTLN(i), val); > > + if (num_pf_int > 1) { > > instead of adding if conditions, I would simply change the type to be signed Agree, but don't forget to make I signed too! > > + for (i = 0; i < num_pf_int - 2; i++) > > + wr32(hw, I40E_PFINT_DYN_CTLN(i), val); > > + } > > > > /* Set the FIRSTQ_INDX field to 0x7FF in PFINT_LNKLSTx */ > > val = eol << I40E_PFINT_LNKLST0_FIRSTQ_INDX_SHIFT; > > wr32(hw, I40E_PFINT_LNKLST0, val); > > - for (i = 0; i < num_pf_int - 2; i++) > > - wr32(hw, I40E_PFINT_LNKLSTN(i), val); > > + if (num_pf_int > 1) { > > + for (i = 0; i < num_pf_int - 2; i++) > > + wr32(hw, I40E_PFINT_LNKLSTN(i), val); > > + } > > val = eol << I40E_VPINT_LNKLST0_FIRSTQ_INDX_SHIFT; > > for (i = 0; i < num_vfs; i++) > > wr32(hw, I40E_VPINT_LNKLST0(i), val); > > - for (i = 0; i < num_vf_int - 2; i++) > > - wr32(hw, I40E_VPINT_LNKLSTN(i), val); > > + if (num_vf_int > 1) { > > + for (i = 0; i < num_vf_int - 2; i++) > > + wr32(hw, I40E_VPINT_LNKLSTN(i), val); > > + } > > > > /* warn the HW of the coming Tx disables */ > > for (i = 0; i < num_queues; i++) { > > > > > > Could you check this? > > > > Best regards, > > Kyungwook Boo
Re: [Intel-wired-lan] MMIO write access to an invalid page in i40e_clear_hw()
> -Original Message- > From: Intel-wired-lan On Behalf Of > Kyungwook Boo > Sent: Monday, March 3, 2025 11:20 AM > To: Nguyen, Anthony L ; Kitszel, Przemyslaw > > Cc: intel-wired-...@lists.osuosl.org; linux-ker...@vger.kernel.org > Subject: [Intel-wired-lan] MMIO write access to an invalid page in > i40e_clear_hw() > Please start commit title with 'fix' to explicitly tell what your patch do i.e. : Ice: fix MMIO write access to an invalid page in i40e_clear_hw > Hello, > > It seems that there are invalid page MMIO write access in i40e_clear_hw() due > to an integer underflow from num_pf_int(also num_vf_int seems possible). > > The following is a sample code in i40e_clear_hw(): > > val = rd32(hw, I40E_GLPCI_CNF2); // (1) > num_pf_int = FIELD_GET(I40E_GLPCI_CNF2_MSI_X_PF_N_MASK, val); // (2) > num_vf_int = FIELD_GET(I40E_GLPCI_CNF2_MSI_X_VF_N_MASK, val); ... > for (i = 0; i < num_pf_int - 2; i++) // (3) > wr32(hw, I40E_PFINT_DYN_CTLN(i), val); // (4) ... > for (i = 0; i < num_pf_int - 2; i++) // (5) > wr32(hw, I40E_PFINT_LNKLSTN(i), val); > ... > for (i = 0; i < num_vf_int - 2; i++) // (6) > wr32(hw, I40E_VPINT_LNKLSTN(i), val); > > An example scenario for num_pf_int: > (1) val = 0 (if MMIO read value was 0) > (2) num_pf_int = 0 (also zero after bit field extraction from val) > (3) An integer underflow occurs (num_pf_int - 2 == 0xfffe) > (4) Out-of-bounds MMIO write access if access address exceeds the expected > range. > > From above example scenario, the maximum access offset value can be > around > 0x4000347f8(=172G) which seems like this underflow is not intended(also > there are masking operations like (2) for num_pf_int), so I report this issue. > > I think similar issue also could happen at (5) and (6). 
> > The following is the patch method I propose: > Please add Fixes: tag https://www.kernel.org/doc/html/latest/process/submitting-patches.html > diff --git a/drivers/net/ethernet/intel/i40e/i40e_common.c > b/drivers/net/ethernet/intel/i40e/i40e_common.c > index 370b4bddee44..97ef79be39b3 100644 > --- a/drivers/net/ethernet/intel/i40e/i40e_common.c > +++ b/drivers/net/ethernet/intel/i40e/i40e_common.c > @@ -848,19 +848,25 @@ void i40e_clear_hw(struct i40e_hw *hw) > /* stop all the interrupts */ > wr32(hw, I40E_PFINT_ICR0_ENA, 0); > val = 0x3 << I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT; > - for (i = 0; i < num_pf_int - 2; i++) > - wr32(hw, I40E_PFINT_DYN_CTLN(i), val); > + if (num_pf_int > 1) { > + for (i = 0; i < num_pf_int - 2; i++) > + wr32(hw, I40E_PFINT_DYN_CTLN(i), val); > + } > > /* Set the FIRSTQ_INDX field to 0x7FF in PFINT_LNKLSTx */ > val = eol << I40E_PFINT_LNKLST0_FIRSTQ_INDX_SHIFT; > wr32(hw, I40E_PFINT_LNKLST0, val); > - for (i = 0; i < num_pf_int - 2; i++) > - wr32(hw, I40E_PFINT_LNKLSTN(i), val); > + if (num_pf_int > 1) { > + for (i = 0; i < num_pf_int - 2; i++) > + wr32(hw, I40E_PFINT_LNKLSTN(i), val); > + } Can you consider moving this if upper and use it once instead of duplicating the code? I think it can help to maintain the code. What do you think? > val = eol << I40E_VPINT_LNKLST0_FIRSTQ_INDX_SHIFT; > for (i = 0; i < num_vfs; i++) > wr32(hw, I40E_VPINT_LNKLST0(i), val); > - for (i = 0; i < num_vf_int - 2; i++) > - wr32(hw, I40E_VPINT_LNKLSTN(i), val); > + if (num_vf_int > 1) { > + for (i = 0; i < num_vf_int - 2; i++) > + wr32(hw, I40E_VPINT_LNKLSTN(i), val); > + } > > /* warn the HW of the coming Tx disables */ > for (i = 0; i < num_queues; i++) { > > > Could you check this? > > Best regards, > Kyungwook Boo
Re: [Intel-wired-lan] [PATCH iwl-next v3 1/4] ixgbe: create E610 specific ethtool_ops structure
On Mon, Mar 03, 2025 at 01:06:27PM +0100, Jedrzej Jagielski wrote: > E610's implementation of various ethtool ops is different than > the ones corresponding to ixgbe legacy products. Therefore create > separate E610 ethtool_ops struct which will be filled out in the > forthcoming patches. > > Add adequate ops struct basing on MAC type. This step requires > changing a bit the flow of probing by placing ixgbe_set_ethtool_ops > after hw.mac.type is assigned. So move the whole netdev assignment > block after hw.mac.type is known. This step doesn't have any additional > impact on probing sequence. > > Suggested-by: Aleksandr Loktionov > Reviewed-by: Aleksandr Loktionov > Signed-off-by: Jedrzej Jagielski > --- > v3: correct the commit msg Reviewed-by: Simon Horman
Re: [Intel-wired-lan] [PATCH iwl-next v3 2/4] ixgbe: add support for ACPI WOL for E610
On Mon, Mar 03, 2025 at 01:06:28PM +0100, Jedrzej Jagielski wrote: > Currently only APM (Advanced Power Management) is supported by > the ixgbe driver. It works for magic packets only, as for different > sources of wake-up E610 adapter utilizes different feature. > > Add E610 specific implementation of ixgbe_set_wol() callback. When > any of broadcast/multicast/unicast wake-up is set, disable APM and > configure ACPI (Advanced Configuration and Power Interface). > > Reviewed-by: Michal Swiatkowski > Reviewed-by: Aleksandr Loktionov > Signed-off-by: Jedrzej Jagielski Reviewed-by: Simon Horman
Re: [Intel-wired-lan] [PATCH iwl-next v3 3/4] ixgbe: apply different rules for setting FC on E610
On Mon, Mar 03, 2025 at 01:06:29PM +0100, Jedrzej Jagielski wrote: > E610 device doesn't support disabling FC autonegotiation. > > Create dedicated E610 .set_pauseparam() implementation and assign > it to ixgbe_ethtool_ops_e610. > > Reviewed-by: Aleksandr Loktionov > Signed-off-by: Jedrzej Jagielski Reviewed-by: Simon Horman
Re: [Intel-wired-lan] [PATCH iwl-next v3 4/4] ixgbe: add E610 .set_phys_id() callback implementation
On Mon, Mar 03, 2025 at 01:06:30PM +0100, Jedrzej Jagielski wrote: > Legacy implementation of .set_phys_id() ethtool callback is not > applicable for E610 device. > > Add new implementation which uses 0x06E9 command by calling > ixgbe_aci_set_port_id_led(). > > Reviewed-by: Aleksandr Loktionov > Reviewed-by: Michal Swiatkowski > Signed-off-by: Jedrzej Jagielski > --- > v3: move the #defines related to ixgbe_aci_cmd_set_port_id_led out of the > struct definition Reviewed-by: Simon Horman
[Intel-wired-lan] igc: high init to link up time on I255
Hi all, I'm trying to figure out what's taking igc/I255 so long to establish link. I enabled debug logs hoping I'd find something meaningful. Here they are (output of "dmesg | grep igc"):

[ 0.628173] calling igc_init_module+0x0/0x3b @ 1
[ 0.628234] igc 0000:01:00.0: PCIe PTM not supported by PCIe bus/controller
[ 0.654818] igc 0000:01:00.0 (unnamed net_device) (uninitialized): Timeout is expired after a phy reset
[ 0.655717] igc 0000:01:00.0 (unnamed net_device) (uninitialized): Masking off all interrupts
[ 0.675821] igc 0000:01:00.0 (unnamed net_device) (uninitialized): Issuing a global reset to MAC
[ 0.676061] igc 0000:01:00.0 (unnamed net_device) (uninitialized): PHC added
[ 0.676067] igc 0000:01:00.0 (unnamed net_device) (uninitialized): Masking off all interrupts
[ 0.696170] igc 0000:01:00.0 (unnamed net_device) (uninitialized): Issuing a global reset to MAC
[ 0.696175] igc 0000:01:00.0 (unnamed net_device) (uninitialized): Programming MAC Address into RAR[0]
[ 0.696179] igc 0000:01:00.0 (unnamed net_device) (uninitialized): Clearing RAR[1-15]
[ 0.696227] igc 0000:01:00.0 (unnamed net_device) (uninitialized): Zeroing the MTA
[ 0.696252] igc 0000:01:00.0 (unnamed net_device) (uninitialized): Zeroing the UTA
[ 0.696270] igc 0000:01:00.0 (unnamed net_device) (uninitialized): After fix-ups FlowControl is now = 3
[ 0.696272] igc 0000:01:00.0 (unnamed net_device) (uninitialized): Reconfiguring auto-neg advertisement params
[ 0.696808] igc 0000:01:00.0 (unnamed net_device) (uninitialized): autoneg_advertised af
[ 0.696809] igc 0000:01:00.0 (unnamed net_device) (uninitialized): Advertise 10mb Half duplex
[ 0.696809] igc 0000:01:00.0 (unnamed net_device) (uninitialized): Advertise 10mb Full duplex
[ 0.696810] igc 0000:01:00.0 (unnamed net_device) (uninitialized): Advertise 100mb Half duplex
[ 0.696810] igc 0000:01:00.0 (unnamed net_device) (uninitialized): Advertise 100mb Full duplex
[ 0.696811] igc 0000:01:00.0 (unnamed net_device) (uninitialized): Advertise 1000mb Full duplex
[ 0.696811] igc 0000:01:00.0 (unnamed net_device) (uninitialized): Advertise 2500mb Full duplex
[ 0.696888] igc 0000:01:00.0 (unnamed net_device) (uninitialized): Auto-Neg Advertising de1
[ 0.697348] igc 0000:01:00.0 (unnamed net_device) (uninitialized): Restarting Auto-Neg
[ 0.699152] igc 0000:01:00.0 (unnamed net_device) (uninitialized): Unable to establish link!!!
[ 0.699153] igc 0000:01:00.0 (unnamed net_device) (uninitialized): Initializing the Flow Control address, type and timer regs
[ 0.701455] igc 0000:01:00.0: 4.000 Gb/s available PCIe bandwidth (5.0 GT/s PCIe x1 link)
[ 0.701457] igc 0000:01:00.0 eth0: MAC: 
[ 0.701469] initcall igc_init_module+0x0/0x3b returned 0 after 71577 usecs
[ 1.094704] igc 0000:01:00.0 eth0: changing MTU from 1500 to 9000
[ 1.094720] igc 0000:01:00.0 eth0: Masking off all interrupts
[ 1.114868] igc 0000:01:00.0 eth0: Issuing a global reset to MAC
[ 1.114871] igc 0000:01:00.0 eth0: Programming MAC Address into RAR[0]
[ 1.114875] igc 0000:01:00.0 eth0: Clearing RAR[1-15]
[ 1.114923] igc 0000:01:00.0 eth0: Zeroing the MTA
[ 1.114948] igc 0000:01:00.0 eth0: Zeroing the UTA
[ 1.114967] igc 0000:01:00.0 eth0: After fix-ups FlowControl is now = 3
[ 1.114969] igc 0000:01:00.0 eth0: Reconfiguring auto-neg advertisement params
[ 1.115505] igc 0000:01:00.0 eth0: autoneg_advertised af
[ 1.115505] igc 0000:01:00.0 eth0: Advertise 10mb Half duplex
[ 1.115506] igc 0000:01:00.0 eth0: Advertise 10mb Full duplex
[ 1.115506] igc 0000:01:00.0 eth0: Advertise 100mb Half duplex
[ 1.115506] igc 0000:01:00.0 eth0: Advertise 100mb Full duplex
[ 1.115507] igc 0000:01:00.0 eth0: Advertise 1000mb Full duplex
[ 1.115508] igc 0000:01:00.0 eth0: Advertise 2500mb Full duplex
[ 1.115585] igc 0000:01:00.0 eth0: Auto-Neg Advertising de1
[ 1.116044] igc 0000:01:00.0 eth0: Restarting Auto-Neg
[ 1.117847] igc 0000:01:00.0 eth0: Unable to establish link!!!
[ 1.117847] igc 0000:01:00.0 eth0: Initializing the Flow Control address, type and timer regs
[ 1.162956] igc 0000:01:00.0 eth0: Timeout is expired after a phy reset
[ 1.172645] igc 0000:01:00.0 eth0: After fix-ups FlowControl is now = 3
[ 1.172647] igc 0000:01:00.0 eth0: Reconfiguring auto-neg advertisement params
[ 1.173183] igc 0000:01:00.0 eth0: autoneg_advertised af
[ 1.173183] igc 0000:01:00.0 eth0: Advertise 10mb Half duplex
[ 1.173184] igc 0000:01:00.0 eth0: Advertise 10mb Full duplex
[ 1.173184] igc 0000:01:00.0 eth0: Advertise 100mb Half duplex
[ 1.173185] igc 0000:01:00.0 eth0: Advertise 100mb Full duplex
[ 1.173185] igc 0000:01:00.0 eth0: Advertise 1000mb Full duplex
[ 1.173185] igc 0000:01:00.0 eth0: Advertise 2500mb Full duplex
[ 1.173262] igc 0000:01:00.0 eth0: Auto-Neg Advertising de1
[ 1.173721] igc 0000:01:00.0 eth0: Restarting Auto-Neg
[ 1.175529] igc 000
Re: [Intel-wired-lan] [iwl-net v2 5/5] ice: fix using untrusted value of pkt_len in ice_vc_fdir_parse_raw()
On Mon, Mar 03, 2025 at 11:00:35AM +0100, Przemek Kitszel wrote: > On 2/28/25 18:17, Simon Horman wrote: > > On Tue, Feb 25, 2025 at 10:08:49AM +0100, Martyna Szapar-Mudlaw wrote: > > > From: Mateusz Polchlopek > > > > > > Fix using the untrusted value of proto->raw.pkt_len in function > > > ice_vc_fdir_parse_raw() by verifying if it does not exceed the > > > VIRTCHNL_MAX_SIZE_RAW_PACKET value. > > > > > > Fixes: 99f419df8a5c ("ice: enable FDIR filters from raw binary patterns > > > for VFs") > > > Signed-off-by: Mateusz Polchlopek > > > Signed-off-by: Martyna Szapar-Mudlaw > > > > > > --- > > > .../ethernet/intel/ice/ice_virtchnl_fdir.c| 25 +-- > > > 1 file changed, 17 insertions(+), 8 deletions(-) > > > > > > diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_fdir.c > > > b/drivers/net/ethernet/intel/ice/ice_virtchnl_fdir.c > > > index 14e3f0f89c78..6250629ee8f9 100644 > > > --- a/drivers/net/ethernet/intel/ice/ice_virtchnl_fdir.c > > > +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_fdir.c > > > @@ -835,18 +835,27 @@ ice_vc_fdir_parse_raw(struct ice_vf *vf, > > > u8 *pkt_buf, *msk_buf __free(kfree); > > > struct ice_parser_result rslt; > > > struct ice_pf *pf = vf->pf; > > > + u16 pkt_len, udp_port = 0; > > > struct ice_parser *psr; > > > int status = -ENOMEM; > > > struct ice_hw *hw; > > > - u16 udp_port = 0; > > > - pkt_buf = kzalloc(proto->raw.pkt_len, GFP_KERNEL); > > > - msk_buf = kzalloc(proto->raw.pkt_len, GFP_KERNEL); > > > + if (!proto->raw.pkt_len) > > > + return -EINVAL; > > > > Hi Martyna, > > > > It seems to me that the use of __free() above will result in > > kfree(msk_buf) being called here. But msk_buf is not initialised at this > > point. > > > > My suggest would be to drop the use of __free(). > > But if not, I think that in order to be safe it would be best to do this > > (completely untested; > > > > u8 *pkt_buf, *msk_buf __free(kfree) = NULL; > > Oh yeah!, thank you Simon for catching that. 
> > I would say "naked __free()" was harmful here. Yes, quite.
Re: [Intel-wired-lan] [PATCH iwl-next v3] igc: Change Tx mode for MQPRIO offloading
On Tue Mar 04 2025, Simon Horman wrote: > On Mon, Mar 03, 2025 at 10:16:33AM +0100, Kurt Kanzenbach wrote: >> The current MQPRIO offload implementation uses the legacy TSN Tx mode. In >> this mode the hardware uses four packet buffers and considers queue >> priorities. >> >> In order to harmonize the TAPRIO implementation with MQPRIO, switch to the >> regular TSN Tx mode. This mode also uses four packet buffers and considers >> queue priorities. In addition to the legacy mode, transmission is always >> coupled to Qbv. The driver already has mechanisms to use a dummy schedule >> of 1 second with all gates open for ETF. Simply use this for MQPRIO too. >> >> This reduces code and makes it easier to add support for frame preemption >> later. >> >> While at it limit the netdev_tc calls to MQPRIO only. > > Hi Kurt, > > Can this part be broken out into a separate patch? > It seems so to me, but perhaps I'm missing something. > > The reason that I ask is that this appears to be a good portion of the > change, and doing so would make the code changes for main part of the > patch, as per the description prior to the line above, clearer IMHO. Sure, I think it can be broken out into a dedicated patch. I'll see what I can come up with. Thanks, Kurt
Re: [Intel-wired-lan] MMIO write access to an invalid page in i40e_clear_hw()
On 3/3/25 11:19, Kyungwook Boo wrote: Hello, It seems that there are invalid page MMIO write access in i40e_clear_hw() Hi, is this something that actually occurred, or just a theoretical bug? (depending on that we will apply it to different tree) please send a proper patch anyway, as it looks legit to don't go bananas when HW gives you 0 (and CC netdev instead of generic kernel ML, perhaps that's the reason this mail was tagged as spam for me) due to an integer underflow from num_pf_int(also num_vf_int seems possible). The following is a sample code in i40e_clear_hw(): val = rd32(hw, I40E_GLPCI_CNF2); // (1) num_pf_int = FIELD_GET(I40E_GLPCI_CNF2_MSI_X_PF_N_MASK, val); // (2) num_vf_int = FIELD_GET(I40E_GLPCI_CNF2_MSI_X_VF_N_MASK, val); ... for (i = 0; i < num_pf_int - 2; i++) // (3) wr32(hw, I40E_PFINT_DYN_CTLN(i), val); // (4) ... for (i = 0; i < num_pf_int - 2; i++) // (5) wr32(hw, I40E_PFINT_LNKLSTN(i), val); ... for (i = 0; i < num_vf_int - 2; i++) // (6) wr32(hw, I40E_VPINT_LNKLSTN(i), val); An example scenario for num_pf_int: (1) val = 0 (if MMIO read value was 0) (2) num_pf_int = 0 (also zero after bit field extraction from val) (3) An integer underflow occurs (num_pf_int - 2 == 0xfffe) (4) Out-of-bounds MMIO write access if access address exceeds the expected range. From above example scenario, the maximum access offset value can be around 0x4000347f8(=172G) which seems like this underflow is not intended(also there are masking operations like (2) for num_pf_int), so I report this issue. I think similar issue also could happen at (5) and (6). 
The following is the patch method I propose: diff --git a/drivers/net/ethernet/intel/i40e/i40e_common.c b/drivers/net/ethernet/intel/i40e/i40e_common.c index 370b4bddee44..97ef79be39b3 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_common.c +++ b/drivers/net/ethernet/intel/i40e/i40e_common.c @@ -848,19 +848,25 @@ void i40e_clear_hw(struct i40e_hw *hw) /* stop all the interrupts */ wr32(hw, I40E_PFINT_ICR0_ENA, 0); val = 0x3 << I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT; - for (i = 0; i < num_pf_int - 2; i++) - wr32(hw, I40E_PFINT_DYN_CTLN(i), val); + if (num_pf_int > 1) { instead of adding if conditions, I would simply change the type to be signed + for (i = 0; i < num_pf_int - 2; i++) + wr32(hw, I40E_PFINT_DYN_CTLN(i), val); + } /* Set the FIRSTQ_INDX field to 0x7FF in PFINT_LNKLSTx */ val = eol << I40E_PFINT_LNKLST0_FIRSTQ_INDX_SHIFT; wr32(hw, I40E_PFINT_LNKLST0, val); - for (i = 0; i < num_pf_int - 2; i++) - wr32(hw, I40E_PFINT_LNKLSTN(i), val); + if (num_pf_int > 1) { + for (i = 0; i < num_pf_int - 2; i++) + wr32(hw, I40E_PFINT_LNKLSTN(i), val); + } val = eol << I40E_VPINT_LNKLST0_FIRSTQ_INDX_SHIFT; for (i = 0; i < num_vfs; i++) wr32(hw, I40E_VPINT_LNKLST0(i), val); - for (i = 0; i < num_vf_int - 2; i++) - wr32(hw, I40E_VPINT_LNKLSTN(i), val); + if (num_vf_int > 1) { + for (i = 0; i < num_vf_int - 2; i++) + wr32(hw, I40E_VPINT_LNKLSTN(i), val); + } /* warn the HW of the coming Tx disables */ for (i = 0; i < num_queues; i++) { Could you check this? Best regards, Kyungwook Boo
[Intel-wired-lan] [PATCH iwl-next v8 01/11] net: stmmac: move frag_size handling out of spin_lock
The upcoming patch will extract verification logic into a new module, MMSV (MAC Merge Software Verification). MMSV will handle most FPE fields, except frag_size. It introduces its own lock (mmsv->lock), replacing fpe_cfg->lock. Since frag_size handling remains in the driver, the existing rtnl_lock() is sufficient. Move frag_size handling out of spin_lock_irq_save() to keep the upcoming patch a pure refactoring without behavior changes. Signed-off-by: Faizal Rahim --- drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c | 11 ++- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c index 918a32f8fda8..cfe5aea24549 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c @@ -1216,6 +1216,10 @@ static int stmmac_get_mm(struct net_device *ndev, if (!stmmac_fpe_supported(priv)) return -EOPNOTSUPP; + state->rx_min_frag_size = ETH_ZLEN; + frag_size = stmmac_fpe_get_add_frag_size(priv); + state->tx_min_frag_size = ethtool_mm_frag_size_add_to_min(frag_size); + spin_lock_irqsave(&priv->fpe_cfg.lock, flags); state->max_verify_time = STMMAC_FPE_MM_MAX_VERIFY_TIME_MS; @@ -1224,7 +1228,6 @@ static int stmmac_get_mm(struct net_device *ndev, state->verify_time = priv->fpe_cfg.verify_time; state->tx_enabled = priv->fpe_cfg.tx_enabled; state->verify_status = priv->fpe_cfg.status; - state->rx_min_frag_size = ETH_ZLEN; /* FPE active if common tx_enabled and * (verification success or disabled(forced)) @@ -1236,9 +1239,6 @@ static int stmmac_get_mm(struct net_device *ndev, else state->tx_active = false; - frag_size = stmmac_fpe_get_add_frag_size(priv); - state->tx_min_frag_size = ethtool_mm_frag_size_add_to_min(frag_size); - spin_unlock_irqrestore(&priv->fpe_cfg.lock, flags); return 0; @@ -1258,6 +1258,8 @@ static int stmmac_set_mm(struct net_device *ndev, struct ethtool_mm_cfg *cfg, if (err) return err; + 
stmmac_fpe_set_add_frag_size(priv, frag_size); + /* Wait for the verification that's currently in progress to finish */ timer_shutdown_sync(&fpe_cfg->verify_timer); @@ -1271,7 +1273,6 @@ static int stmmac_set_mm(struct net_device *ndev, struct ethtool_mm_cfg *cfg, if (!cfg->verify_enabled) fpe_cfg->status = ETHTOOL_MM_VERIFY_STATUS_DISABLED; - stmmac_fpe_set_add_frag_size(priv, frag_size); stmmac_fpe_apply(priv); spin_unlock_irqrestore(&fpe_cfg->lock, flags); -- 2.34.1
[Intel-wired-lan] [PATCH iwl-next v8 07/11] igc: add support for frame preemption verification
This patch implements the "ethtool --set-mm" callback to trigger the frame preemption verification handshake. Uses the MAC Merge Software Verification (mmsv) mechanism in ethtool to perform the verification handshake for igc. The structure fpe.mmsv is set by mmsv in ethtool and should remain read-only for the driver. Other mmsv callbacks: a) configure_tx() -> not used yet at this point - igc lacks registers to configure FPE in the transmit direction, so this API is not utilized for now. A future patch will use it to control preemptible queue config. b) configure_pmac() -> not used - this callback dynamically controls pmac_enabled at runtime. For example, mmsv calls configure_pmac() and disables pmac_enabled when the link partner goes down, even if the user previously enabled it. The intention is to save power but it is not feasible in igc because it causes an endless adapter reset loop: 1) Board A and Board B complete the verification handshake. Tx mode register for both boards are in TSN mode. 2) Board B link goes down. On Board A: 3) mmsv calls configure_pmac() with pmac_enabled = false. 4) configure_pmac() in igc updates a new field based on pmac_enabled. Driver uses this field in igc_tsn_new_flags() to indicate that the user enabled/disabled FPE. 5) configure_pmac() in igc calls igc_tsn_offload_apply() to check whether an adapter reset is needed. Calls existing logic in igc_tsn_will_tx_mode_change() and igc_tsn_new_flags(). 6) Since pmac_enabled is now disabled and no other TSN feature is active, igc_tsn_will_tx_mode_change() evaluates to true because Tx mode will switch from TSN to Legacy. 7) Driver resets the adapter. 8) Registers are set, and Tx mode switches to Legacy. 9) When link partner is up, steps 3–8 repeat, but this time with pmac_enabled = true, reactivating TSN. igc_tsn_will_tx_mode_change() evaluates to true again, since Tx mode will switch from Legacy to TSN. 10) Driver resets the adapter. 
11) Reset of the adapter completes, registers are set, and Tx mode switches to TSN. On Board B: 12) Adapter reset on Board A at step 10 causes it to detect its link partner as down. 13) Repeats steps 3–8. 14) Once the adapter reset on Board A completes at step 11, it detects its link partner as up. 15) Repeats steps 9–11. - this cycle repeats indefinitely. To avoid this issue, igc only uses mmsv.pmac_enabled to track whether FPE is enabled or disabled. Co-developed-by: Vinicius Costa Gomes Signed-off-by: Vinicius Costa Gomes Co-developed-by: Choong Yong Liang Signed-off-by: Choong Yong Liang Co-developed-by: Chwee-Lin Choong Signed-off-by: Chwee-Lin Choong Signed-off-by: Faizal Rahim --- drivers/net/ethernet/intel/igc/igc.h | 12 +- drivers/net/ethernet/intel/igc/igc_base.h | 1 + drivers/net/ethernet/intel/igc/igc_defines.h | 8 +- drivers/net/ethernet/intel/igc/igc_ethtool.c | 21 +++ drivers/net/ethernet/intel/igc/igc_main.c | 53 ++- drivers/net/ethernet/intel/igc/igc_tsn.c | 146 ++- drivers/net/ethernet/intel/igc/igc_tsn.h | 53 +++ 7 files changed, 289 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc.h b/drivers/net/ethernet/intel/igc/igc.h index 22ecdac26cf4..d9ecb7cf80c9 100644 --- a/drivers/net/ethernet/intel/igc/igc.h +++ b/drivers/net/ethernet/intel/igc/igc.h @@ -40,6 +40,10 @@ void igc_ethtool_set_ops(struct net_device *); #define IGC_MAX_TX_TSTAMP_REGS 4 +struct igc_fpe_t { + struct ethtool_mmsv mmsv; +}; + enum igc_mac_filter_type { IGC_MAC_FILTER_TYPE_DST = 0, IGC_MAC_FILTER_TYPE_SRC @@ -332,6 +336,8 @@ struct igc_adapter { struct timespec64 period; } perout[IGC_N_PEROUT]; + struct igc_fpe_t fpe; + /* LEDs */ struct mutex led_mutex; struct igc_led_classdev *leds; @@ -389,10 +395,11 @@ extern char igc_driver_name[]; #define IGC_FLAG_TSN_QBV_ENABLED BIT(17) #define IGC_FLAG_TSN_QAV_ENABLED BIT(18) #define IGC_FLAG_TSN_LEGACY_ENABLED BIT(19) +#define IGC_FLAG_TSN_PREEMPT_ENABLED BIT(20) #define IGC_FLAG_TSN_ANY_ENABLED \
(IGC_FLAG_TSN_QBV_ENABLED | IGC_FLAG_TSN_QAV_ENABLED | \ -IGC_FLAG_TSN_LEGACY_ENABLED) +IGC_FLAG_TSN_LEGACY_ENABLED | IGC_FLAG_TSN_PREEMPT_ENABLED) #define IGC_FLAG_RSS_FIELD_IPV4_UDPBIT(6) #define IGC_FLAG_RSS_FIELD_IPV6_UDPBIT(7) @@ -736,7 +743,10 @@ struct igc_nfc_rule *igc_get_nfc_rule(struct igc_adapter *adapter, u32 location); int igc_add_nfc_rule(struct igc_adapter *adapter, struct igc_nfc_rule *rule); void igc_del_nfc_rule(struct igc_adapter *adapter, struct igc_nfc_rule *rule); +void igc_disable_empty_addr_recv(struct igc_adapter *adapter); +int igc_enable_empty_addr_recv(struct igc_adapter *
[Intel-wired-lan] [PATCH iwl-next v8 10/11] igc: add support to get MAC Merge data via ethtool
Implement "ethtool --show-mm" callback for IGC. Tested with command: $ ethtool --show-mm enp1s0

MAC Merge layer state for enp1s0:
pMAC enabled: on
TX enabled: on
TX active: on
TX minimum fragment size: 64
RX minimum fragment size: 60
Verify enabled: on
Verify time: 128
Max verify time: 128
Verification status: SUCCEEDED

Verified that the field values are retrieved correctly. Signed-off-by: Faizal Rahim --- drivers/net/ethernet/intel/igc/igc_ethtool.c | 14 ++ drivers/net/ethernet/intel/igc/igc_tsn.h | 1 + 2 files changed, 15 insertions(+) diff --git a/drivers/net/ethernet/intel/igc/igc_ethtool.c b/drivers/net/ethernet/intel/igc/igc_ethtool.c index 529654ccd83f..fd4b4b332309 100644 --- a/drivers/net/ethernet/intel/igc/igc_ethtool.c +++ b/drivers/net/ethernet/intel/igc/igc_ethtool.c @@ -1782,6 +1782,19 @@ static int igc_ethtool_set_eee(struct net_device *netdev, return 0; } +static int igc_ethtool_get_mm(struct net_device *netdev, + struct ethtool_mm_state *cmd) +{ + struct igc_adapter *adapter = netdev_priv(netdev); + struct igc_fpe_t *fpe = &adapter->fpe; + + ethtool_mmsv_get_mm(&fpe->mmsv, cmd); + cmd->tx_min_frag_size = fpe->tx_min_frag_size; + cmd->rx_min_frag_size = IGC_RX_MIN_FRAG_SIZE; + + return 0; +} + static int igc_ethtool_set_mm(struct net_device *netdev, struct ethtool_mm_cfg *cmd, struct netlink_ext_ack *extack) @@ -2101,6 +2114,7 @@ static const struct ethtool_ops igc_ethtool_ops = { .get_link_ksettings = igc_ethtool_get_link_ksettings, .set_link_ksettings = igc_ethtool_set_link_ksettings, .self_test = igc_ethtool_diag_test, + .get_mm = igc_ethtool_get_mm, .set_mm = igc_ethtool_set_mm, }; diff --git a/drivers/net/ethernet/intel/igc/igc_tsn.h b/drivers/net/ethernet/intel/igc/igc_tsn.h index 975f4e38836e..2b885f98e720 100644 --- a/drivers/net/ethernet/intel/igc/igc_tsn.h +++ b/drivers/net/ethernet/intel/igc/igc_tsn.h @@ -4,6 +4,7 @@ #ifndef _IGC_TSN_H_ #define _IGC_TSN_H_ +#define IGC_RX_MIN_FRAG_SIZE 60 #define SMD_FRAME_SIZE 60 enum igc_txd_popts_type 
{ -- 2.34.1
[Intel-wired-lan] [PATCH iwl-next v8 06/11] igc: set the RX packet buffer size for TSN mode
In preparation for supporting frame preemption, when entering TSN mode set the receive packet buffer to 16KB for the Express MAC, 16KB for the Preemptible MAC and 2KB for the BMC, according to the datasheet section 7.1.3.2. Co-developed-by: Vinicius Costa Gomes Signed-off-by: Vinicius Costa Gomes Signed-off-by: Faizal Rahim --- drivers/net/ethernet/intel/igc/igc_defines.h | 3 +++ drivers/net/ethernet/intel/igc/igc_tsn.c | 13 +++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_defines.h b/drivers/net/ethernet/intel/igc/igc_defines.h index 516ef70c98e9..b19ac6f30dac 100644 --- a/drivers/net/ethernet/intel/igc/igc_defines.h +++ b/drivers/net/ethernet/intel/igc/igc_defines.h @@ -402,6 +402,9 @@ /* 7KB bytes buffer for each tx queue (total 4 queues) + 4KB for BMC*/ #define IGC_TXPBSIZE_TSN 0x041c71c7 +/* 15KB for EXP + 15KB for BE + 2KB for BMC */ +#define IGC_RXPBSIZE_TSN 0xf08f +#define IGC_RXPBSIZE_SIZE_MASK 0x0001 #define IGC_DTXMXPKTSZ_TSN 0x19 /* 1600 bytes of max TX DMA packet size */ #define IGC_DTXMXPKTSZ_DEFAULT 0x98 /* 9728-byte Jumbo frames */ diff --git a/drivers/net/ethernet/intel/igc/igc_tsn.c b/drivers/net/ethernet/intel/igc/igc_tsn.c index 1e44374ca1ff..f0213cfce07d 100644 --- a/drivers/net/ethernet/intel/igc/igc_tsn.c +++ b/drivers/net/ethernet/intel/igc/igc_tsn.c @@ -132,13 +132,17 @@ static int igc_tsn_disable_offload(struct igc_adapter *adapter) { u16 queue_per_tc[4] = { 3, 2, 1, 0 }; struct igc_hw *hw = &adapter->hw; - u32 tqavctrl; + u32 tqavctrl, rxpbs; int i; wr32(IGC_GTXOFFSET, 0); wr32(IGC_TXPBS, I225_TXPBSIZE_DEFAULT); wr32(IGC_DTXMXPKTSZ, IGC_DTXMXPKTSZ_DEFAULT); + rxpbs = rd32(IGC_RXPBS) & ~IGC_RXPBSIZE_SIZE_MASK; + rxpbs |= I225_RXPBSIZE_DEFAULT; + wr32(IGC_RXPBS, rxpbs); + if (igc_is_device_id_i226(hw)) igc_tsn_restore_retx_default(adapter); @@ -194,7 +198,7 @@ static int igc_tsn_enable_offload(struct igc_adapter *adapter) { struct igc_hw *hw = &adapter->hw; u32 tqavctrl, baset_l, 
baset_h; - u32 sec, nsec, cycle; + u32 sec, nsec, cycle, rxpbs; ktime_t base_time, systim; int i; @@ -202,6 +206,11 @@ static int igc_tsn_enable_offload(struct igc_adapter *adapter) wr32(IGC_DTXMXPKTSZ, IGC_DTXMXPKTSZ_TSN); wr32(IGC_TXPBS, IGC_TXPBSIZE_TSN); + rxpbs = rd32(IGC_RXPBS) & ~IGC_RXPBSIZE_SIZE_MASK; + rxpbs |= IGC_RXPBSIZE_TSN; + + wr32(IGC_RXPBS, rxpbs); + if (igc_is_device_id_i226(hw)) igc_tsn_set_retx_qbvfullthreshold(adapter); -- 2.34.1
[Intel-wired-lan] [PATCH iwl-next v8 02/11] net: ethtool: mm: extract stmmac verification logic into common library
From: Vladimir Oltean It appears that stmmac is not the only hardware which requires a software-driven verification state machine for the MAC Merge layer. While on the one hand it's good to encourage hardware implementations, on the other hand it's quite difficult to tolerate multiple drivers implementing independently fairly non-trivial logic. Extract the hardware-independent logic from stmmac into library code and put it in ethtool. Name the state structure "mmsv" for MAC Merge Software Verification. Let this expose an operations structure for executing the hardware stuff: sync hardware with the tx_active boolean (result of verification process), enable/disable the pMAC, send mPackets, notify library of external events (reception of mPackets), as well as link state changes. Note that it is assumed that the external events are received in hardirq context. If they are not, it is probably a good idea to disable hardirqs when calling ethtool_mmsv_event_handle(), because the library does not do so. Also, the MM software verification process has no business with the tx_min_frag_size, that is all the driver's to handle. 
Signed-off-by: Vladimir Oltean Co-developed-by: Choong Yong Liang Signed-off-by: Choong Yong Liang Co-developed-by: Faizal Rahim Signed-off-by: Faizal Rahim Tested-by: Choong Yong Liang Tested-by: Furong Xu <0x1...@gmail.com> --- drivers/net/ethernet/stmicro/stmmac/stmmac.h | 16 +- .../ethernet/stmicro/stmmac/stmmac_ethtool.c | 42 +-- .../net/ethernet/stmicro/stmmac/stmmac_fpe.c | 174 +++ .../net/ethernet/stmicro/stmmac/stmmac_fpe.h | 5 - .../net/ethernet/stmicro/stmmac/stmmac_main.c | 8 +- include/linux/ethtool.h | 73 + net/ethtool/mm.c | 278 +- 7 files changed, 394 insertions(+), 202 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac.h b/drivers/net/ethernet/stmicro/stmmac/stmmac.h index f05cae103d83..c9cc41af258a 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac.h +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac.h @@ -147,21 +147,9 @@ struct stmmac_channel { }; struct stmmac_fpe_cfg { - /* Serialize access to MAC Merge state between ethtool requests -* and link state updates. 
-*/ - spinlock_t lock; - + struct ethtool_mmsv mmsv; const struct stmmac_fpe_reg *reg; - u32 fpe_csr;/* MAC_FPE_CTRL_STS reg cache */ - - enum ethtool_mm_verify_status status; - struct timer_list verify_timer; - bool verify_enabled; - int verify_retries; - bool pmac_enabled; - u32 verify_time; - bool tx_enabled; + u32 fpe_csr;/* MAC_FPE_CTRL_STS reg cache */ }; struct stmmac_tc_entry { diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c index cfe5aea24549..44ee73569cb1 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c @@ -1210,7 +1210,6 @@ static int stmmac_get_mm(struct net_device *ndev, struct ethtool_mm_state *state) { struct stmmac_priv *priv = netdev_priv(ndev); - unsigned long flags; u32 frag_size; if (!stmmac_fpe_supported(priv)) @@ -1220,26 +1219,7 @@ static int stmmac_get_mm(struct net_device *ndev, frag_size = stmmac_fpe_get_add_frag_size(priv); state->tx_min_frag_size = ethtool_mm_frag_size_add_to_min(frag_size); - spin_lock_irqsave(&priv->fpe_cfg.lock, flags); - - state->max_verify_time = STMMAC_FPE_MM_MAX_VERIFY_TIME_MS; - state->verify_enabled = priv->fpe_cfg.verify_enabled; - state->pmac_enabled = priv->fpe_cfg.pmac_enabled; - state->verify_time = priv->fpe_cfg.verify_time; - state->tx_enabled = priv->fpe_cfg.tx_enabled; - state->verify_status = priv->fpe_cfg.status; - - /* FPE active if common tx_enabled and -* (verification success or disabled(forced)) -*/ - if (state->tx_enabled && - (state->verify_status == ETHTOOL_MM_VERIFY_STATUS_SUCCEEDED || -state->verify_status == ETHTOOL_MM_VERIFY_STATUS_DISABLED)) - state->tx_active = true; - else - state->tx_active = false; - - spin_unlock_irqrestore(&priv->fpe_cfg.lock, flags); + ethtool_mmsv_get_mm(&priv->fpe_cfg.mmsv, state); return 0; } @@ -1248,8 +1228,6 @@ static int stmmac_set_mm(struct net_device *ndev, struct ethtool_mm_cfg *cfg, struct netlink_ext_ack 
*extack) { struct stmmac_priv *priv = netdev_priv(ndev); - struct stmmac_fpe_cfg *fpe_cfg = &priv->fpe_cfg; - unsigned long flags; u32 frag_size; int err; @@ -1259,23 +1237,7 @@ static int stmmac_set_mm(struct net_device *ndev, struct ethtool_mm_cfg *cfg, return err; stmmac_fpe_set_a
[Intel-wired-lan] [PATCH iwl-next v8 09/11] igc: block setting preemptible traffic class in taprio
Since preemptible tc implementation is not ready yet, block it from being set in taprio. The existing code already blocks it in mqprio. Signed-off-by: Faizal Rahim --- drivers/net/ethernet/intel/igc/igc_main.c | 4 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index a9f40fffc4fd..6db926bc9d11 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -6407,6 +6407,10 @@ static int igc_save_qbv_schedule(struct igc_adapter *adapter, if (!validate_schedule(adapter, qopt)) return -EINVAL; + /* preemptible isn't supported yet */ + if (qopt->mqprio.preemptible_tcs) + return -EOPNOTSUPP; + igc_ptp_read(adapter, &now); if (igc_tsn_is_taprio_activated_by_user(adapter) && -- 2.34.1
[Intel-wired-lan] [PATCH iwl-next v8 05/11] igc: optimize the TX packet buffer utilization
Packet buffers (RX + TX) total 64KB. Neither the RX nor the TX buffer can be larger than 34KB. So divide the buffer equally, 32KB for each. Co-developed-by: Vinicius Costa Gomes Signed-off-by: Vinicius Costa Gomes Signed-off-by: Faizal Rahim --- drivers/net/ethernet/intel/igc/igc_defines.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/igc/igc_defines.h b/drivers/net/ethernet/intel/igc/igc_defines.h index 8e449904aa7d..516ef70c98e9 100644 --- a/drivers/net/ethernet/intel/igc/igc_defines.h +++ b/drivers/net/ethernet/intel/igc/igc_defines.h @@ -400,7 +400,8 @@ #define I225_TXPBSIZE_DEFAULT 0x0414 /* TXPBSIZE default */ #define IGC_RXPBS_CFG_TS_EN 0x8000 /* Timestamp in Rx buffer */ -#define IGC_TXPBSIZE_TSN 0x04145145 /* 5k bytes buffer for each queue */ + /* 7KB bytes buffer for each tx queue (total 4 queues) + 4KB for BMC*/ +#define IGC_TXPBSIZE_TSN 0x041c71c7 #define IGC_DTXMXPKTSZ_TSN 0x19 /* 1600 bytes of max TX DMA packet size */ #define IGC_DTXMXPKTSZ_DEFAULT 0x98 /* 9728-byte Jumbo frames */ -- 2.34.1
[Intel-wired-lan] [PATCH iwl-next v8 08/11] igc: add support to set tx-min-frag-size
Add support to set tx-min-frag-size via set_mm callback in igc. Increase the max limit of tx-min-frag-size in ethtool from 252 to 256 since i225/6 value range is 64, 128, 192 and 256. Co-developed-by: Vinicius Costa Gomes Signed-off-by: Vinicius Costa Gomes Signed-off-by: Faizal Rahim --- drivers/net/ethernet/intel/igc/igc.h | 1 + drivers/net/ethernet/intel/igc/igc_defines.h | 1 + drivers/net/ethernet/intel/igc/igc_ethtool.c | 5 +++ drivers/net/ethernet/intel/igc/igc_tsn.c | 37 ++-- drivers/net/ethernet/intel/igc/igc_tsn.h | 2 +- net/ethtool/mm.c | 2 +- 6 files changed, 43 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc.h b/drivers/net/ethernet/intel/igc/igc.h index d9ecb7cf80c9..4dfd133b4d6f 100644 --- a/drivers/net/ethernet/intel/igc/igc.h +++ b/drivers/net/ethernet/intel/igc/igc.h @@ -42,6 +42,7 @@ void igc_ethtool_set_ops(struct net_device *); struct igc_fpe_t { struct ethtool_mmsv mmsv; + u32 tx_min_frag_size; }; enum igc_mac_filter_type { diff --git a/drivers/net/ethernet/intel/igc/igc_defines.h b/drivers/net/ethernet/intel/igc/igc_defines.h index 22db1de02964..038ee89f1e08 100644 --- a/drivers/net/ethernet/intel/igc/igc_defines.h +++ b/drivers/net/ethernet/intel/igc/igc_defines.h @@ -551,6 +551,7 @@ #define IGC_TQAVCTRL_PREEMPT_ENA 0x0002 #define IGC_TQAVCTRL_ENHANCED_QAV 0x0008 #define IGC_TQAVCTRL_FUTSCDDIS 0x0080 +#define IGC_TQAVCTRL_MIN_FRAG_MASK 0xC000 #define IGC_TXQCTL_QUEUE_MODE_LAUNCHT 0x0001 #define IGC_TXQCTL_STRICT_CYCLE 0x0002 diff --git a/drivers/net/ethernet/intel/igc/igc_ethtool.c b/drivers/net/ethernet/intel/igc/igc_ethtool.c index b64d5c6c1d20..529654ccd83f 100644 --- a/drivers/net/ethernet/intel/igc/igc_ethtool.c +++ b/drivers/net/ethernet/intel/igc/igc_ethtool.c @@ -1789,6 +1789,11 @@ static int igc_ethtool_set_mm(struct net_device *netdev, struct igc_adapter *adapter = netdev_priv(netdev); struct igc_fpe_t *fpe = &adapter->fpe; + fpe->tx_min_frag_size = 
igc_fpe_get_supported_frag_size(cmd->tx_min_frag_size); + if (fpe->tx_min_frag_size != cmd->tx_min_frag_size) + NL_SET_ERR_MSG_MOD(extack, + "tx-min-frag-size value set is unsupported. Rounded up to supported value (64, 128, 192, 256)"); + if (fpe->mmsv.pmac_enabled != cmd->pmac_enabled) { if (cmd->pmac_enabled) static_branch_inc(&igc_fpe_enabled); diff --git a/drivers/net/ethernet/intel/igc/igc_tsn.c b/drivers/net/ethernet/intel/igc/igc_tsn.c index 0a2c747fde2d..2ec5909bf8b0 100644 --- a/drivers/net/ethernet/intel/igc/igc_tsn.c +++ b/drivers/net/ethernet/intel/igc/igc_tsn.c @@ -6,6 +6,12 @@ #include "igc_hw.h" #include "igc_tsn.h" +#define MIN_MULTPLIER_TX_MIN_FRAG 0 +#define MAX_MULTPLIER_TX_MIN_FRAG 3 +/* Frag size is based on the Section 8.12.2 of the SW User Manual */ +#define TX_MIN_FRAG_SIZE 64 +#define TX_MAX_FRAG_SIZE (TX_MIN_FRAG_SIZE * (MAX_MULTPLIER_TX_MIN_FRAG + 1)) + DEFINE_STATIC_KEY_FALSE(igc_fpe_enabled); static int igc_fpe_init_smd_frame(struct igc_ring *ring, @@ -128,6 +134,7 @@ static const struct ethtool_mmsv_ops igc_mmsv_ops = { void igc_fpe_init(struct igc_adapter *adapter) { + adapter->fpe.tx_min_frag_size = TX_MIN_FRAG_SIZE; ethtool_mmsv_init(&adapter->fpe.mmsv, adapter->netdev, &igc_mmsv_ops); } @@ -278,7 +285,7 @@ static int igc_tsn_disable_offload(struct igc_adapter *adapter) tqavctrl = rd32(IGC_TQAVCTRL); tqavctrl &= ~(IGC_TQAVCTRL_TRANSMIT_MODE_TSN | IGC_TQAVCTRL_ENHANCED_QAV | IGC_TQAVCTRL_FUTSCDDIS | - IGC_TQAVCTRL_PREEMPT_ENA); + IGC_TQAVCTRL_PREEMPT_ENA | IGC_TQAVCTRL_MIN_FRAG_MASK); wr32(IGC_TQAVCTRL, tqavctrl); @@ -324,12 +331,34 @@ static void igc_tsn_set_retx_qbvfullthreshold(struct igc_adapter *adapter) wr32(IGC_RETX_CTL, retxctl); } +static u8 igc_fpe_get_frag_size_mult(const struct igc_fpe_t *fpe) +{ + u8 mult = (fpe->tx_min_frag_size / TX_MIN_FRAG_SIZE) - 1; + + return clamp_t(u8, mult, MIN_MULTPLIER_TX_MIN_FRAG, + MAX_MULTPLIER_TX_MIN_FRAG); +} + +u32 igc_fpe_get_supported_frag_size(u32 frag_size) +{ + const u32 
supported_sizes[] = {64, 128, 192, 256}; + + /* Find the smallest supported size that is >= frag_size */ + for (int i = 0; i < ARRAY_SIZE(supported_sizes); i++) { + if (frag_size <= supported_sizes[i]) + return supported_sizes[i]; + } + + return TX_MAX_FRAG_SIZE; /* Should not happen, value > 256 is blocked by ethtool */ +} + static int igc_tsn_enable_offload(struct igc_adapter *adapter) { struct igc_hw *hw = &adapter->hw; u32 tqavctrl, baset_l, baset_h; u32 sec, nsec, cy
[Intel-wired-lan] [PATCH iwl-next v8 04/11] igc: rename xdp_get_tx_ring() for non-xdp usage
Rename the xdp_get_tx_ring() function to a more generic name for use in upcoming frame preemption patches.

Signed-off-by: Faizal Rahim
---
 drivers/net/ethernet/intel/igc/igc.h      | 2 +-
 drivers/net/ethernet/intel/igc/igc_main.c | 9 ++++-----
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/intel/igc/igc.h b/drivers/net/ethernet/intel/igc/igc.h
index b8111ad9a9a8..22ecdac26cf4 100644
--- a/drivers/net/ethernet/intel/igc/igc.h
+++ b/drivers/net/ethernet/intel/igc/igc.h
@@ -736,7 +736,7 @@ struct igc_nfc_rule *igc_get_nfc_rule(struct igc_adapter *adapter,
 				      u32 location);
 int igc_add_nfc_rule(struct igc_adapter *adapter, struct igc_nfc_rule *rule);
 void igc_del_nfc_rule(struct igc_adapter *adapter, struct igc_nfc_rule *rule);
-
+struct igc_ring *igc_get_tx_ring(struct igc_adapter *adapter, int cpu);
 void igc_ptp_init(struct igc_adapter *adapter);
 void igc_ptp_reset(struct igc_adapter *adapter);
 void igc_ptp_suspend(struct igc_adapter *adapter);
diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c
index 56a35d58e7a6..db4a36afcec6 100644
--- a/drivers/net/ethernet/intel/igc/igc_main.c
+++ b/drivers/net/ethernet/intel/igc/igc_main.c
@@ -2444,8 +2444,7 @@ static int igc_xdp_init_tx_descriptor(struct igc_ring *ring,
 	return -ENOMEM;
 }
 
-static struct igc_ring *igc_xdp_get_tx_ring(struct igc_adapter *adapter,
-					    int cpu)
+struct igc_ring *igc_get_tx_ring(struct igc_adapter *adapter, int cpu)
 {
 	int index = cpu;
 
@@ -2469,7 +2468,7 @@ static int igc_xdp_xmit_back(struct igc_adapter *adapter, struct xdp_buff *xdp)
 	if (unlikely(!xdpf))
 		return -EFAULT;
 
-	ring = igc_xdp_get_tx_ring(adapter, cpu);
+	ring = igc_get_tx_ring(adapter, cpu);
 	nq = txring_txq(ring);
 
 	__netif_tx_lock(nq, cpu);
@@ -2546,7 +2545,7 @@ static void igc_finalize_xdp(struct igc_adapter *adapter, int status)
 	struct igc_ring *ring;
 
 	if (status & IGC_XDP_TX) {
-		ring = igc_xdp_get_tx_ring(adapter, cpu);
+		ring = igc_get_tx_ring(adapter, cpu);
 		nq = txring_txq(ring);
 
 		__netif_tx_lock(nq, cpu);
@@ -6699,7 +6698,7 @@ static int igc_xdp_xmit(struct net_device *dev, int num_frames,
 	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
 		return -EINVAL;
 
-	ring = igc_xdp_get_tx_ring(adapter, cpu);
+	ring = igc_get_tx_ring(adapter, cpu);
 	nq = txring_txq(ring);
 
 	__netif_tx_lock(nq, cpu);
-- 
2.34.1
[Intel-wired-lan] [PATCH iwl-next v8 03/11] net: ethtool: mm: reset verification status when link is down
When the link partner goes down, "ethtool --show-mm" still displays "Verification status: SUCCEEDED," reflecting a previous state that is no longer valid. Reset the verification status to ensure it reflects the current state.

Signed-off-by: Faizal Rahim
---
 net/ethtool/mm.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/net/ethtool/mm.c b/net/ethtool/mm.c
index aa43df2ecac0..ad9b40034003 100644
--- a/net/ethtool/mm.c
+++ b/net/ethtool/mm.c
@@ -415,8 +415,9 @@ void ethtool_mmsv_link_state_handle(struct ethtool_mmsv *mmsv, bool up)
 		/* New link => maybe new partner => new verification process */
 		ethtool_mmsv_apply(mmsv);
 	} else {
-		mmsv->status = ETHTOOL_MM_VERIFY_STATUS_INITIAL;
-		mmsv->verify_retries = ETHTOOL_MM_MAX_VERIFY_RETRIES;
+		/* Reset the reported verification state while the link is down */
+		if (mmsv->verify_enabled)
+			mmsv->status = ETHTOOL_MM_VERIFY_STATUS_INITIAL;
 
 		/* No link or pMAC not enabled */
 		ethtool_mmsv_configure_pmac(mmsv, false);
-- 
2.34.1
[Intel-wired-lan] [PATCH net-next 03/16] libeth: add a couple of XDP helpers (libeth_xdp)
"Couple" is a bit humbly... Add the following functionality to libeth: * XDP shared queues managing * XDP_TX bulk sending infra * .ndo_xdp_xmit() infra * adding buffers to &xdp_buff * running XDP prog and managing its verdict * completing XDP Tx buffers Suggested-by: Maciej Fijalkowski # lots of stuff Signed-off-by: Alexander Lobakin --- drivers/net/ethernet/intel/libeth/Kconfig | 10 +- drivers/net/ethernet/intel/libeth/Makefile |7 +- include/net/libeth/types.h | 106 +- drivers/net/ethernet/intel/libeth/priv.h | 26 + include/net/libeth/tx.h| 30 +- include/net/libeth/xdp.h | 1827 drivers/net/ethernet/intel/libeth/tx.c | 38 + drivers/net/ethernet/intel/libeth/xdp.c| 431 + 8 files changed, 2467 insertions(+), 8 deletions(-) create mode 100644 drivers/net/ethernet/intel/libeth/priv.h create mode 100644 include/net/libeth/xdp.h create mode 100644 drivers/net/ethernet/intel/libeth/tx.c create mode 100644 drivers/net/ethernet/intel/libeth/xdp.c diff --git a/drivers/net/ethernet/intel/libeth/Kconfig b/drivers/net/ethernet/intel/libeth/Kconfig index 480293b71dbc..d8c4926574fb 100644 --- a/drivers/net/ethernet/intel/libeth/Kconfig +++ b/drivers/net/ethernet/intel/libeth/Kconfig @@ -1,9 +1,15 @@ # SPDX-License-Identifier: GPL-2.0-only -# Copyright (C) 2024 Intel Corporation +# Copyright (C) 2024-2025 Intel Corporation config LIBETH - tristate + tristate "Common Ethernet library (libeth)" if COMPILE_TEST select PAGE_POOL help libeth is a common library containing routines shared between several drivers, but not yet promoted to the generic kernel API. + +config LIBETH_XDP + tristate "Common XDP library (libeth_xdp)" if COMPILE_TEST + select LIBETH + help + XDP helpers based on libeth hotpath management. 
diff --git a/drivers/net/ethernet/intel/libeth/Makefile b/drivers/net/ethernet/intel/libeth/Makefile index 52492b081132..51669840ee06 100644 --- a/drivers/net/ethernet/intel/libeth/Makefile +++ b/drivers/net/ethernet/intel/libeth/Makefile @@ -1,6 +1,11 @@ # SPDX-License-Identifier: GPL-2.0-only -# Copyright (C) 2024 Intel Corporation +# Copyright (C) 2024-2025 Intel Corporation obj-$(CONFIG_LIBETH) += libeth.o libeth-y := rx.o +libeth-y += tx.o + +obj-$(CONFIG_LIBETH_XDP) += libeth_xdp.o + +libeth_xdp-y += xdp.o diff --git a/include/net/libeth/types.h b/include/net/libeth/types.h index 603825e45133..cf1d78a9dc38 100644 --- a/include/net/libeth/types.h +++ b/include/net/libeth/types.h @@ -1,10 +1,32 @@ /* SPDX-License-Identifier: GPL-2.0-only */ -/* Copyright (C) 2024 Intel Corporation */ +/* Copyright (C) 2024-2025 Intel Corporation */ #ifndef __LIBETH_TYPES_H #define __LIBETH_TYPES_H -#include +#include + +/* Stats */ + +/** + * struct libeth_rq_napi_stats - "hot" counters to update in Rx polling loop + * @packets: received frames counter + * @bytes: sum of bytes of received frames above + * @fragments: sum of fragments of received S/G frames + * @hsplit: number of frames the device performed the header split for + * @raw: alias to access all the fields as an array + */ +struct libeth_rq_napi_stats { + union { + struct { + u32 packets; + u32 bytes; + u32 fragments; + u32 hsplit; + }; + DECLARE_FLEX_ARRAY(u32, raw); + }; +}; /** * struct libeth_sq_napi_stats - "hot" counters to update in Tx completion loop @@ -22,4 +44,84 @@ struct libeth_sq_napi_stats { }; }; +/** + * struct libeth_xdpsq_napi_stats - "hot" counters to update in XDP Tx + * completion loop + * @packets: completed frames counter + * @bytes: sum of bytes of completed frames above + * @fragments: sum of fragments of completed S/G frames + * @raw: alias to access all the fields as an array + */ +struct libeth_xdpsq_napi_stats { + union { + struct { + u32 packets; + u32 bytes; + u32 fragments; + }; + 
DECLARE_FLEX_ARRAY(u32, raw); + }; +}; + +/* XDP */ + +/* + * The following structures should be embedded into driver's queue structure + * and passed to the libeth_xdp helpers, never used directly. + */ + +/* XDPSQ sharing */ + +/** + * struct libeth_xdpsq_lock - locking primitive for sharing XDPSQs + * @lock: spinlock for locking the queue + * @share: whether this particular queue is shared + */ +struct libeth_xdpsq_lock { + spinlock_t
[Intel-wired-lan] [PATCH net-next 00/16] idpf: add XDP support
Add XDP support (w/o XSk yet) to the idpf driver using the libeth_xdp sublib, which will be then reused in at least iavf and ice. In general, nothing outstanding comparing to ice, except performance -- let's say, up to 2x for .ndo_xdp_xmit() on certain platforms and scenarios. libeth_xdp doesn't reinvent the wheel, mostly just accumulates and optimizes what was already done before to stop copying that wheel and the bugs over and over again. idpf doesn't support VLAN Rx offload, so only the hash hint is present for now. Alexander Lobakin (12): libeth: convert to netmem libeth: support native XDP and register memory model libeth: add a couple of XDP helpers (libeth_xdp) libeth: add XSk helpers idpf: fix Rx descriptor ready check barrier in splitq idpf: a use saner limit for default number of queues to allocate idpf: link NAPIs to queues idpf: add support for nointerrupt queues idpf: use generic functions to build xdp_buff and skb idpf: add support for XDP on Rx idpf: add support for .ndo_xdp_xmit() idpf: add XDP RSS hash hint Michal Kubiak (4): idpf: make complq cleaning dependent on scheduling mode idpf: remove SW marker handling from NAPI idpf: prepare structures to support XDP idpf: implement XDP_SETUP_PROG in ndo_bpf for splitq drivers/net/ethernet/intel/idpf/Kconfig |2 +- drivers/net/ethernet/intel/libeth/Kconfig | 10 +- drivers/net/ethernet/intel/idpf/Makefile |2 + drivers/net/ethernet/intel/libeth/Makefile|8 +- include/net/libeth/types.h| 106 +- drivers/net/ethernet/intel/idpf/idpf.h| 35 +- .../net/ethernet/intel/idpf/idpf_lan_txrx.h |6 +- drivers/net/ethernet/intel/idpf/idpf_txrx.h | 126 +- drivers/net/ethernet/intel/idpf/xdp.h | 180 ++ drivers/net/ethernet/intel/libeth/priv.h | 37 + include/net/libeth/rx.h | 28 +- include/net/libeth/tx.h | 36 +- include/net/libeth/xdp.h | 1869 + include/net/libeth/xsk.h | 685 ++ drivers/net/ethernet/intel/iavf/iavf_txrx.c | 14 +- drivers/net/ethernet/intel/idpf/idpf_dev.c| 11 +- .../net/ethernet/intel/idpf/idpf_ethtool.c|6 
+- drivers/net/ethernet/intel/idpf/idpf_lib.c| 29 +- drivers/net/ethernet/intel/idpf/idpf_main.c |1 + .../ethernet/intel/idpf/idpf_singleq_txrx.c | 111 +- drivers/net/ethernet/intel/idpf/idpf_txrx.c | 678 +++--- drivers/net/ethernet/intel/idpf/idpf_vf_dev.c | 11 +- .../net/ethernet/intel/idpf/idpf_virtchnl.c | 113 +- drivers/net/ethernet/intel/idpf/xdp.c | 509 + drivers/net/ethernet/intel/libeth/rx.c| 40 +- drivers/net/ethernet/intel/libeth/tx.c| 41 + drivers/net/ethernet/intel/libeth/xdp.c | 449 drivers/net/ethernet/intel/libeth/xsk.c | 269 +++ 28 files changed, 4925 insertions(+), 487 deletions(-) create mode 100644 drivers/net/ethernet/intel/idpf/xdp.h create mode 100644 drivers/net/ethernet/intel/libeth/priv.h create mode 100644 include/net/libeth/xdp.h create mode 100644 include/net/libeth/xsk.h create mode 100644 drivers/net/ethernet/intel/idpf/xdp.c create mode 100644 drivers/net/ethernet/intel/libeth/tx.c create mode 100644 drivers/net/ethernet/intel/libeth/xdp.c create mode 100644 drivers/net/ethernet/intel/libeth/xsk.c --- Sending in one batch to introduce/show both the lib and the user. Let me know if I'd better split. -- 2.48.1
[Intel-wired-lan] [PATCH net-next 04/16] libeth: add XSk helpers
Add the following counterparts of functions from libeth_xdp which need special care on XSk path: * building &xdp_buff (head and frags); * running XDP prog and managing all possible verdicts; * xmit (with S/G and metadata support); * wakeup via CSD/IPI; * FQ init/deinit and refilling. Xmit by default unrolls loops by 8 when filling Tx DMA descriptors. XDP_REDIRECT verdict is considered default/likely(). Rx frags are considered unlikely(). It is assumed that Tx/completion queues are not mapped to any interrupts, thus we clean them only when needed (=> 3/4 of descriptors is busy) and keep need_wakeup set. IPI for XSk wakeup showed better performance than triggering an SW NIC interrupt, though it doesn't respect NIC's interrupt affinity. Suggested-by: Maciej Fijalkowski # lots of stuff Signed-off-by: Alexander Lobakin --- drivers/net/ethernet/intel/libeth/Kconfig | 2 +- drivers/net/ethernet/intel/libeth/Makefile | 1 + drivers/net/ethernet/intel/libeth/priv.h | 11 + include/net/libeth/tx.h| 10 +- include/net/libeth/xdp.h | 90 ++- include/net/libeth/xsk.h | 685 + drivers/net/ethernet/intel/libeth/tx.c | 5 +- drivers/net/ethernet/intel/libeth/xdp.c| 26 +- drivers/net/ethernet/intel/libeth/xsk.c| 269 9 files changed, 1067 insertions(+), 32 deletions(-) create mode 100644 include/net/libeth/xsk.h create mode 100644 drivers/net/ethernet/intel/libeth/xsk.c diff --git a/drivers/net/ethernet/intel/libeth/Kconfig b/drivers/net/ethernet/intel/libeth/Kconfig index d8c4926574fb..2445b979c499 100644 --- a/drivers/net/ethernet/intel/libeth/Kconfig +++ b/drivers/net/ethernet/intel/libeth/Kconfig @@ -12,4 +12,4 @@ config LIBETH_XDP tristate "Common XDP library (libeth_xdp)" if COMPILE_TEST select LIBETH help - XDP helpers based on libeth hotpath management. + XDP and XSk helpers based on libeth hotpath management. 
diff --git a/drivers/net/ethernet/intel/libeth/Makefile b/drivers/net/ethernet/intel/libeth/Makefile index 51669840ee06..350bc0b38bad 100644 --- a/drivers/net/ethernet/intel/libeth/Makefile +++ b/drivers/net/ethernet/intel/libeth/Makefile @@ -9,3 +9,4 @@ libeth-y+= tx.o obj-$(CONFIG_LIBETH_XDP) += libeth_xdp.o libeth_xdp-y += xdp.o +libeth_xdp-y += xsk.o diff --git a/drivers/net/ethernet/intel/libeth/priv.h b/drivers/net/ethernet/intel/libeth/priv.h index 1bd6e2d7a3e7..9b811d31015c 100644 --- a/drivers/net/ethernet/intel/libeth/priv.h +++ b/drivers/net/ethernet/intel/libeth/priv.h @@ -8,12 +8,23 @@ /* XDP */ +enum xdp_action; +struct libeth_xdp_buff; +struct libeth_xdp_tx_frame; struct skb_shared_info; struct xdp_frame_bulk; +extern const struct xsk_tx_metadata_ops libeth_xsktmo_slow; + +void libeth_xsk_tx_return_bulk(const struct libeth_xdp_tx_frame *bq, + u32 count); +u32 libeth_xsk_prog_exception(struct libeth_xdp_buff *xdp, enum xdp_action act, + int ret); + struct libeth_xdp_ops { void(*bulk)(const struct skb_shared_info *sinfo, struct xdp_frame_bulk *bq, bool frags); + void(*xsk)(struct libeth_xdp_buff *xdp); }; void libeth_attach_xdp(const struct libeth_xdp_ops *ops); diff --git a/include/net/libeth/tx.h b/include/net/libeth/tx.h index c3459917330e..c3db5c6f1641 100644 --- a/include/net/libeth/tx.h +++ b/include/net/libeth/tx.h @@ -12,7 +12,7 @@ /** * enum libeth_sqe_type - type of &libeth_sqe to act on Tx completion - * @LIBETH_SQE_EMPTY: unused/empty OR XDP_TX, no action required + * @LIBETH_SQE_EMPTY: unused/empty OR XDP_TX/XSk frame, no action required * @LIBETH_SQE_CTX: context descriptor with empty SQE, no action required * @LIBETH_SQE_SLAB: kmalloc-allocated buffer, unmap and kfree() * @LIBETH_SQE_FRAG: mapped skb frag, only unmap DMA @@ -21,6 +21,8 @@ * @LIBETH_SQE_XDP_TX: &skb_shared_info, libeth_xdp_return_buff_bulk(), stats * @LIBETH_SQE_XDP_XMIT: &xdp_frame, unmap and xdp_return_frame_bulk(), stats * @LIBETH_SQE_XDP_XMIT_FRAG: &xdp_frame frag, 
only unmap DMA + * @LIBETH_SQE_XSK_TX: &libeth_xdp_buff on XSk queue, xsk_buff_free(), stats + * @LIBETH_SQE_XSK_TX_FRAG: &libeth_xdp_buff frag on XSk queue, xsk_buff_free() */ enum libeth_sqe_type { LIBETH_SQE_EMPTY= 0U, @@ -33,6 +35,8 @@ enum libeth_sqe_type { LIBETH_SQE_XDP_TX = __LIBETH_SQE_XDP_START, LIBETH_SQE_XDP_XMIT, LIBETH_SQE_XDP_XMIT_FRAG, + LIBETH_SQE_XSK_TX, + LIBETH_SQE_XSK_TX_FRAG, }; /** @@ -43,6 +47,7 @@ enum libeth_sqe_type { * @skb: &sk_buff to consume * @sinfo: skb shared info of an XDP_TX frame * @xdpf: XDP frame from ::ndo_xdp_xmit() + * @xsk: XSk Rx frame from XDP_TX action * @dma: DMA address to unmap * @len: length of the mapped region to unmap * @nr_frags: number of frags i
[Intel-wired-lan] [PATCH net-next 01/16] libeth: convert to netmem
Back when the libeth Rx core was initially written, devmem was a draft and netmem_ref didn't exist in the mainline. Now that it's here, make libeth MP-agnostic before introducing any new code or any new library users. When it's known that the created PP/FQ is for header buffers, use faster "unsafe" underscored netmem <--> virt accessors as netmem_is_net_iov() is always false in that case, but consumes some cycles (bit test + true branch). Misc: replace explicit EXPORT_SYMBOL_NS_GPL("NS") with DEFAULT_SYMBOL_NAMESPACE. Signed-off-by: Alexander Lobakin --- include/net/libeth/rx.h | 22 +++-- drivers/net/ethernet/intel/iavf/iavf_txrx.c | 14 .../ethernet/intel/idpf/idpf_singleq_txrx.c | 2 +- drivers/net/ethernet/intel/idpf/idpf_txrx.c | 33 +++ drivers/net/ethernet/intel/libeth/rx.c| 20 ++- 5 files changed, 51 insertions(+), 40 deletions(-) diff --git a/include/net/libeth/rx.h b/include/net/libeth/rx.h index ab05024be518..7d5dc58984b1 100644 --- a/include/net/libeth/rx.h +++ b/include/net/libeth/rx.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-only */ -/* Copyright (C) 2024 Intel Corporation */ +/* Copyright (C) 2024-2025 Intel Corporation */ #ifndef __LIBETH_RX_H #define __LIBETH_RX_H @@ -31,7 +31,7 @@ /** * struct libeth_fqe - structure representing an Rx buffer (fill queue element) - * @page: page holding the buffer + * @netmem: network memory reference holding the buffer * @offset: offset from the page start (to the headroom) * @truesize: total space occupied by the buffer (w/ headroom and tailroom) * @@ -40,7 +40,7 @@ * former, @offset is always 0 and @truesize is always ```PAGE_SIZE```. 
*/ struct libeth_fqe { - struct page *page; + netmem_ref netmem; u32 offset; u32 truesize; } __aligned_largest; @@ -102,15 +102,16 @@ static inline dma_addr_t libeth_rx_alloc(const struct libeth_fq_fp *fq, u32 i) struct libeth_fqe *buf = &fq->fqes[i]; buf->truesize = fq->truesize; - buf->page = page_pool_dev_alloc(fq->pp, &buf->offset, &buf->truesize); - if (unlikely(!buf->page)) + buf->netmem = page_pool_dev_alloc_netmem(fq->pp, &buf->offset, +&buf->truesize); + if (unlikely(!buf->netmem)) return DMA_MAPPING_ERROR; - return page_pool_get_dma_addr(buf->page) + buf->offset + + return page_pool_get_dma_addr_netmem(buf->netmem) + buf->offset + fq->pp->p.offset; } -void libeth_rx_recycle_slow(struct page *page); +void libeth_rx_recycle_slow(netmem_ref netmem); /** * libeth_rx_sync_for_cpu - synchronize or recycle buffer post DMA @@ -126,18 +127,19 @@ void libeth_rx_recycle_slow(struct page *page); static inline bool libeth_rx_sync_for_cpu(const struct libeth_fqe *fqe, u32 len) { - struct page *page = fqe->page; + netmem_ref netmem = fqe->netmem; /* Very rare, but possible case. The most common reason: * the last fragment contained FCS only, which was then * stripped by the HW. 
*/ if (unlikely(!len)) { - libeth_rx_recycle_slow(page); + libeth_rx_recycle_slow(netmem); return false; } - page_pool_dma_sync_for_cpu(page->pp, page, fqe->offset, len); + page_pool_dma_sync_netmem_for_cpu(netmem_get_pp(netmem), netmem, + fqe->offset, len); return true; } diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.c b/drivers/net/ethernet/intel/iavf/iavf_txrx.c index 422312b8b54a..35d353d38129 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.c +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.c @@ -723,7 +723,7 @@ static void iavf_clean_rx_ring(struct iavf_ring *rx_ring) for (u32 i = rx_ring->next_to_clean; i != rx_ring->next_to_use; ) { const struct libeth_fqe *rx_fqes = &rx_ring->rx_fqes[i]; - page_pool_put_full_page(rx_ring->pp, rx_fqes->page, false); + libeth_rx_recycle_slow(rx_fqes->netmem); if (unlikely(++i == rx_ring->count)) i = 0; @@ -1197,10 +1197,11 @@ static void iavf_add_rx_frag(struct sk_buff *skb, const struct libeth_fqe *rx_buffer, unsigned int size) { - u32 hr = rx_buffer->page->pp->p.offset; + u32 hr = netmem_get_pp(rx_buffer->netmem)->p.offset; - skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, rx_buffer->page, - rx_buffer->offset + hr, size, rx_buffer->truesize); + skb_add_rx_frag_netmem(skb, skb_shinfo(skb)->nr_frags, + rx_buffer->netmem, rx_buffer->offset + hr, + size, rx_buffer->truesize); } /**
[Intel-wired-lan] [PATCH net-next 02/16] libeth: support native XDP and register memory model
Expand libeth's Page Pool functionality by adding native XDP support. This means picking the appropriate headroom and DMA direction. Also, register all the created &page_pools as XDP memory models. A driver then can call xdp_rxq_info_attach_page_pool() when registering its RxQ info. Signed-off-by: Alexander Lobakin --- include/net/libeth/rx.h| 6 +- drivers/net/ethernet/intel/libeth/rx.c | 20 +++- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/include/net/libeth/rx.h b/include/net/libeth/rx.h index 7d5dc58984b1..5d991404845e 100644 --- a/include/net/libeth/rx.h +++ b/include/net/libeth/rx.h @@ -13,8 +13,10 @@ /* Space reserved in front of each frame */ #define LIBETH_SKB_HEADROOM(NET_SKB_PAD + NET_IP_ALIGN) +#define LIBETH_XDP_HEADROOM(ALIGN(XDP_PACKET_HEADROOM, NET_SKB_PAD) + \ +NET_IP_ALIGN) /* Maximum headroom for worst-case calculations */ -#define LIBETH_MAX_HEADROOMLIBETH_SKB_HEADROOM +#define LIBETH_MAX_HEADROOMLIBETH_XDP_HEADROOM /* Link layer / L2 overhead: Ethernet, 2 VLAN tags (C + S), FCS */ #define LIBETH_RX_LL_LEN (ETH_HLEN + 2 * VLAN_HLEN + ETH_FCS_LEN) /* Maximum supported L2-L4 header length */ @@ -66,6 +68,7 @@ enum libeth_fqe_type { * @count: number of descriptors/buffers the queue has * @type: type of the buffers this queue has * @hsplit: flag whether header split is enabled + * @xdp: flag indicating whether XDP is enabled * @buf_len: HW-writeable length per each buffer * @nid: ID of the closest NUMA node with memory */ @@ -81,6 +84,7 @@ struct libeth_fq { /* Cold fields */ enum libeth_fqe_typetype:2; boolhsplit:1; + boolxdp:1; u32 buf_len; int nid; diff --git a/drivers/net/ethernet/intel/libeth/rx.c b/drivers/net/ethernet/intel/libeth/rx.c index aa5d878181f7..c0be9cb043a1 100644 --- a/drivers/net/ethernet/intel/libeth/rx.c +++ b/drivers/net/ethernet/intel/libeth/rx.c @@ -70,7 +70,7 @@ static u32 libeth_rx_hw_len_truesize(const struct page_pool_params *pp, static bool libeth_rx_page_pool_params(struct libeth_fq *fq, struct 
page_pool_params *pp) { - pp->offset = LIBETH_SKB_HEADROOM; + pp->offset = fq->xdp ? LIBETH_XDP_HEADROOM : LIBETH_SKB_HEADROOM; /* HW-writeable / syncable length per one page */ pp->max_len = LIBETH_RX_PAGE_LEN(pp->offset); @@ -157,11 +157,12 @@ int libeth_rx_fq_create(struct libeth_fq *fq, struct napi_struct *napi) .dev= napi->dev->dev.parent, .netdev = napi->dev, .napi = napi, - .dma_dir= DMA_FROM_DEVICE, }; struct libeth_fqe *fqes; struct page_pool *pool; - bool ret; + int ret; + + pp.dma_dir = fq->xdp ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE; if (!fq->hsplit) ret = libeth_rx_page_pool_params(fq, &pp); @@ -175,18 +176,26 @@ int libeth_rx_fq_create(struct libeth_fq *fq, struct napi_struct *napi) return PTR_ERR(pool); fqes = kvcalloc_node(fq->count, sizeof(*fqes), GFP_KERNEL, fq->nid); - if (!fqes) + if (!fqes) { + ret = -ENOMEM; goto err_buf; + } + + ret = xdp_reg_page_pool(pool); + if (ret) + goto err_mem; fq->fqes = fqes; fq->pp = pool; return 0; +err_mem: + kvfree(fqes); err_buf: page_pool_destroy(pool); - return -ENOMEM; + return ret; } EXPORT_SYMBOL_GPL(libeth_rx_fq_create); @@ -196,6 +205,7 @@ EXPORT_SYMBOL_GPL(libeth_rx_fq_create); */ void libeth_rx_fq_destroy(struct libeth_fq *fq) { + xdp_unreg_page_pool(fq->pp); kvfree(fq->fqes); page_pool_destroy(fq->pp); } -- 2.48.1
[Intel-wired-lan] [PATCH net-next 06/16] idpf: use a saner limit for default number of queues to allocate
Currently, the maximum number of queues available for one vport is 16. This is hardcoded, but then the function calculating the optimal number of queues takes min(16, num_online_cpus()). In order to be able to allocate more queues, which will then be used for XDP, stop hardcoding 16 and rely on what the device gives us. Instead of num_online_cpus(), which has been considered suboptimal since at least 2013, use netif_get_num_default_rss_queues() to still have free queues in the pool. nr_cpu_ids number of Tx queues are needed only for lockless XDP sending; the regular stack doesn't benefit from that anyhow. On a 128-thread Xeon, this now gives me 32 regular Tx queues and leaves 224 free for XDP (128 of which will handle XDP_TX, .ndo_xdp_xmit(), and XSk xmit when enabled).

Signed-off-by: Alexander Lobakin
---
 drivers/net/ethernet/intel/idpf/idpf_txrx.c     | 8 +-------
 drivers/net/ethernet/intel/idpf/idpf_virtchnl.c | 2 +-
 2 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c
index c15833928ea1..2f221c0abad8 100644
--- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c
+++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c
@@ -1234,13 +1234,7 @@ int idpf_vport_calc_total_qs(struct idpf_adapter *adapter, u16 vport_idx,
 		num_req_tx_qs = vport_config->user_config.num_req_tx_qs;
 		num_req_rx_qs = vport_config->user_config.num_req_rx_qs;
 	} else {
-		int num_cpus;
-
-		/* Restrict num of queues to cpus online as a default
-		 * configuration to give best performance. User can always
-		 * override to a max number of queues via ethtool.
-		 */
-		num_cpus = num_online_cpus();
+		u32 num_cpus = netif_get_num_default_rss_queues();
 
 		dflt_splitq_txq_grps = min_t(int, max_q->max_txq, num_cpus);
 		dflt_singleq_txqs = min_t(int, max_q->max_txq, num_cpus);
diff --git a/drivers/net/ethernet/intel/idpf/idpf_virtchnl.c b/drivers/net/ethernet/intel/idpf/idpf_virtchnl.c
index 3d2413b8684f..135af3cc243f 100644
--- a/drivers/net/ethernet/intel/idpf/idpf_virtchnl.c
+++ b/drivers/net/ethernet/intel/idpf/idpf_virtchnl.c
@@ -937,7 +937,7 @@ int idpf_vport_alloc_max_qs(struct idpf_adapter *adapter,
 	max_tx_q = le16_to_cpu(caps->max_tx_q) / default_vports;
 	if (adapter->num_alloc_vports < default_vports) {
 		max_q->max_rxq = min_t(u16, max_rx_q, IDPF_MAX_Q);
-		max_q->max_txq = min_t(u16, max_tx_q, IDPF_MAX_Q);
+		max_q->max_txq = min_t(u16, max_tx_q, IDPF_LARGE_MAX_Q);
 	} else {
 		max_q->max_rxq = IDPF_MIN_Q;
 		max_q->max_txq = IDPF_MIN_Q;
-- 
2.48.1
[Intel-wired-lan] [PATCH net-next 11/16] idpf: prepare structures to support XDP
From: Michal Kubiak Extend basic structures of the driver (e.g. 'idpf_vport', 'idpf_*_queue', 'idpf_vport_user_config_data') by adding members necessary to support XDP. Add extra XDP Tx queues needed to support XDP_TX and XDP_REDIRECT actions without interfering with regular Tx traffic. Also add functions dedicated to support XDP initialization for Rx and Tx queues and call those functions from the existing algorithms of queues configuration. Signed-off-by: Michal Kubiak Co-developed-by: Alexander Lobakin Signed-off-by: Alexander Lobakin --- drivers/net/ethernet/intel/idpf/Kconfig | 2 +- drivers/net/ethernet/intel/idpf/Makefile | 2 + drivers/net/ethernet/intel/idpf/idpf.h| 20 ++ drivers/net/ethernet/intel/idpf/idpf_txrx.h | 86 ++-- drivers/net/ethernet/intel/idpf/xdp.h | 17 ++ .../net/ethernet/intel/idpf/idpf_ethtool.c| 6 +- drivers/net/ethernet/intel/idpf/idpf_lib.c| 21 +- drivers/net/ethernet/intel/idpf/idpf_main.c | 1 + .../ethernet/intel/idpf/idpf_singleq_txrx.c | 8 +- drivers/net/ethernet/intel/idpf/idpf_txrx.c | 109 +++--- .../net/ethernet/intel/idpf/idpf_virtchnl.c | 26 +-- drivers/net/ethernet/intel/idpf/xdp.c | 189 ++ 12 files changed, 415 insertions(+), 72 deletions(-) create mode 100644 drivers/net/ethernet/intel/idpf/xdp.h create mode 100644 drivers/net/ethernet/intel/idpf/xdp.c diff --git a/drivers/net/ethernet/intel/idpf/Kconfig b/drivers/net/ethernet/intel/idpf/Kconfig index 1addd663acad..7207ee4dbae8 100644 --- a/drivers/net/ethernet/intel/idpf/Kconfig +++ b/drivers/net/ethernet/intel/idpf/Kconfig @@ -5,7 +5,7 @@ config IDPF tristate "Intel(R) Infrastructure Data Path Function Support" depends on PCI_MSI select DIMLIB - select LIBETH + select LIBETH_XDP help This driver supports Intel(R) Infrastructure Data Path Function devices. 
diff --git a/drivers/net/ethernet/intel/idpf/Makefile b/drivers/net/ethernet/intel/idpf/Makefile index 2ce01a0b5898..c58abe6f8f5d 100644 --- a/drivers/net/ethernet/intel/idpf/Makefile +++ b/drivers/net/ethernet/intel/idpf/Makefile @@ -17,3 +17,5 @@ idpf-y := \ idpf_vf_dev.o idpf-$(CONFIG_IDPF_SINGLEQ)+= idpf_singleq_txrx.o + +idpf-y += xdp.o diff --git a/drivers/net/ethernet/intel/idpf/idpf.h b/drivers/net/ethernet/intel/idpf/idpf.h index 50dde09c525b..4847760744ff 100644 --- a/drivers/net/ethernet/intel/idpf/idpf.h +++ b/drivers/net/ethernet/intel/idpf/idpf.h @@ -257,6 +257,10 @@ struct idpf_port_stats { * @txq_model: Split queue or single queue queuing model * @txqs: Used only in hotpath to get to the right queue very fast * @crc_enable: Enable CRC insertion offload + * @xdpq_share: whether XDPSQ sharing is enabled + * @num_xdp_txq: number of XDPSQs + * @xdp_txq_offset: index of the first XDPSQ (== number of regular SQs) + * @xdp_prog: installed XDP program * @num_rxq: Number of allocated RX queues * @num_bufq: Number of allocated buffer queues * @rxq_desc_count: RX queue descriptor count. 
*MUST* have enough descriptors @@ -303,6 +307,11 @@ struct idpf_vport { struct idpf_tx_queue **txqs; bool crc_enable; + bool xdpq_share; + u16 num_xdp_txq; + u16 xdp_txq_offset; + struct bpf_prog *xdp_prog; + u16 num_rxq; u16 num_bufq; u32 rxq_desc_count; @@ -380,6 +389,7 @@ struct idpf_rss_data { * ethtool * @num_req_rxq_desc: Number of user requested RX queue descriptors through * ethtool + * @xdp_prog: requested XDP program to install * @user_flags: User toggled config flags * @mac_filter_list: List of MAC filters * @@ -391,6 +401,7 @@ struct idpf_vport_user_config_data { u16 num_req_rx_qs; u32 num_req_txq_desc; u32 num_req_rxq_desc; + struct bpf_prog *xdp_prog; DECLARE_BITMAP(user_flags, __IDPF_USER_FLAGS_NBITS); struct list_head mac_filter_list; }; @@ -604,6 +615,15 @@ static inline int idpf_is_queue_model_split(u16 q_model) q_model == VIRTCHNL2_QUEUE_MODEL_SPLIT; } +/** + * idpf_xdp_is_prog_ena - check if there is an XDP program on adapter + * @vport: vport to check + */ +static inline bool idpf_xdp_is_prog_ena(const struct idpf_vport *vport) +{ + return vport->adapter && vport->xdp_prog; +} + #define idpf_is_cap_ena(adapter, field, flag) \ idpf_is_capability_ena(adapter, false, field, flag) #define idpf_is_cap_ena_all(adapter, field, flag) \ diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.h b/drivers/net/ethernet/intel/idpf/idpf_txrx.h index fb3b352d542e..6d9eb6f4ab38 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.h +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.h @@ -7,8 +7,10 @@ #include #include -#include +#include #include +#include +#include #include "idpf_lan_txrx.h" #include "virtchnl2_lan_desc.h" @@ -291,6 +293,
[Intel-wired-lan] [PATCH net-next 09/16] idpf: remove SW marker handling from NAPI
From: Michal Kubiak SW marker descriptors on completion queues are used only when a queue is about to be destroyed. It's far from hotpath and handling it in the hotpath NAPI poll makes no sense. Instead, run a simple poller after a virtchnl message for destroying the queue is sent and wait for the replies. If replies for all of the queues are received, this means the synchronization is done correctly and we can go forth with stopping the link. Signed-off-by: Michal Kubiak Signed-off-by: Alexander Lobakin --- drivers/net/ethernet/intel/idpf/idpf.h| 7 +- drivers/net/ethernet/intel/idpf/idpf_txrx.h | 4 +- drivers/net/ethernet/intel/idpf/idpf_lib.c| 2 - drivers/net/ethernet/intel/idpf/idpf_txrx.c | 108 +++--- .../net/ethernet/intel/idpf/idpf_virtchnl.c | 34 ++ 5 files changed, 80 insertions(+), 75 deletions(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf.h b/drivers/net/ethernet/intel/idpf/idpf.h index 66544faab710..6b51a5dcc1e0 100644 --- a/drivers/net/ethernet/intel/idpf/idpf.h +++ b/drivers/net/ethernet/intel/idpf/idpf.h @@ -36,6 +36,7 @@ struct idpf_vport_max_q; #define IDPF_NUM_CHUNKS_PER_MSG(struct_sz, chunk_sz) \ ((IDPF_CTLQ_MAX_BUF_LEN - (struct_sz)) / (chunk_sz)) +#define IDPF_WAIT_FOR_MARKER_TIMEO 500 #define IDPF_MAX_WAIT 500 /* available message levels */ @@ -224,13 +225,10 @@ enum idpf_vport_reset_cause { /** * enum idpf_vport_flags - Vport flags * @IDPF_VPORT_DEL_QUEUES: To send delete queues message - * @IDPF_VPORT_SW_MARKER: Indicate TX pipe drain software marker packets - * processing is done * @IDPF_VPORT_FLAGS_NBITS: Must be last */ enum idpf_vport_flags { IDPF_VPORT_DEL_QUEUES, - IDPF_VPORT_SW_MARKER, IDPF_VPORT_FLAGS_NBITS, }; @@ -289,7 +287,6 @@ struct idpf_port_stats { * @tx_itr_profile: TX profiles for Dynamic Interrupt Moderation * @port_stats: per port csum, header split, and other offload stats * @link_up: True if link is up - * @sw_marker_wq: workqueue for marker packets */ struct idpf_vport { u16 num_txq; @@ -332,8 +329,6 @@ struct 
idpf_vport { struct idpf_port_stats port_stats; bool link_up; - - wait_queue_head_t sw_marker_wq; }; /** diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.h b/drivers/net/ethernet/intel/idpf/idpf_txrx.h index 9f938301b2c5..dd6cc3b5cdab 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.h +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.h @@ -286,7 +286,6 @@ struct idpf_ptype_state { * bit and Q_RFL_GEN is the SW bit. * @__IDPF_Q_FLOW_SCH_EN: Enable flow scheduling * @__IDPF_Q_SW_MARKER: Used to indicate TX queue marker completions - * @__IDPF_Q_POLL_MODE: Enable poll mode * @__IDPF_Q_CRC_EN: enable CRC offload in singleq mode * @__IDPF_Q_HSPLIT_EN: enable header split on Rx (splitq) * @__IDPF_Q_FLAGS_NBITS: Must be last @@ -296,7 +295,6 @@ enum idpf_queue_flags_t { __IDPF_Q_RFL_GEN_CHK, __IDPF_Q_FLOW_SCH_EN, __IDPF_Q_SW_MARKER, - __IDPF_Q_POLL_MODE, __IDPF_Q_CRC_EN, __IDPF_Q_HSPLIT_EN, @@ -1044,6 +1042,8 @@ bool idpf_rx_singleq_buf_hw_alloc_all(struct idpf_rx_queue *rxq, u16 cleaned_count); int idpf_tso(struct sk_buff *skb, struct idpf_tx_offload_params *off); +void idpf_wait_for_sw_marker_completion(struct idpf_tx_queue *txq); + static inline bool idpf_tx_maybe_stop_common(struct idpf_tx_queue *tx_q, u32 needed) { diff --git a/drivers/net/ethernet/intel/idpf/idpf_lib.c b/drivers/net/ethernet/intel/idpf/idpf_lib.c index f3aea7bcdaa3..e17582d15e27 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_lib.c +++ b/drivers/net/ethernet/intel/idpf/idpf_lib.c @@ -1501,8 +1501,6 @@ void idpf_init_task(struct work_struct *work) index = vport->idx; vport_config = adapter->vport_config[index]; - init_waitqueue_head(&vport->sw_marker_wq); - spin_lock_init(&vport_config->mac_filter_list_lock); INIT_LIST_HEAD(&vport_config->user_config.mac_filter_list); diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c index a240ed115e3e..4e3de6031422 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c +++ 
b/drivers/net/ethernet/intel/idpf/idpf_txrx.c @@ -1626,32 +1626,6 @@ int idpf_vport_queues_alloc(struct idpf_vport *vport) return err; } -/** - * idpf_tx_handle_sw_marker - Handle queue marker packet - * @tx_q: tx queue to handle software marker - */ -static void idpf_tx_handle_sw_marker(struct idpf_tx_queue *tx_q) -{ - struct idpf_netdev_priv *priv = netdev_priv(tx_q->netdev); - struct idpf_vport *vport = priv->vport; - int i; - - idpf_queue_clear(SW_MARKER, tx_q); - /* Hardware must write marker packets to all queues associated with -* completion queues. So check
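The patch above replaces NAPI-based marker handling with a simple poller that waits, with a timeout, for the marker completions after the destroy-queues virtchnl message is sent. A minimal userspace model of that poll-until-done-or-timeout pattern (all names here are illustrative, not the driver's):

```c
#include <assert.h>
#include <stdbool.h>

/* Toy model of idpf_wait_for_sw_marker_completion(): the marker flag is
 * cleared once the "hardware" has produced the completion; the poller
 * returns true if it observed that before the timeout budget ran out. */
struct toy_txq {
    bool sw_marker;     /* set while a marker completion is outstanding */
    int polls_needed;   /* polls until the completion "arrives" */
};

static bool toy_poll_marker(struct toy_txq *q, int timeout_polls)
{
    for (int i = 0; i < timeout_polls; i++) {
        if (q->polls_needed-- <= 0)
            q->sw_marker = false;   /* completion consumed */
        if (!q->sw_marker)
            return true;
        /* the real code sleeps between polls (usleep_range() or similar) */
    }
    return false;   /* timed out, marker never completed */
}
```

Since this runs only on queue teardown, the cost of busy-waiting here is irrelevant, which is exactly the argument for moving it out of the NAPI hot path.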
[Intel-wired-lan] [PATCH net-next 07/16] idpf: link NAPIs to queues
Add the missing linking of NAPIs to netdev queues when enabling interrupt vectors in order to support NAPI configuration and interfaces requiring get_rx_queue()->napi to be set (like XSk busy polling). Signed-off-by: Alexander Lobakin --- drivers/net/ethernet/intel/idpf/idpf_txrx.c | 30 + 1 file changed, 30 insertions(+) diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c index 2f221c0abad8..a3f6e8cff7a0 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c @@ -3560,8 +3560,11 @@ void idpf_vport_intr_rel(struct idpf_vport *vport) static void idpf_vport_intr_rel_irq(struct idpf_vport *vport) { struct idpf_adapter *adapter = vport->adapter; + bool unlock; int vector; + unlock = rtnl_trylock(); + for (vector = 0; vector < vport->num_q_vectors; vector++) { struct idpf_q_vector *q_vector = &vport->q_vectors[vector]; int irq_num, vidx; @@ -3573,8 +3576,23 @@ static void idpf_vport_intr_rel_irq(struct idpf_vport *vport) vidx = vport->q_vector_idxs[vector]; irq_num = adapter->msix_entries[vidx].vector; + for (u32 i = 0; i < q_vector->num_rxq; i++) + netif_queue_set_napi(vport->netdev, +q_vector->rx[i]->idx, +NETDEV_QUEUE_TYPE_RX, +NULL); + + for (u32 i = 0; i < q_vector->num_txq; i++) + netif_queue_set_napi(vport->netdev, +q_vector->tx[i]->idx, +NETDEV_QUEUE_TYPE_TX, +NULL); + kfree(free_irq(irq_num, q_vector)); } + + if (unlock) + rtnl_unlock(); } /** @@ -3760,6 +3778,18 @@ static int idpf_vport_intr_req_irq(struct idpf_vport *vport) "Request_irq failed, error: %d\n", err); goto free_q_irqs; } + + for (u32 i = 0; i < q_vector->num_rxq; i++) + netif_queue_set_napi(vport->netdev, +q_vector->rx[i]->idx, +NETDEV_QUEUE_TYPE_RX, +&q_vector->napi); + + for (u32 i = 0; i < q_vector->num_txq; i++) + netif_queue_set_napi(vport->netdev, +q_vector->tx[i]->idx, +NETDEV_QUEUE_TYPE_TX, +&q_vector->napi); } return 0; -- 2.48.1
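The symmetric link/unlink pattern in this patch — point every queue of a vector at its NAPI on IRQ request, and reset the links to NULL on IRQ release — can be sketched as a toy mapping table (names and sizes are illustrative, not the kernel API):

```c
#include <assert.h>
#include <stddef.h>

#define TOY_NUM_QUEUES 4

struct toy_napi { int id; };

/* Toy per-netdev table standing in for the kernel's queue->napi links */
static struct toy_napi *rxq_napi[TOY_NUM_QUEUES];

/* Models the netif_queue_set_napi(..., &q_vector->napi) loop on request */
static void toy_link_queues(struct toy_napi *napi, int first, int count)
{
    for (int i = 0; i < count; i++)
        rxq_napi[first + i] = napi;
}

/* Models the netif_queue_set_napi(..., NULL) loop on release */
static void toy_unlink_queues(int first, int count)
{
    for (int i = 0; i < count; i++)
        rxq_napi[first + i] = NULL;
}
```

Keeping the two loops mirror images of each other is what guarantees no stale queue→NAPI link survives a vector teardown (the `rtnl_trylock()` in the real patch exists because `netif_queue_set_napi()` expects RTNL to be held).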
[Intel-wired-lan] [PATCH net-next 08/16] idpf: make complq cleaning dependent on scheduling mode
From: Michal Kubiak Extend completion queue cleaning function to support queue-based scheduling mode needed for XDP queues. Add 4-byte descriptor for queue-based scheduling mode and perform some refactoring to extract the common code for both scheduling modes. Signed-off-by: Michal Kubiak Signed-off-by: Alexander Lobakin --- .../net/ethernet/intel/idpf/idpf_lan_txrx.h | 6 +- drivers/net/ethernet/intel/idpf/idpf_txrx.h | 11 +- drivers/net/ethernet/intel/idpf/idpf_txrx.c | 256 +++--- 3 files changed, 177 insertions(+), 96 deletions(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf_lan_txrx.h b/drivers/net/ethernet/intel/idpf/idpf_lan_txrx.h index 8c7f8ef8f1a1..7f12c7f2e70e 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_lan_txrx.h +++ b/drivers/net/ethernet/intel/idpf/idpf_lan_txrx.h @@ -186,13 +186,17 @@ struct idpf_base_tx_desc { __le64 qw1; /* type_cmd_offset_bsz_l2tag1 */ }; /* read used with buffer queues */ -struct idpf_splitq_tx_compl_desc { +struct idpf_splitq_4b_tx_compl_desc { /* qid=[10:0] comptype=[13:11] rsvd=[14] gen=[15] */ __le16 qid_comptype_gen; union { __le16 q_head; /* Queue head */ __le16 compl_tag; /* Completion tag */ } q_head_compl_tag; +}; /* writeback used with completion queues */ + +struct idpf_splitq_tx_compl_desc { + struct idpf_splitq_4b_tx_compl_desc common; u8 ts[3]; u8 rsvd; /* Reserved */ }; /* writeback used with completion queues */ diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.h b/drivers/net/ethernet/intel/idpf/idpf_txrx.h index b029f566e57c..9f938301b2c5 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.h +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.h @@ -743,7 +743,9 @@ libeth_cacheline_set_assert(struct idpf_buf_queue, 64, 24, 32); /** * struct idpf_compl_queue - software structure representing a completion queue - * @comp: completion descriptor array + * @comp: 8-byte completion descriptor array + * @comp_4b: 4-byte completion descriptor array + * @desc_ring: virtual descriptor ring address * @txq_grp: 
See struct idpf_txq_group * @flags: See enum idpf_queue_flags_t * @desc_count: Number of descriptors @@ -763,7 +765,12 @@ libeth_cacheline_set_assert(struct idpf_buf_queue, 64, 24, 32); */ struct idpf_compl_queue { __cacheline_group_begin_aligned(read_mostly); - struct idpf_splitq_tx_compl_desc *comp; + union { + struct idpf_splitq_tx_compl_desc *comp; + struct idpf_splitq_4b_tx_compl_desc *comp_4b; + + void *desc_ring; + }; struct idpf_txq_group *txq_grp; DECLARE_BITMAP(flags, __IDPF_Q_FLAGS_NBITS); diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c index a3f6e8cff7a0..a240ed115e3e 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c @@ -156,8 +156,8 @@ static void idpf_compl_desc_rel(struct idpf_compl_queue *complq) return; dma_free_coherent(complq->netdev->dev.parent, complq->size, - complq->comp, complq->dma); - complq->comp = NULL; + complq->desc_ring, complq->dma); + complq->desc_ring = NULL; complq->next_to_use = 0; complq->next_to_clean = 0; } @@ -284,12 +284,16 @@ static int idpf_tx_desc_alloc(const struct idpf_vport *vport, static int idpf_compl_desc_alloc(const struct idpf_vport *vport, struct idpf_compl_queue *complq) { - complq->size = array_size(complq->desc_count, sizeof(*complq->comp)); + u32 desc_size; - complq->comp = dma_alloc_coherent(complq->netdev->dev.parent, - complq->size, &complq->dma, - GFP_KERNEL); - if (!complq->comp) + desc_size = idpf_queue_has(FLOW_SCH_EN, complq) ? 
+ sizeof(*complq->comp) : sizeof(*complq->comp_4b); + complq->size = array_size(complq->desc_count, desc_size); + + complq->desc_ring = dma_alloc_coherent(complq->netdev->dev.parent, + complq->size, &complq->dma, + GFP_KERNEL); + if (!complq->desc_ring) return -ENOMEM; complq->next_to_use = 0; @@ -1921,8 +1925,46 @@ static bool idpf_tx_clean_buf_ring(struct idpf_tx_queue *txq, u16 compl_tag, } /** - * idpf_tx_handle_rs_completion - clean a single packet and all of its buffers - * whether on the buffer ring or in the hash table + * idpf_parse_compl_desc - Parse the completion descriptor + * @desc: completion descriptor to be parsed + * @complq: completion queue containing the descriptor + * @txq: returns corresponding Tx queue for a given descriptor + * @gen_flag: current generation flag in the completion queue + * + *
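The descriptor-size selection above — 8-byte writeback descriptors for flow scheduling, 4-byte ones for queue-based scheduling, with the larger layout embedding the smaller — can be modeled with plain structs (field names are illustrative, mirroring the patch, not the real headers):

```c
#include <assert.h>
#include <stdint.h>

/* Toy layout mirroring the patch: the 8-byte writeback descriptor embeds
 * the 4-byte one and appends a 3-byte timestamp plus a reserved byte. */
struct toy_4b_compl_desc {
    uint16_t qid_comptype_gen;  /* qid=[10:0] comptype=[13:11] gen=[15] */
    uint16_t q_head_compl_tag;  /* queue head or completion tag */
};

struct toy_8b_compl_desc {
    struct toy_4b_compl_desc common;
    uint8_t ts[3];              /* timestamp */
    uint8_t rsvd;
};

/* Pick the per-descriptor size for the ring allocation, as the patched
 * idpf_compl_desc_alloc() does based on FLOW_SCH_EN */
static uint32_t toy_compl_desc_size(int flow_sched_en)
{
    return flow_sched_en ? sizeof(struct toy_8b_compl_desc)
                         : sizeof(struct toy_4b_compl_desc);
}
```

Because both pointers alias the same `desc_ring` allocation via a union, freeing through `desc_ring` works regardless of which scheduling mode sized the ring.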
[Intel-wired-lan] [PATCH net-next 14/16] idpf: add support for XDP on Rx
Use libeth XDP infra to support running XDP program on Rx polling. This includes all of the possible verdicts/actions. XDP Tx queues are cleaned only in "lazy" mode when there are less than 1/4 free descriptors left on the ring. libeth helper macros to define driver-specific XDP functions make sure the compiler could uninline them when needed. Use __LIBETH_WORD_ACCESS to parse descriptors more efficiently when applicable. It really gives some good boosts and code size reduction on x86_64. Co-developed-by: Michal Kubiak Signed-off-by: Michal Kubiak Signed-off-by: Alexander Lobakin --- drivers/net/ethernet/intel/idpf/idpf_txrx.h | 4 +- drivers/net/ethernet/intel/idpf/xdp.h | 100 - drivers/net/ethernet/intel/idpf/idpf_lib.c | 2 + drivers/net/ethernet/intel/idpf/idpf_txrx.c | 23 +-- drivers/net/ethernet/intel/idpf/xdp.c | 155 +++- 5 files changed, 264 insertions(+), 20 deletions(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.h b/drivers/net/ethernet/intel/idpf/idpf_txrx.h index e36c55baf23f..5d62074c94b1 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.h +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.h @@ -684,8 +684,8 @@ struct idpf_tx_queue { __cacheline_group_end_aligned(read_mostly); __cacheline_group_begin_aligned(read_write); - u16 next_to_use; - u16 next_to_clean; + u32 next_to_use; + u32 next_to_clean; union { struct { diff --git a/drivers/net/ethernet/intel/idpf/xdp.h b/drivers/net/ethernet/intel/idpf/xdp.h index a72a7638a6ea..fde85528a315 100644 --- a/drivers/net/ethernet/intel/idpf/xdp.h +++ b/drivers/net/ethernet/intel/idpf/xdp.h @@ -4,12 +4,9 @@ #ifndef _IDPF_XDP_H_ #define _IDPF_XDP_H_ -#include +#include -struct bpf_prog; -struct idpf_vport; -struct net_device; -struct netdev_bpf; +#include "idpf_txrx.h" int idpf_xdp_rxq_info_init_all(const struct idpf_vport *vport); void idpf_xdp_rxq_info_deinit_all(const struct idpf_vport *vport); @@ -19,6 +16,99 @@ void idpf_copy_xdp_prog_to_qs(const struct idpf_vport *vport, int 
idpf_vport_xdpq_get(const struct idpf_vport *vport); void idpf_vport_xdpq_put(const struct idpf_vport *vport); +bool idpf_xdp_tx_flush_bulk(struct libeth_xdp_tx_bulk *bq, u32 flags); + +/** + * idpf_xdp_tx_xmit - produce a single HW Tx descriptor out of XDP desc + * @desc: XDP descriptor to pull the DMA address and length from + * @i: descriptor index on the queue to fill + * @sq: XDP queue to produce the HW Tx descriptor on + * @priv: &xsk_tx_metadata_ops on XSk xmit or %NULL + */ +static inline void idpf_xdp_tx_xmit(struct libeth_xdp_tx_desc desc, u32 i, + const struct libeth_xdpsq *sq, u64 priv) +{ + struct idpf_flex_tx_desc *tx_desc = sq->descs; + u32 cmd; + + cmd = FIELD_PREP(IDPF_FLEX_TXD_QW1_DTYPE_M, +IDPF_TX_DESC_DTYPE_FLEX_L2TAG1_L2TAG2); + if (desc.flags & LIBETH_XDP_TX_LAST) + cmd |= FIELD_PREP(IDPF_FLEX_TXD_QW1_CMD_M, + IDPF_TX_DESC_CMD_EOP); + if (priv && (desc.flags & LIBETH_XDP_TX_CSUM)) + cmd |= FIELD_PREP(IDPF_FLEX_TXD_QW1_CMD_M, + IDPF_TX_FLEX_DESC_CMD_CS_EN); + + tx_desc = &tx_desc[i]; + tx_desc->buf_addr = cpu_to_le64(desc.addr); +#ifdef __LIBETH_WORD_ACCESS + *(u64 *)&tx_desc->qw1 = ((u64)desc.len << 48) | cmd; +#else + tx_desc->qw1.buf_size = cpu_to_le16(desc.len); + tx_desc->qw1.cmd_dtype = cpu_to_le16(cmd); +#endif +} + +/** + * idpf_set_rs_bit - set RS bit on last produced descriptor + * @xdpq: XDP queue to produce the HW Tx descriptors on + */ +static inline void idpf_set_rs_bit(const struct idpf_tx_queue *xdpq) +{ + u32 ntu, cmd; + + ntu = xdpq->next_to_use; + if (unlikely(!ntu)) + ntu = xdpq->desc_count; + + cmd = FIELD_PREP(IDPF_FLEX_TXD_QW1_CMD_M, IDPF_TX_DESC_CMD_RS); +#ifdef __LIBETH_WORD_ACCESS + *(u64 *)&xdpq->flex_tx[ntu - 1].q.qw1 |= cmd; +#else + xdpq->flex_tx[ntu - 1].q.qw1.cmd_dtype |= cpu_to_le16(cmd); +#endif +} + +/** + * idpf_xdpq_update_tail - update the XDP Tx queue tail register + * @xdpq: XDP Tx queue + */ +static inline void idpf_xdpq_update_tail(const struct idpf_tx_queue *xdpq) +{ + dma_wmb(); + 
writel_relaxed(xdpq->next_to_use, xdpq->tail); +} + +/** + * idpf_xdp_tx_finalize - Update RS bit and bump XDP Tx tail + * @_xdpq: XDP Tx queue + * @sent: whether any frames were sent + * @flush: whether to update RS bit and the tail register + * + * This function bumps XDP Tx tail and should be called when a batch of packets + * has been processed in the napi loop. + */ +static inline void idpf_xdp_tx_finalize(void *_xdpq, bool sent, bool flush) +{ + struct idpf_tx_queue *xdpq = _xdpq; + + if ((!flush || unlikely(!sent)) && + likely(xdpq->desc_count != xdpq->pending)) + return; + +
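The `__LIBETH_WORD_ACCESS` branch above builds the whole 8-byte `qw1` with a single 64-bit store, placing the buffer length in the top 16 bits and the command/dtype in the low bits, instead of two separate little-endian field writes. A small model of that packing (bit positions follow the patch; this is a sketch, not the driver's descriptor definition):

```c
#include <assert.h>
#include <stdint.h>

/* Pack a toy qw1 the "word access" way: len in bits 63:48, cmd in 15:0,
 * as in *(u64 *)&tx_desc->qw1 = ((u64)desc.len << 48) | cmd; */
static uint64_t toy_pack_qw1(uint16_t len, uint16_t cmd)
{
    return ((uint64_t)len << 48) | cmd;
}

/* Recover the individual fields, as hardware parsing the descriptor would */
static uint16_t toy_qw1_len(uint64_t qw1)
{
    return (uint16_t)(qw1 >> 48);
}

static uint16_t toy_qw1_cmd(uint64_t qw1)
{
    return (uint16_t)(qw1 & 0xffff);
}
```

On a little-endian machine the single store and the two `cpu_to_le16()` field writes produce the same bytes in memory, which is why the word-access path is a pure codegen win there.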
[Intel-wired-lan] [PATCH net-next 13/16] idpf: use generic functions to build xdp_buff and skb
In preparation of XDP support, move from having skb as the main frame container during the Rx polling to &xdp_buff. This allows to use generic and libeth helpers for building an XDP buffer and changes the logics: now we try to allocate an skb only when we processed all the descriptors related to the frame. Store &libeth_xdp_stash instead of the skb pointer on the Rx queue. It's only 8 bytes wider, but contains everything we may need. Signed-off-by: Alexander Lobakin --- drivers/net/ethernet/intel/idpf/idpf_txrx.h | 17 +- .../ethernet/intel/idpf/idpf_singleq_txrx.c | 103 ++--- drivers/net/ethernet/intel/idpf/idpf_txrx.c | 145 +- 3 files changed, 90 insertions(+), 175 deletions(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.h b/drivers/net/ethernet/intel/idpf/idpf_txrx.h index 38ef0db08133..e36c55baf23f 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.h +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.h @@ -502,7 +502,7 @@ struct idpf_txq_stash { * @next_to_use: Next descriptor to use * @next_to_clean: Next descriptor to clean * @next_to_alloc: RX buffer to allocate at - * @skb: Pointer to the skb + * @xdp: XDP buffer with the current frame * @stats_sync: See struct u64_stats_sync * @q_stats: See union idpf_rx_queue_stats * @q_id: Queue id @@ -553,11 +553,11 @@ struct idpf_rx_queue { __cacheline_group_end_aligned(read_mostly); __cacheline_group_begin_aligned(read_write); - u16 next_to_use; - u16 next_to_clean; - u16 next_to_alloc; + u32 next_to_use; + u32 next_to_clean; + u32 next_to_alloc; - struct sk_buff *skb; + struct libeth_xdp_buff_stash xdp; struct u64_stats_sync stats_sync; struct idpf_rx_queue_stats q_stats; @@ -579,8 +579,8 @@ struct idpf_rx_queue { libeth_cacheline_set_assert(struct idpf_rx_queue, ALIGN(64, __alignof(struct xdp_rxq_info)) + sizeof(struct xdp_rxq_info), - 72 + offsetof(struct idpf_rx_queue, q_stats) - - offsetofend(struct idpf_rx_queue, skb), + 88 + offsetof(struct idpf_rx_queue, q_stats) - + offsetofend(struct 
idpf_rx_queue, xdp), 32); /** @@ -1071,9 +1071,6 @@ int idpf_config_rss(struct idpf_vport *vport); int idpf_init_rss(struct idpf_vport *vport); void idpf_deinit_rss(struct idpf_vport *vport); int idpf_rx_bufs_init_all(struct idpf_vport *vport); -void idpf_rx_add_frag(struct idpf_rx_buf *rx_buf, struct sk_buff *skb, - unsigned int size); -struct sk_buff *idpf_rx_build_skb(const struct libeth_fqe *buf, u32 size); void idpf_tx_buf_hw_update(struct idpf_tx_queue *tx_q, u32 val, bool xmit_more); unsigned int idpf_size_to_txd_count(unsigned int size); diff --git a/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c index c81065b4fb24..544fe113265b 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c @@ -1,8 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2023 Intel Corporation */ -#include -#include +#include #include "idpf.h" @@ -780,7 +779,7 @@ static void idpf_rx_singleq_flex_hash(struct idpf_rx_queue *rx_q, } /** - * idpf_rx_singleq_process_skb_fields - Populate skb header fields from Rx + * __idpf_rx_singleq_process_skb_fields - Populate skb header fields from Rx * descriptor * @rx_q: Rx ring being processed * @skb: pointer to current skb being populated @@ -792,17 +791,14 @@ static void idpf_rx_singleq_flex_hash(struct idpf_rx_queue *rx_q, * other fields within the skb. 
*/ static void -idpf_rx_singleq_process_skb_fields(struct idpf_rx_queue *rx_q, - struct sk_buff *skb, - const union virtchnl2_rx_desc *rx_desc, - u16 ptype) +__idpf_rx_singleq_process_skb_fields(struct idpf_rx_queue *rx_q, +struct sk_buff *skb, +const union virtchnl2_rx_desc *rx_desc, +u16 ptype) { struct libeth_rx_pt decoded = rx_q->rx_ptype_lkup[ptype]; struct libeth_rx_csum csum_bits; - /* modifies the skb - consumes the enet header */ - skb->protocol = eth_type_trans(skb, rx_q->xdp_rxq.dev); - /* Check if we're using base mode descriptor IDs */ if (rx_q->rxdids == VIRTCHNL2_RXDID_1_32B_BASE_M) { idpf_rx_singleq_base_hash(rx_q, skb, rx_desc, decoded); @@ -813,7 +809,6 @@ idpf_rx_singleq_process_skb_fields(struct idpf_rx_queue *rx_q, } idpf_rx_singleq_csum(rx_q, skb, csum_bits, decoded); - skb_record_rx_queue(skb, rx_q->idx); } /** @@ -952,6 +947,32 @@ idp
[Intel-wired-lan] [PATCH net-next 15/16] idpf: add support for .ndo_xdp_xmit()
Use libeth XDP infra to implement .ndo_xdp_xmit() in idpf. The Tx callbacks are reused from XDP_TX code. XDP redirect target feature is set/cleared depending on the XDP prog presence, as for now we still don't allocate XDP Tx queues when there's no program. Signed-off-by: Alexander Lobakin --- drivers/net/ethernet/intel/idpf/xdp.h | 2 ++ drivers/net/ethernet/intel/idpf/idpf_lib.c | 1 + drivers/net/ethernet/intel/idpf/xdp.c | 29 ++ 3 files changed, 32 insertions(+) diff --git a/drivers/net/ethernet/intel/idpf/xdp.h b/drivers/net/ethernet/intel/idpf/xdp.h index fde85528a315..a2ac1b2f334f 100644 --- a/drivers/net/ethernet/intel/idpf/xdp.h +++ b/drivers/net/ethernet/intel/idpf/xdp.h @@ -110,5 +110,7 @@ static inline void idpf_xdp_tx_finalize(void *_xdpq, bool sent, bool flush) void idpf_xdp_set_features(const struct idpf_vport *vport); int idpf_xdp(struct net_device *dev, struct netdev_bpf *xdp); +int idpf_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, + u32 flags); #endif /* _IDPF_XDP_H_ */ diff --git a/drivers/net/ethernet/intel/idpf/idpf_lib.c b/drivers/net/ethernet/intel/idpf/idpf_lib.c index 2d1efcb854be..39b9885293a9 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_lib.c +++ b/drivers/net/ethernet/intel/idpf/idpf_lib.c @@ -2371,4 +2371,5 @@ static const struct net_device_ops idpf_netdev_ops = { .ndo_set_features = idpf_set_features, .ndo_tx_timeout = idpf_tx_timeout, .ndo_bpf = idpf_xdp, + .ndo_xdp_xmit = idpf_xdp_xmit, }; diff --git a/drivers/net/ethernet/intel/idpf/xdp.c b/drivers/net/ethernet/intel/idpf/xdp.c index abf75e840c0a..1834f217a07f 100644 --- a/drivers/net/ethernet/intel/idpf/xdp.c +++ b/drivers/net/ethernet/intel/idpf/xdp.c @@ -357,8 +357,35 @@ LIBETH_XDP_DEFINE_START(); LIBETH_XDP_DEFINE_TIMER(static idpf_xdp_tx_timer, idpf_clean_xdp_irq); LIBETH_XDP_DEFINE_FLUSH_TX(idpf_xdp_tx_flush_bulk, idpf_xdp_tx_prep, idpf_xdp_tx_xmit); +LIBETH_XDP_DEFINE_FLUSH_XMIT(static idpf_xdp_xmit_flush_bulk, idpf_xdp_tx_prep, +idpf_xdp_tx_xmit); 
LIBETH_XDP_DEFINE_END(); +/** + * idpf_xdp_xmit - send frames queued by ``XDP_REDIRECT`` to this interface + * @dev: network device + * @n: number of frames to transmit + * @frames: frames to transmit + * @flags: transmit flags (``XDP_XMIT_FLUSH`` or zero) + * + * Return: number of frames successfully sent or -errno on error. + */ +int idpf_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, + u32 flags) +{ + const struct idpf_netdev_priv *np = netdev_priv(dev); + const struct idpf_vport *vport = np->vport; + + if (unlikely(!netif_carrier_ok(dev) || !vport->link_up)) + return -ENETDOWN; + + return libeth_xdp_xmit_do_bulk(dev, n, frames, flags, + &vport->txqs[vport->xdp_txq_offset], + vport->num_xdp_txq, + idpf_xdp_xmit_flush_bulk, + idpf_xdp_tx_finalize); +} + void idpf_xdp_set_features(const struct idpf_vport *vport) { if (!idpf_is_queue_model_split(vport->rxq_model)) @@ -417,6 +444,8 @@ idpf_xdp_setup_prog(struct idpf_vport *vport, const struct netdev_bpf *xdp) cfg->user_config.xdp_prog = old; } + libeth_xdp_set_redirect(vport->netdev, vport->xdp_prog); + return ret; } -- 2.48.1
[Intel-wired-lan] [PATCH net-next 05/16] idpf: fix Rx descriptor ready check barrier in splitq
No idea what the current barrier position was meant for. At that point, nothing is read from the descriptor, only the pointer to the actual one is fetched. The correct barrier usage here is after the generation check, so that only the first qword is read if the descriptor is not yet ready and we need to stop polling. Debatable on coherent DMA as the Rx descriptor size is <= cacheline size, but anyway, the current barrier position only makes the codegen worse. Fixes: 3a8845af66ed ("idpf: add RX splitq napi poll support") Signed-off-by: Alexander Lobakin --- drivers/net/ethernet/intel/idpf/idpf_txrx.c | 8 ++-- 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c index 6254806c2072..c15833928ea1 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c @@ -3232,18 +3232,14 @@ static int idpf_rx_splitq_clean(struct idpf_rx_queue *rxq, int budget) /* get the Rx desc from Rx queue based on 'next_to_clean' */ rx_desc = &rxq->rx[ntc].flex_adv_nic_3_wb; - /* This memory barrier is needed to keep us from reading -* any other fields out of the rx_desc -*/ - dma_rmb(); - /* if the descriptor isn't done, no work yet to do */ gen_id = le16_get_bits(rx_desc->pktlen_gen_bufq_id, VIRTCHNL2_RX_FLEX_DESC_ADV_GEN_M); - if (idpf_queue_has(GEN_CHK, rxq) != gen_id) break; + dma_rmb(); + rxdid = FIELD_GET(VIRTCHNL2_RX_FLEX_DESC_ADV_RXDID_M, rx_desc->rxdid_ucast); if (rxdid != VIRTCHNL2_RXDID_2_FLEX_SPLITQ) { -- 2.48.1
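The ordering argument in this patch — read only the generation bit first, and issue `dma_rmb()` after the generation check passes, before reading the rest of the descriptor — rests on the generation-bit handshake between hardware and software. A toy model of that handshake (names are illustrative):

```c
#include <assert.h>
#include <stdbool.h>

/* Toy splitq generation check: hardware writes the current generation
 * into each completed descriptor; software flips its GEN_CHK flag every
 * time next_to_clean wraps.  A descriptor is ready only when the two
 * match — and only then may the remaining fields be read (after a
 * dma_rmb() in the real driver). */
struct toy_rxq {
    unsigned int ntc;       /* next_to_clean */
    unsigned int count;     /* ring size */
    bool gen_chk;           /* software's expected generation */
};

static bool toy_desc_ready(const struct toy_rxq *q, bool desc_gen)
{
    return desc_gen == q->gen_chk;
}

static void toy_bump_ntc(struct toy_rxq *q)
{
    if (++q->ntc == q->count) {
        q->ntc = 0;
        q->gen_chk = !q->gen_chk;   /* flip expected generation on wrap */
    }
}
```

A barrier placed before this check, as in the pre-patch code, orders nothing useful: no other descriptor field has been read yet, so it only pessimizes the generated code.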
[Intel-wired-lan] [PATCH net-next 12/16] idpf: implement XDP_SETUP_PROG in ndo_bpf for splitq
From: Michal Kubiak Implement loading/removing XDP program using .ndo_bpf callback in the split queue mode. Reconfigure and restart the queues if needed (!!old_prog != !!new_prog), otherwise, just update the pointers. Signed-off-by: Michal Kubiak Signed-off-by: Alexander Lobakin --- drivers/net/ethernet/intel/idpf/idpf_txrx.h | 4 +- drivers/net/ethernet/intel/idpf/xdp.h | 7 ++ drivers/net/ethernet/intel/idpf/idpf_lib.c | 1 + drivers/net/ethernet/intel/idpf/idpf_txrx.c | 4 + drivers/net/ethernet/intel/idpf/xdp.c | 114 5 files changed, 129 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.h b/drivers/net/ethernet/intel/idpf/idpf_txrx.h index 6d9eb6f4ab38..38ef0db08133 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.h +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.h @@ -485,6 +485,7 @@ struct idpf_txq_stash { * @desc_ring: virtual descriptor ring address * @bufq_sets: Pointer to the array of buffer queues in splitq mode * @napi: NAPI instance corresponding to this queue (splitq) + * @xdp_prog: attached XDP program * @rx_buf: See struct &libeth_fqe * @pp: Page pool pointer in singleq mode * @tail: Tail offset. Used for both queue models single and split. 
@@ -525,13 +526,14 @@ struct idpf_rx_queue { struct { struct idpf_bufq_set *bufq_sets; struct napi_struct *napi; + struct bpf_prog __rcu *xdp_prog; }; struct { struct libeth_fqe *rx_buf; struct page_pool *pp; + void __iomem *tail; }; }; - void __iomem *tail; DECLARE_BITMAP(flags, __IDPF_Q_FLAGS_NBITS); u16 idx; diff --git a/drivers/net/ethernet/intel/idpf/xdp.h b/drivers/net/ethernet/intel/idpf/xdp.h index 8ace8384f348..a72a7638a6ea 100644 --- a/drivers/net/ethernet/intel/idpf/xdp.h +++ b/drivers/net/ethernet/intel/idpf/xdp.h @@ -6,12 +6,19 @@ #include +struct bpf_prog; struct idpf_vport; +struct net_device; +struct netdev_bpf; int idpf_xdp_rxq_info_init_all(const struct idpf_vport *vport); void idpf_xdp_rxq_info_deinit_all(const struct idpf_vport *vport); +void idpf_copy_xdp_prog_to_qs(const struct idpf_vport *vport, + struct bpf_prog *xdp_prog); int idpf_vport_xdpq_get(const struct idpf_vport *vport); void idpf_vport_xdpq_put(const struct idpf_vport *vport); +int idpf_xdp(struct net_device *dev, struct netdev_bpf *xdp); + #endif /* _IDPF_XDP_H_ */ diff --git a/drivers/net/ethernet/intel/idpf/idpf_lib.c b/drivers/net/ethernet/intel/idpf/idpf_lib.c index 0f4edc9cd1ad..84ca8c08bd56 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_lib.c +++ b/drivers/net/ethernet/intel/idpf/idpf_lib.c @@ -2368,4 +2368,5 @@ static const struct net_device_ops idpf_netdev_ops = { .ndo_get_stats64 = idpf_get_stats64, .ndo_set_features = idpf_set_features, .ndo_tx_timeout = idpf_tx_timeout, + .ndo_bpf = idpf_xdp, }; diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c index 97513822d614..e152fbe4ebe3 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c @@ -1102,6 +1102,8 @@ static void idpf_vport_queue_grp_rel_all(struct idpf_vport *vport) */ void idpf_vport_queues_rel(struct idpf_vport *vport) { + idpf_copy_xdp_prog_to_qs(vport, NULL); + idpf_tx_desc_rel_all(vport); 
idpf_rx_desc_rel_all(vport); @@ -1664,6 +1666,8 @@ int idpf_vport_queues_alloc(struct idpf_vport *vport) if (err) goto err_out; + idpf_copy_xdp_prog_to_qs(vport, vport->xdp_prog); + return 0; err_out: diff --git a/drivers/net/ethernet/intel/idpf/xdp.c b/drivers/net/ethernet/intel/idpf/xdp.c index 8770249b5abe..c0322fa7bfee 100644 --- a/drivers/net/ethernet/intel/idpf/xdp.c +++ b/drivers/net/ethernet/intel/idpf/xdp.c @@ -4,6 +4,7 @@ #include #include "idpf.h" +#include "idpf_virtchnl.h" #include "xdp.h" static int idpf_rxq_for_each(const struct idpf_vport *vport, @@ -115,6 +116,33 @@ void idpf_xdp_rxq_info_deinit_all(const struct idpf_vport *vport) (void *)(size_t)vport->rxq_model); } +static int idpf_xdp_rxq_assign_prog(struct idpf_rx_queue *rxq, void *arg) +{ + struct mutex *lock = &rxq->q_vector->vport->adapter->vport_ctrl_lock; + struct bpf_prog *prog = arg; + struct bpf_prog *old; + + if (prog) + bpf_prog_inc(prog); + + old = rcu_replace_pointer(rxq->xdp_prog, prog, lockdep_is_held(lock)); + if (old) + bpf_prog_put(old); + + return 0; +} + +/** + * idpf_copy_xdp_prog_to_qs - set pointers to XDP program for each Rx queue + * @vport: vport to setup XDP for + * @xdp_prog: XDP program that should be copied to all Rx queues
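The per-queue program assignment in `idpf_xdp_rxq_assign_prog()` follows the standard refcounted-swap idiom: take a reference on the incoming program, publish it, then drop the reference held through the old pointer. A userspace model of the reference accounting (the RCU publication itself is elided; names are illustrative):

```c
#include <assert.h>
#include <stddef.h>

/* Toy refcounted object modeling bpf_prog_inc()/bpf_prog_put(): each
 * queue slot that holds a pointer owns one reference. */
struct toy_prog { int refcnt; };

static void toy_assign_prog(struct toy_prog **slot, struct toy_prog *prog)
{
    struct toy_prog *old = *slot;

    if (prog)
        prog->refcnt++;     /* bpf_prog_inc() */
    *slot = prog;           /* rcu_replace_pointer() in the real code */
    if (old)
        old->refcnt--;      /* bpf_prog_put() */
}
```

Incrementing before publishing and decrementing only the *old* program afterwards is what keeps both programs alive across the swap, including the `prog == old` case.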
[Intel-wired-lan] [PATCH iwl-net] igc: Fix XSK queue NAPI ID mapping
In commit b65969856d4f ("igc: Link queues to NAPI instances"), the XSK queues were incorrectly unmapped from their NAPI instances. After discussion on the mailing list and the introduction of a test to codify the expected behavior, we can see that the unmapping causes the check_xsk test to fail: NETIF=enp86s0 ./tools/testing/selftests/drivers/net/queues.py [...] # Check| ksft_eq(q.get('xsk', None), {}, # Check failed None != {} xsk attr on queue we configured not ok 4 queues.check_xsk After this commit, the test passes: ok 4 queues.check_xsk Note that the test itself is only in net-next, so I tested this change by applying it to my local net-next tree, booting, and running the test. Cc: sta...@vger.kernel.org Fixes: b65969856d4f ("igc: Link queues to NAPI instances") Signed-off-by: Joe Damato --- drivers/net/ethernet/intel/igc/igc_xdp.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_xdp.c b/drivers/net/ethernet/intel/igc/igc_xdp.c index 13bbd3346e01..869815f48ac1 100644 --- a/drivers/net/ethernet/intel/igc/igc_xdp.c +++ b/drivers/net/ethernet/intel/igc/igc_xdp.c @@ -86,7 +86,6 @@ static int igc_xdp_enable_pool(struct igc_adapter *adapter, napi_disable(napi); } - igc_set_queue_napi(adapter, queue_id, NULL); set_bit(IGC_RING_FLAG_AF_XDP_ZC, &rx_ring->flags); set_bit(IGC_RING_FLAG_AF_XDP_ZC, &tx_ring->flags); @@ -136,7 +135,6 @@ static int igc_xdp_disable_pool(struct igc_adapter *adapter, u16 queue_id) xsk_pool_dma_unmap(pool, IGC_RX_DMA_ATTR); clear_bit(IGC_RING_FLAG_AF_XDP_ZC, &rx_ring->flags); clear_bit(IGC_RING_FLAG_AF_XDP_ZC, &tx_ring->flags); - igc_set_queue_napi(adapter, queue_id, napi); if (needs_reset) { napi_enable(napi); base-commit: 3c9231ea6497dfc50ac0ef69fff484da27d0df66 -- 2.34.1
[Intel-wired-lan] [PATCH iwl-net v3 1/1] e1000e: change k1 configuration on MTP and later platforms
Starting from Meteor Lake, the Kumeran interface between the integrated MAC and the I219 PHY works at a different frequency. This causes sporadic MDI errors when accessing the PHY, and in rare circumstances could lead to packet corruption. To overcome this, introduce minor changes to the Kumeran idle state (K1) parameters during device initialization. Hardware reset reverts this configuration, therefore it needs to be applied in a few places. Fixes: cc23f4f0b6b9 ("e1000e: Add support for Meteor Lake") Signed-off-by: Vitaly Lifshits --- v3: refactor code to avoid edge case errors v2: address community comments v1: initial version --- drivers/net/ethernet/intel/e1000e/defines.h | 3 + drivers/net/ethernet/intel/e1000e/ich8lan.c | 80 +++-- drivers/net/ethernet/intel/e1000e/ich8lan.h | 4 ++ 3 files changed, 82 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/intel/e1000e/defines.h b/drivers/net/ethernet/intel/e1000e/defines.h index 5e2cfa73f889..8294a7c4f122 100644 --- a/drivers/net/ethernet/intel/e1000e/defines.h +++ b/drivers/net/ethernet/intel/e1000e/defines.h @@ -803,4 +803,7 @@ /* SerDes Control */ #define E1000_GEN_POLL_TIMEOUT 640 +#define E1000_FEXTNVM12_PHYPD_CTRL_MASK0x00C0 +#define E1000_FEXTNVM12_PHYPD_CTRL_P1 0x0080 + #endif /* _E1000_DEFINES_H_ */ diff --git a/drivers/net/ethernet/intel/e1000e/ich8lan.c b/drivers/net/ethernet/intel/e1000e/ich8lan.c index 2f9655cf5dd9..5a02d810c69f 100644 --- a/drivers/net/ethernet/intel/e1000e/ich8lan.c +++ b/drivers/net/ethernet/intel/e1000e/ich8lan.c @@ -285,6 +285,45 @@ static void e1000_toggle_lanphypc_pch_lpt(struct e1000_hw *hw) } } +/** + * e1000_reconfigure_k1_exit_timeout - reconfigure K1 exit timeout to + * align to MTP and later platform requirements. + * @hw: pointer to the HW structure + * + * Context: PHY semaphore must be held by caller. 
+ * Return: 0 on success, negative on failure + */ +static s32 e1000_reconfigure_k1_exit_timeout(struct e1000_hw *hw) +{ + u16 phy_timeout; + u32 fextnvm12; + s32 ret_val; + + if (hw->mac.type < e1000_pch_mtp) + return 0; + + /* Change Kumeran K1 power down state from P0s to P1 */ + fextnvm12 = er32(FEXTNVM12); + fextnvm12 &= ~E1000_FEXTNVM12_PHYPD_CTRL_MASK; + fextnvm12 |= E1000_FEXTNVM12_PHYPD_CTRL_P1; + ew32(FEXTNVM12, fextnvm12); + + /* Wait for the interface to settle */ + usleep_range(1000, 1100); + + /* Change K1 exit timeout */ + ret_val = e1e_rphy_locked(hw, I217_PHY_TIMEOUTS_REG, + &phy_timeout); + if (ret_val) + return ret_val; + + phy_timeout &= ~I217_PHY_TIMEOUTS_K1_EXIT_TO_MASK; + phy_timeout |= 0xF00; + + return e1e_wphy_locked(hw, I217_PHY_TIMEOUTS_REG, + phy_timeout); +} + /** * e1000_init_phy_workarounds_pchlan - PHY initialization workarounds * @hw: pointer to the HW structure @@ -327,15 +366,22 @@ static s32 e1000_init_phy_workarounds_pchlan(struct e1000_hw *hw) * LANPHYPC Value bit to force the interconnect to PCIe mode. */ switch (hw->mac.type) { + case e1000_pch_mtp: + case e1000_pch_lnp: + case e1000_pch_ptp: + case e1000_pch_nvp: + /* At this point the PHY might be inaccessible so don't +* propagate the failure +*/ + if (e1000_reconfigure_k1_exit_timeout(hw)) + e_dbg("Failed to reconfigure K1 exit timeout\n"); + + fallthrough; case e1000_pch_lpt: case e1000_pch_spt: case e1000_pch_cnp: case e1000_pch_tgp: case e1000_pch_adp: - case e1000_pch_mtp: - case e1000_pch_lnp: - case e1000_pch_ptp: - case e1000_pch_nvp: if (e1000_phy_is_accessible_pchlan(hw)) break; @@ -419,8 +465,20 @@ static s32 e1000_init_phy_workarounds_pchlan(struct e1000_hw *hw) * the PHY is in.
*/ ret_val = hw->phy.ops.check_reset_block(hw); - if (ret_val) + if (ret_val) { e_err("ME blocked access to PHY after reset\n"); + goto out; + } + + if (hw->mac.type >= e1000_pch_mtp) { + ret_val = hw->phy.ops.acquire(hw); + if (ret_val) { + e_err("Failed to reconfigure K1 exit timeout\n"); + goto out; + } + ret_val = e1000_reconfigure_k1_exit_timeout(hw); + hw->phy.ops.release(hw); + } } out: @@ -4888,6 +4946,18 @@ static s32 e1000_init_hw_ich8lan(struct e1000_hw *hw) u16 i; e1000_initialize_hw_bits_ich8lan(hw); + if (hw->mac.type >= e1000
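[Editor's note] The FEXTNVM12 update above is a standard read-modify-write: clear the two-bit PHYPD_CTRL field with the inverted mask, then OR in the P1 encoding, leaving every other bit untouched. A minimal userspace sketch of the same pattern (register access mocked with a plain variable; mask values taken from the hunk above, helper name is mine):

```c
#include <assert.h>
#include <stdint.h>

#define E1000E_SKETCH_PHYPD_CTRL_MASK 0x00C0u /* field bits [7:6] */
#define E1000E_SKETCH_PHYPD_CTRL_P1   0x0080u /* K1 power down state P1 */

/* Clear the PHYPD_CTRL field, then program the P1 state, preserving
 * every other bit in the register value. */
static inline uint32_t e1000e_sketch_set_phypd_p1(uint32_t fextnvm12)
{
	fextnvm12 &= ~E1000E_SKETCH_PHYPD_CTRL_MASK;
	fextnvm12 |= E1000E_SKETCH_PHYPD_CTRL_P1;
	return fextnvm12;
}
```

The K1 exit timeout write to I217_PHY_TIMEOUTS_REG in the patch follows the same clear-then-set shape with a different mask.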
Re: [Intel-wired-lan] [PATCH iwl-next v1] ice: refactor the Tx scheduler feature
On 3/4/2025 2:43 PM, Szapar-Mudlaw, Martyna wrote: On 3/3/2025 10:54 AM, Simon Horman wrote: On Wed, Feb 26, 2025 at 12:33:56PM +0100, Mateusz Polchlopek wrote: Embed ice_get_tx_topo_user_sel() inside the only caller: ice_devlink_tx_sched_layers_get(). Instead of jump from the wrapper to the function that does "get" operation it does "get" itself. Remove unnecessary comment and make usage of str_enabled_disabled() in ice_init_tx_topology(). Hi Mateusz, These changes seem reasonable to me. But I wonder if they could be motivated in the commit message. What I mean is, the commit message explains what has been done. But I think it should explain why it has been done. Hi Simon, I'm replying on behalf of Mateusz since he's on leave, and we didn't want to keep this issue waiting too long. Would such extended commit message make sense and address your concerns? "Simplify the code by eliminating an unnecessary wrapper function. Previously, ice_devlink_tx_sched_layers_get() acted as a thin wrapper around ice_get_tx_topo_user_sel(), adding no real value but increasing code complexity. Since both functions were only used once, the wrapper was redundant and contributed approximately 20 lines of unnecessary code. By directly calling ice_get_tx_topo_user_sel(), improve readability and reduce function jumps, without altering functionality. Also remove unnecessary comment and make usage of str_enabled_disabled() in ice_init_tx_topology()." Thank you, Martyna Sorry, I caused some confusion in the previous version of proposed commit message. Here’s the corrected one: "Simplify the code by eliminating an unnecessary wrapper function. Previously, ice_devlink_tx_sched_layers_get() acted as a thin wrapper around ice_get_tx_topo_user_sel(), adding no real value but increasing code complexity. Since both functions were only used once, the wrapper was redundant and contributed approximately 20 lines of unnecessary code. 
Remove ice_get_tx_topo_user_sel() and move its instructions directly into ice_devlink_tx_sched_layers_get(), improving readability and reducing function jumps, without altering functionality. Also remove an unnecessary comment and use str_enabled_disabled() in ice_init_tx_topology()." Suggested-by: Marcin Szycik Reviewed-by: Michal Swiatkowski Reviewed-by: Jedrzej Jagielski Reviewed-by: Przemek Kitszel Reviewed-by: Aleksandr Loktionov Signed-off-by: Mateusz Polchlopek ...
Re: [Intel-wired-lan] [PATCH iwl-next v7 5/9] igc: Add support for frame preemption verification
On 4/3/2025 11:26 pm, Vladimir Oltean wrote: On Mon, Mar 03, 2025 at 05:26:54AM -0500, Faizal Rahim wrote: +static inline bool igc_fpe_is_verify_or_response(union igc_adv_rx_desc *rx_desc, +unsigned int size) +{ + u32 status_error = le32_to_cpu(rx_desc->wb.upper.status_error); + int smd; + + smd = FIELD_GET(IGC_RXDADV_STAT_SMD_TYPE_MASK, status_error); + + return (smd == IGC_RXD_STAT_SMD_TYPE_V || smd == IGC_RXD_STAT_SMD_TYPE_R) && + size == SMD_FRAME_SIZE; +} The NIC should explicitly not respond to frames which have an SMD-V but are not "verify" mPackets (7 octets of 0x55 + 1 octet SMD-V + 60 octets of 0x00 + mCRC - as per 802.3 definitions). Similarly, it should only treat SMD-R frames which contain 7 octets of 0x55 + 1 octet SMD-R + 60 octets of 0x00 + mCRC as "respond" mPackets, and only advance its verification state machine based on those. Specifically, it doesn't look like you are ensuring the packet payload contains 60 octets of zeroes. Is this something that the hardware already does for you, or is it something that needs further validation and differentiation in software? The hardware doesn't handle this, so the igc driver has to do it manually. I missed this handling, and Chwee Lin also noticed the issue while testing this patch series: it wasn't rejecting SMD-V and SMD-R with a non-zero payload. I'll update this patch to include the fix that Chwee Lin implemented and tested. Thanks.
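[Editor's note] Per 802.3, a verify (SMD-V) or respond (SMD-R) mPacket carries exactly 60 zero octets between the SMD and the mCRC, which is the extra check the review above asks for. A hedged sketch of such a payload check (the helper name and the assumption that `data`/`len` cover the post-SMD frame contents are mine, not from the patch):

```c
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#define SMD_FRAME_SIZE 60 /* verify/respond mPacket data size, per the patch */

/* Return true only if the frame is exactly SMD_FRAME_SIZE octets and
 * every octet is zero, as 802.3 requires for verify and respond
 * mPackets. A frame failing this must not advance the verification
 * state machine. */
static bool igc_sketch_smd_payload_is_zero(const uint8_t *data, size_t len)
{
	if (len != SMD_FRAME_SIZE)
		return false;

	for (size_t i = 0; i < len; i++)
		if (data[i] != 0)
			return false;

	return true;
}
```

The actual fix referenced in the reply (implemented by Chwee Lin) may differ; this only illustrates the required semantics.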
[Intel-wired-lan] [PATCH iwl-next v8 11/11] igc: add support to get frame preemption statistics via ethtool
Implemented "ethtool --include-statistics --show-mm" callback for IGC. Tested preemption scenario to check preemption statistics: 1) Trigger verification handshake on both boards: $ sudo ethtool --set-mm enp1s0 pmac-enabled on $ sudo ethtool --set-mm enp1s0 tx-enabled on $ sudo ethtool --set-mm enp1s0 verify-enabled on 2) Set preemptible or express queue in taprio for tx board: $ sudo tc qdisc replace dev enp1s0 parent root handle 100 taprio \ num_tc 4 map 3 2 1 0 3 3 3 3 3 3 3 3 3 3 3 3 \ queues 1@0 1@1 1@2 1@3 base-time 0 sched-entry S F 10 \ fp E E P P 3) Send large size packets on preemptible queue 4) Send small size packets on express queue to preempt packets in preemptible queue 5) Show preemption statistics on the receiving board: $ ethtool --include-statistics --show-mm enp1s0 MAC Merge layer state for enp1s0: pMAC enabled: on TX enabled: on TX active: on TX minimum fragment size: 64 RX minimum fragment size: 60 Verify enabled: on Verify time: 128 Max verify time: 128 Verification status: SUCCEEDED Statistics: MACMergeFrameAssErrorCount: 0 MACMergeFrameSmdErrorCount: 0 MACMergeFrameAssOkCount: 511 MACMergeFragCountRx: 764 MACMergeFragCountTx: 0 MACMergeHoldCount: 0 Co-developed-by: Vinicius Costa Gomes Signed-off-by: Vinicius Costa Gomes Co-developed-by: Chwee-Lin Choong Signed-off-by: Chwee-Lin Choong Signed-off-by: Faizal Rahim --- drivers/net/ethernet/intel/igc/igc_ethtool.c | 39 drivers/net/ethernet/intel/igc/igc_regs.h| 16 2 files changed, 55 insertions(+) diff --git a/drivers/net/ethernet/intel/igc/igc_ethtool.c b/drivers/net/ethernet/intel/igc/igc_ethtool.c index fd4b4b332309..1ed08a3fa78b 100644 --- a/drivers/net/ethernet/intel/igc/igc_ethtool.c +++ b/drivers/net/ethernet/intel/igc/igc_ethtool.c @@ -1819,6 +1819,44 @@ static int igc_ethtool_set_mm(struct net_device *netdev, return igc_tsn_offload_apply(adapter); } +/** + * igc_ethtool_get_frame_ass_error - Get the frame assembly error count. 
+ * @reg_value: Register value for IGC_PRMEXCPRCNT + * Return: The count of frame assembly errors. + */ +static u64 igc_ethtool_get_frame_ass_error(u32 reg_value) +{ + u32 ooo_frame_cnt, ooo_frag_cnt; /* Out of order statistics */ + u32 miss_frame_frag_cnt; + + ooo_frame_cnt = FIELD_GET(IGC_PRMEXCPRCNT_OOO_FRAME_CNT, reg_value); + ooo_frag_cnt = FIELD_GET(IGC_PRMEXCPRCNT_OOO_FRAG_CNT, reg_value); + miss_frame_frag_cnt = FIELD_GET(IGC_PRMEXCPRCNT_MISS_FRAME_FRAG_CNT, reg_value); + + return ooo_frame_cnt + ooo_frag_cnt + miss_frame_frag_cnt; +} + +static u64 igc_ethtool_get_frame_smd_error(u32 reg_value) +{ + return FIELD_GET(IGC_PRMEXCPRCNT_OOO_SMDC, reg_value); +} + +static void igc_ethtool_get_mm_stats(struct net_device *dev, +struct ethtool_mm_stats *stats) +{ + struct igc_adapter *adapter = netdev_priv(dev); + struct igc_hw *hw = &adapter->hw; + u32 reg_value; + + reg_value = rd32(IGC_PRMEXCPRCNT); + + stats->MACMergeFrameAssErrorCount = igc_ethtool_get_frame_ass_error(reg_value); + stats->MACMergeFrameSmdErrorCount = igc_ethtool_get_frame_smd_error(reg_value); + stats->MACMergeFrameAssOkCount = rd32(IGC_PRMPTDRCNT); + stats->MACMergeFragCountRx = rd32(IGC_PRMEVNTRCNT); + stats->MACMergeFragCountTx = rd32(IGC_PRMEVNTTCNT); +} + static int igc_ethtool_get_link_ksettings(struct net_device *netdev, struct ethtool_link_ksettings *cmd) { @@ -2116,6 +2154,7 @@ static const struct ethtool_ops igc_ethtool_ops = { .self_test = igc_ethtool_diag_test, .get_mm = igc_ethtool_get_mm, .set_mm = igc_ethtool_set_mm, + .get_mm_stats = igc_ethtool_get_mm_stats, }; void igc_ethtool_set_ops(struct net_device *netdev) diff --git a/drivers/net/ethernet/intel/igc/igc_regs.h b/drivers/net/ethernet/intel/igc/igc_regs.h index 12ddc5793651..f343c6bfc6be 100644 --- a/drivers/net/ethernet/intel/igc/igc_regs.h +++ b/drivers/net/ethernet/intel/igc/igc_regs.h @@ -222,6 +222,22 @@ #define IGC_FTQF(_n) (0x059E0 + (4 * (_n))) /* 5-tuple Queue Fltr */ +/* Time sync registers - preemption statistics 
*/ +#define IGC_PRMPTDRCNT 0x04284 /* Good RX Preempted Packets */ +#define IGC_PRMEVNTTCNT 0x04298 /* TX Preemption event counter */ +#define IGC_PRMEVNTRCNT 0x0429C /* RX Preemption event counter */ + + /* Preemption Exception Counter */ + #define IGC_PRMEXCPRCNT 0x42A0 +/* Received out of order packets with SMD-C */ +#define IGC_PRMEXCPRCNT_OOO_SMDC 0x00FF +/* Received out of order packets with SMD-C and wrong Frame CNT */ +#define IGC_PRMEXCPRCNT_OOO_FRAME_CNT 0xF
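[Editor's note] The exception counters are packed as 8-bit fields inside IGC_PRMEXCPRCNT and pulled out with FIELD_GET(). For readers unfamiliar with the kernel helper, a minimal userspace approximation: shift the masked value down by the mask's lowest set bit (mask values from the defines quoted above):

```c
#include <assert.h>
#include <stdint.h>

/* Userspace approximation of the kernel's FIELD_GET(): mask the
 * register value, then shift down by the position of the mask's
 * lowest set bit. 'mask' must be non-zero. */
static inline uint32_t sketch_field_get(uint32_t mask, uint32_t reg)
{
	return (reg & mask) >> __builtin_ctz(mask);
}

/* Field masks from the IGC_PRMEXCPRCNT definitions above. */
#define SKETCH_PRMEXCPRCNT_OOO_SMDC      0x00FFu
#define SKETCH_PRMEXCPRCNT_OOO_FRAME_CNT 0xFF00u
```

For example, `sketch_field_get(SKETCH_PRMEXCPRCNT_OOO_FRAME_CNT, reg)` yields the high byte, mirroring how igc_ethtool_get_frame_ass_error() decomposes the register before summing the 8-bit counters.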
[Intel-wired-lan] [PATCH iwl-next v8 00/11] igc: Add support for Frame Preemption feature in IGC
Introduces support for the FPE feature in the IGC driver. The patch series aligns with the upstream FPE API: https://patchwork.kernel.org/project/netdevbpf/cover/20230220122343.1156614-1-vladimir.olt...@nxp.com/ https://patchwork.kernel.org/project/netdevbpf/cover/20230119122705.73054-1-vladimir.olt...@nxp.com/ It builds upon earlier work: https://patchwork.kernel.org/project/netdevbpf/cover/20220520011538.109-1-vinicius.go...@intel.com/ The patch series adds the following functionalities to the IGC driver: a) Configure FPE using `ethtool --set-mm`. b) Display FPE settings via `ethtool --show-mm`. c) View FPE statistics using `ethtool --include-statistics --show-mm`. d) Block setting preemptible tc in taprio since it is not supported yet. Existing code already blocks it in mqprio. Tested: Enabled CONFIG_PROVE_LOCKING, CONFIG_DEBUG_ATOMIC_SLEEP, CONFIG_DMA_API_DEBUG, and CONFIG_KASAN 1) selftests 2) netdev down/up cycles 3) suspend/resume cycles 4) fpe verification No bugs or unusual dmesg logs were observed. Ran 1), 2) and 3) with and without the patch series, compared dmesg and selftest logs — no differences found.
Change Log: v7 -> v8: - Reject SMD-V and SMD-R if the payload contains non-zero values (Vladimir, Chwee Lin) - Move resetting verification state when link is down to a new patch 3/11 (Vladimir) - Move frag_size related handling outside of spin_lock_irq_save(), patch 1/11 (Vladimir) - Renamed IGC_PRMEXPRCNT to IGC_PRMEXCPRCNT, to align with i226 SW User Manual, patch 11/11 (Chwee Lin) - Use IGC_PRMEXPRCNT_OOO_SMDC for frame smd errors instead of frame assembly errors, 11/11 (Chwee Lin) v6 -> v7: - Squash the cpu param to the prev line (Przemek Kitszel) - Use igc_ prefix for fpe_t (Przemek Kitszel) - Move new ops to different line in igc_ethtool_ops (Przemek Kitszel) - Documentation for igc_enable_empty_addr_recv (): rx -> Rx (Przemek Kitszel) - Documentation for igc_enable_empty_addr_recv (): s/IGC/the driver/ (Przemek Kitszel) - Change preferred style of init, from { }, to {} (Przemek Kitszel) - Remove inclusion of umbrella header in igc_tsn.c (Przemek Kitszel) - End enum with "," in igc_txd_popts_type (Przemek Kitszel) - Remove unnecessary braces in igc_fpe_is_verify_or_response() (Przemek Kitszel) v5 -> v6: - Added Tested-by: Furong Xu for patch 1/9 (Vladimir, Furong Xu) - Updated logic in ethtool_mmsv_link_state_handle() (Vladimir, Furong Xu) - Swap sequence of function call in stmmac_set_mm() (Furong Xu) - Log an error if igc_enable_empty_addr_recv() fails (Vladimir) - Move the patch ".. Block setting preemptible traffic .." before ".. Add support to get MAC Merge data .." 
(Vladimir) - Move mmsv function kernel-doc from .h to .c file (Vladimir) v4 -> v5: - Remove "igc: Add support for preemptible traffic class in taprio" patch (Vladimir) - Add a new patch "igc: Block setting preemptible traffic classes in taprio" (Vladimir) - Add kernel-doc for mmsv api (Vladimir) - olininfo_status to use host byte order (Simon) - status_error should host byte type (Simon) - Some code was misplaced in the wrong patch (Vladimir) - Mix of tabs and spaces in patch description (Vladimir) - Created igc_is_pmac_enabled() to reduce code repetition (Vladimir) v3 -> v4: - Fix compilation warnings introduced by this patch series v2 -> v3: - Implement configure_tx() mmsv callback (Vladimir) - Use static_branch_inc() and static_branch_dec() (Vladimir) - Add adapter->fpe.mmsv.pmac_enabled as extra check (Vladimir) - Remove unnecessary error check in igc_fpe_init_tx_descriptor() (Vladimir) - Additional places to use FIELD_PREP() instead of manual bit manipulation (Vladimir) - IGC_TXD_POPTS_SMD_V and IGC_TXD_POPTS_SMD_R type change to enum (Vladimir) - Remove unnecessary netif_running() check in igc_fpe_xmit_frame (Vladimir) - Rate limit print in igc_fpe_send_mpacket (Vladimir) v1 -> v2: - Extract the stmmac verification logic into a common library (Vladimir) - igc to use common library for verification (Vladimir) - Fix syntax for kernel-doc to use "Return:" (Vladimir) - Use FIELD_GET instead of manual bit masking (Vladimir) - Don't assign 0 to statistics counter in igc_ethtool_get_mm_stats() (Vladimir) - Use pmac-enabled as a condition to allow MAC address value 0 (Vladimir) - Define macro register value in increasing value order (Vladimir) - Fix tx-min-frag-size handling for igc (Vladimir) - Handle link state changes with verification in igc (Vladimir) - Add static key for fast path code (Vladimir) - rx_min_frag_size get from constant (Vladimir) v1: https://patchwork.kernel.org/project/netdevbpf/cover/20241216064720.931522-1-faizal.abdul.ra...@linux.intel.com/ 
v2: https://patchwork.kernel.org/project/netdevbpf/cover/20250205100524.1138523-1-faizal.abdul.ra...@linux.intel.com/ v3: https://patchwork.kernel.org/project/netdevbpf/cover/20250207165649.2245320-1-faizal.abdul.ra...@linux.intel.com/ v4: https://patchwork.kernel.org/project/netdevbpf/cover/20250210070207.2615418-1-faizal.abdul.ra
Re: [Intel-wired-lan] [PATCH net-next v9 2/6] net: ena: use napi's aRFS rmap notifiers
On 2025-03-04 11:33 p.m., Arinzon, David wrote: [RE-SEND] I just realized I sent this only to iwl, sorry for spamming. On 2025-03-03 10:11 a.m., Arinzon, David wrote: Use the core's rmap notifiers and delete our own. Acked-by: David Arinzon Signed-off-by: Ahmed Zaki --- drivers/net/ethernet/amazon/ena/ena_netdev.c | 43 +--- 1 file changed, 1 insertion(+), 42 deletions(-) diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c index c1295dfad0d0..6aab85a7c60a 100644 --- a/drivers/net/ethernet/amazon/ena/ena_netdev.c +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c @@ -5,9 +5,6 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#ifdef CONFIG_RFS_ACCEL -#include -#endif /* CONFIG_RFS_ACCEL */ #include #include #include @@ -162,30 +159,6 @@ int ena_xmit_common(struct ena_adapter *adapter, return 0; } -static int ena_init_rx_cpu_rmap(struct ena_adapter *adapter) -{ -#ifdef CONFIG_RFS_ACCEL - u32 i; - int rc; - - adapter->netdev->rx_cpu_rmap = alloc_irq_cpu_rmap(adapter->num_io_queues); - if (!adapter->netdev->rx_cpu_rmap) - return -ENOMEM; - for (i = 0; i < adapter->num_io_queues; i++) { - int irq_idx = ENA_IO_IRQ_IDX(i); - - rc = irq_cpu_rmap_add(adapter->netdev->rx_cpu_rmap, - pci_irq_vector(adapter->pdev, irq_idx)); - if (rc) { - free_irq_cpu_rmap(adapter->netdev->rx_cpu_rmap); - adapter->netdev->rx_cpu_rmap = NULL; - return rc; - } - } -#endif /* CONFIG_RFS_ACCEL */ - return 0; -} - static void ena_init_io_rings_common(struct ena_adapter *adapter, struct ena_ring *ring, u16 qid) { @@ -1596,7 +1569,7 @@ static int ena_enable_msix(struct ena_adapter *adapter) adapter->num_io_queues = irq_cnt - ENA_ADMIN_MSIX_VEC; } - if (ena_init_rx_cpu_rmap(adapter)) + if (netif_enable_cpu_rmap(adapter->netdev, + adapter->num_io_queues)) netif_warn(adapter, probe, adapter->netdev, "Failed to map IRQs to CPUs\n"); @@ -1742,13 +1715,6 @@ static void ena_free_io_irq(struct ena_adapter *adapter) struct ena_irq *irq; int i; -#ifdef
CONFIG_RFS_ACCEL - if (adapter->msix_vecs >= 1) { - free_irq_cpu_rmap(adapter->netdev->rx_cpu_rmap); - adapter->netdev->rx_cpu_rmap = NULL; - } -#endif /* CONFIG_RFS_ACCEL */ - for (i = ENA_IO_IRQ_FIRST_IDX; i < ENA_MAX_MSIX_VEC(io_queue_count); i++) { irq = &adapter->irq_tbl[i]; irq_set_affinity_hint(irq->vector, NULL); @@ -4131,13 +4097,6 @@ static void __ena_shutoff(struct pci_dev *pdev, bool shutdown) ena_dev = adapter->ena_dev; netdev = adapter->netdev; -#ifdef CONFIG_RFS_ACCEL - if ((adapter->msix_vecs >= 1) && (netdev->rx_cpu_rmap)) { - free_irq_cpu_rmap(netdev->rx_cpu_rmap); - netdev->rx_cpu_rmap = NULL; - } - -#endif /* CONFIG_RFS_ACCEL */ /* Make sure timer and reset routine won't be called after * freeing device resources. */ -- 2.43.0 Hi Ahmed, After the merging of this patch, I see the below stack trace when the IRQs are freed. It can be reproduced by unloading and loading the driver using `modprobe -r ena; modprobe ena` (happens during unload) Based on the patchset and the changes to other drivers, I think there's a missing call to the function that releases the affinity notifier (The warn is in https://web.git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/tree/kernel/irq/manage.c#n2031) I saw in the intel code in the patchset that ` netif_napi_set_irq(, -1);` is added? After adding the code snippet I don't see this anymore, but I want to understand whether it's the right call by design. Yes, in ena_down() the IRQs are freed before napis are deleted (where IRQ notifiers are released). The code below is fine (and is better IMO) but you can also delete napis then free IRQs. Thanks for the clarification. Some book-keeping, as this change fixes the issue. The need to use `netif_napi_set_irq` was introduced in https://lore.kernel.org/netdev/20241002001331.65444-2-jdam...@fastly.com/, But, technically, there was no need to use the call with the -1 until the introduction of this patch. Is my understanding correct? Correct.
The new patch attaches resources (IRQ notifiers) to the napi instance that should be released before freeing IRQs. If it's correct, then the fix is for this patch. (Also adding Joe who authored the mentioned patch) I guess so since there was no need to call set_irq(-1) previously.
[Intel-wired-lan] [PATCH net-next 10/16] idpf: add support for nointerrupt queues
Currently, queues are associated 1:1 with interrupt vectors as it's assumed queues are always interrupt-driven. In order to use a queue without an interrupt, idpf still needs to have a vector assigned to it to flush descriptors. This vector can be global and only one for the whole vport to handle all its noirq queues. Always request one excessive vector and configure it in non-interrupt mode right away when creating vport, so that it can be used later by queues when needed. Co-developed-by: Michal Kubiak Signed-off-by: Michal Kubiak Signed-off-by: Alexander Lobakin --- drivers/net/ethernet/intel/idpf/idpf.h| 8 +++ drivers/net/ethernet/intel/idpf/idpf_txrx.h | 4 ++ drivers/net/ethernet/intel/idpf/idpf_dev.c| 11 +++- drivers/net/ethernet/intel/idpf/idpf_lib.c| 2 +- drivers/net/ethernet/intel/idpf/idpf_txrx.c | 8 +++ drivers/net/ethernet/intel/idpf/idpf_vf_dev.c | 11 +++- .../net/ethernet/intel/idpf/idpf_virtchnl.c | 53 +-- 7 files changed, 79 insertions(+), 18 deletions(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf.h b/drivers/net/ethernet/intel/idpf/idpf.h index 6b51a5dcc1e0..50dde09c525b 100644 --- a/drivers/net/ethernet/intel/idpf/idpf.h +++ b/drivers/net/ethernet/intel/idpf/idpf.h @@ -281,6 +281,9 @@ struct idpf_port_stats { * @num_q_vectors: Number of IRQ vectors allocated * @q_vectors: Array of queue vectors * @q_vector_idxs: Starting index of queue vectors + * @noirq_dyn_ctl: register to enable/disable the vector for NOIRQ queues + * @noirq_dyn_ctl_ena: value to write to the above to enable it + * @noirq_v_idx: ID of the NOIRQ vector * @max_mtu: device given max possible MTU * @default_mac_addr: device will give a default MAC to use * @rx_itr_profile: RX profiles for Dynamic Interrupt Moderation @@ -322,6 +325,11 @@ struct idpf_vport { u16 num_q_vectors; struct idpf_q_vector *q_vectors; u16 *q_vector_idxs; + + void __iomem *noirq_dyn_ctl; + u32 noirq_dyn_ctl_ena; + u16 noirq_v_idx; + u16 max_mtu; u8 default_mac_addr[ETH_ALEN]; u16 
rx_itr_profile[IDPF_DIM_PROFILE_SLOTS]; diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.h b/drivers/net/ethernet/intel/idpf/idpf_txrx.h index dd6cc3b5cdab..fb3b352d542e 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.h +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.h @@ -57,6 +57,8 @@ /* Default vector sharing */ #define IDPF_MBX_Q_VEC 1 #define IDPF_MIN_Q_VEC 1 +/* Data vector for NOIRQ queues */ +#define IDPF_RESERVED_VECS 1 #define IDPF_DFLT_TX_Q_DESC_COUNT 512 #define IDPF_DFLT_TX_COMPLQ_DESC_COUNT 512 @@ -288,6 +290,7 @@ struct idpf_ptype_state { * @__IDPF_Q_SW_MARKER: Used to indicate TX queue marker completions * @__IDPF_Q_CRC_EN: enable CRC offload in singleq mode * @__IDPF_Q_HSPLIT_EN: enable header split on Rx (splitq) + * @__IDPF_Q_NOIRQ: queue is polling-driven and has no interrupt * @__IDPF_Q_FLAGS_NBITS: Must be last */ enum idpf_queue_flags_t { @@ -297,6 +300,7 @@ enum idpf_queue_flags_t { __IDPF_Q_SW_MARKER, __IDPF_Q_CRC_EN, __IDPF_Q_HSPLIT_EN, + __IDPF_Q_NOIRQ, __IDPF_Q_FLAGS_NBITS, }; diff --git a/drivers/net/ethernet/intel/idpf/idpf_dev.c b/drivers/net/ethernet/intel/idpf/idpf_dev.c index 41e4bd49402a..5f177933b55c 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_dev.c +++ b/drivers/net/ethernet/intel/idpf/idpf_dev.c @@ -73,7 +73,7 @@ static int idpf_intr_reg_init(struct idpf_vport *vport) int num_vecs = vport->num_q_vectors; struct idpf_vec_regs *reg_vals; int num_regs, i, err = 0; - u32 rx_itr, tx_itr; + u32 rx_itr, tx_itr, val; u16 total_vecs; total_vecs = idpf_get_reserved_vecs(vport->adapter); @@ -117,6 +117,15 @@ static int idpf_intr_reg_init(struct idpf_vport *vport) intr->tx_itr = idpf_get_reg_addr(adapter, tx_itr); } + /* Data vector for NOIRQ queues */ + + val = reg_vals[vport->q_vector_idxs[i] - IDPF_MBX_Q_VEC].dyn_ctl_reg; + vport->noirq_dyn_ctl = idpf_get_reg_addr(adapter, val); + + val = PF_GLINT_DYN_CTL_WB_ON_ITR_M | PF_GLINT_DYN_CTL_INTENA_MSK_M | + FIELD_PREP(PF_GLINT_DYN_CTL_ITR_INDX_M, IDPF_NO_ITR_UPDATE_IDX); + 
vport->noirq_dyn_ctl_ena = val; + free_reg_vals: kfree(reg_vals); diff --git a/drivers/net/ethernet/intel/idpf/idpf_lib.c b/drivers/net/ethernet/intel/idpf/idpf_lib.c index e17582d15e27..2594ca38e8ca 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_lib.c +++ b/drivers/net/ethernet/intel/idpf/idpf_lib.c @@ -1126,7 +1126,7 @@ static struct idpf_vport *idpf_vport_alloc(struct idpf_adapter *adapter, vport->default_vport = adapter->num_alloc_vports < idpf_get_default_vports(adapter); - num_max_q = max(max_q->max_txq, max_q->max_rxq); + num_max_q = max(max_q->max_txq, max_q->max_r
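[Editor's note] The noirq_dyn_ctl_ena value above is composed with FIELD_PREP() plus a couple of single-bit flags. The userspace equivalent of FIELD_PREP() is a shift up by the mask's lowest set bit; the sketch below uses illustrative bit positions, not the real PF_GLINT_DYN_CTL layout:

```c
#include <assert.h>
#include <stdint.h>

/* Userspace approximation of the kernel's FIELD_PREP(): place 'val'
 * into the field described by 'mask'. 'mask' must be non-zero. */
static inline uint32_t sketch_field_prep(uint32_t mask, uint32_t val)
{
	return (val << __builtin_ctz(mask)) & mask;
}

/* Illustrative bit layout -- NOT the actual PF_GLINT_DYN_CTL register. */
#define SKETCH_DYN_CTL_WB_ON_ITR  (1u << 30) /* write back without interrupt */
#define SKETCH_DYN_CTL_INTENA_MSK (1u << 31) /* mask interrupt enable */
#define SKETCH_DYN_CTL_ITR_INDX   (0x3u << 3) /* ITR index field */
#define SKETCH_NO_ITR_UPDATE_IDX  3u          /* "no ITR update" encoding */

/* Build the enable value for the vector serving NOIRQ queues, in the
 * same shape as the idpf hunk above. */
static inline uint32_t sketch_noirq_dyn_ctl_ena(void)
{
	return SKETCH_DYN_CTL_WB_ON_ITR | SKETCH_DYN_CTL_INTENA_MSK |
	       sketch_field_prep(SKETCH_DYN_CTL_ITR_INDX,
				 SKETCH_NO_ITR_UPDATE_IDX);
}
```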
[Intel-wired-lan] [PATCH net-next 16/16] idpf: add XDP RSS hash hint
Add &xdp_metadata_ops with a callback to get RSS hash hint from the descriptor. Declare the splitq 32-byte descriptor as 4 u64s to parse them more efficiently when possible. Signed-off-by: Alexander Lobakin --- drivers/net/ethernet/intel/idpf/xdp.h | 64 +++ drivers/net/ethernet/intel/idpf/xdp.c | 28 +++- 2 files changed, 91 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/idpf/xdp.h b/drivers/net/ethernet/intel/idpf/xdp.h index a2ac1b2f334f..52783a5c8e0f 100644 --- a/drivers/net/ethernet/intel/idpf/xdp.h +++ b/drivers/net/ethernet/intel/idpf/xdp.h @@ -107,6 +107,70 @@ static inline void idpf_xdp_tx_finalize(void *_xdpq, bool sent, bool flush) libeth_xdpsq_unlock(&xdpq->xdp_lock); } +struct idpf_xdp_rx_desc { + aligned_u64 qw0; +#define IDPF_XDP_RX_BUFQ BIT_ULL(47) +#define IDPF_XDP_RX_GEN BIT_ULL(46) +#define IDPF_XDP_RX_LEN GENMASK_ULL(45, 32) +#define IDPF_XDP_RX_PT GENMASK_ULL(25, 16) + + aligned_u64 qw1; +#define IDPF_XDP_RX_BUF GENMASK_ULL(47, 32) +#define IDPF_XDP_RX_EOP BIT_ULL(1) + + aligned_u64 qw2; +#define IDPF_XDP_RX_HASH GENMASK_ULL(31, 0) + + aligned_u64 qw3; +} __aligned(4 * sizeof(u64)); +static_assert(sizeof(struct idpf_xdp_rx_desc) == + sizeof(struct virtchnl2_rx_flex_desc_adv_nic_3)); + +#define idpf_xdp_rx_bufq(desc) !!((desc)->qw0 & IDPF_XDP_RX_BUFQ) +#define idpf_xdp_rx_gen(desc) !!((desc)->qw0 & IDPF_XDP_RX_GEN) +#define idpf_xdp_rx_len(desc) FIELD_GET(IDPF_XDP_RX_LEN, (desc)->qw0) +#define idpf_xdp_rx_pt(desc) FIELD_GET(IDPF_XDP_RX_PT, (desc)->qw0) +#define idpf_xdp_rx_buf(desc) FIELD_GET(IDPF_XDP_RX_BUF, (desc)->qw1) +#define idpf_xdp_rx_eop(desc) !!((desc)->qw1 & IDPF_XDP_RX_EOP) +#define idpf_xdp_rx_hash(desc) FIELD_GET(IDPF_XDP_RX_HASH, (desc)->qw2) + +static inline void +idpf_xdp_get_qw0(struct idpf_xdp_rx_desc *desc, +const struct virtchnl2_rx_flex_desc_adv_nic_3 *rxd) +{ +#ifdef __LIBETH_WORD_ACCESS + desc->qw0 = ((const typeof(desc))rxd)->qw0; +#else + desc->qw0 = ((u64)le16_to_cpu(rxd->pktlen_gen_bufq_id) << 32) |
((u64)le16_to_cpu(rxd->ptype_err_fflags0) << 16); +#endif +} + +static inline void +idpf_xdp_get_qw1(struct idpf_xdp_rx_desc *desc, +const struct virtchnl2_rx_flex_desc_adv_nic_3 *rxd) +{ +#ifdef __LIBETH_WORD_ACCESS + desc->qw1 = ((const typeof(desc))rxd)->qw1; +#else + desc->qw1 = ((u64)le16_to_cpu(rxd->buf_id) << 32) | + rxd->status_err0_qw1; +#endif +} + +static inline void +idpf_xdp_get_qw2(struct idpf_xdp_rx_desc *desc, +const struct virtchnl2_rx_flex_desc_adv_nic_3 *rxd) +{ +#ifdef __LIBETH_WORD_ACCESS + desc->qw2 = ((const typeof(desc))rxd)->qw2; +#else + desc->qw2 = ((u64)rxd->hash3 << 24) | + ((u64)rxd->ff2_mirrid_hash2.hash2 << 16) | + le16_to_cpu(rxd->hash1); +#endif +} + void idpf_xdp_set_features(const struct idpf_vport *vport); int idpf_xdp(struct net_device *dev, struct netdev_bpf *xdp); diff --git a/drivers/net/ethernet/intel/idpf/xdp.c b/drivers/net/ethernet/intel/idpf/xdp.c index 1834f217a07f..b0b4b785bf8e 100644 --- a/drivers/net/ethernet/intel/idpf/xdp.c +++ b/drivers/net/ethernet/intel/idpf/xdp.c @@ -386,12 +386,38 @@ int idpf_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, idpf_xdp_tx_finalize); } +static int idpf_xdpmo_rx_hash(const struct xdp_md *ctx, u32 *hash, + enum xdp_rss_hash_type *rss_type) +{ + const struct libeth_xdp_buff *xdp = (typeof(xdp))ctx; + const struct idpf_rx_queue *rxq; + struct idpf_xdp_rx_desc desc; + struct libeth_rx_pt pt; + + rxq = libeth_xdp_buff_to_rq(xdp, typeof(*rxq), xdp_rxq); + + idpf_xdp_get_qw0(&desc, xdp->desc); + + pt = rxq->rx_ptype_lkup[idpf_xdp_rx_pt(&desc)]; + if (!libeth_rx_pt_has_hash(rxq->xdp_rxq.dev, pt)) + return -ENODATA; + + idpf_xdp_get_qw2(&desc, xdp->desc); + + return libeth_xdpmo_rx_hash(hash, rss_type, idpf_xdp_rx_hash(&desc), + pt); +} + +static const struct xdp_metadata_ops idpf_xdpmo = { + .xmo_rx_hash = idpf_xdpmo_rx_hash, +}; + void idpf_xdp_set_features(const struct idpf_vport *vport) { if (!idpf_is_queue_model_split(vport->rxq_model)) return; -
libeth_xdp_set_features_noredir(vport->netdev); + libeth_xdp_set_features_noredir(vport->netdev, &idpf_xdpmo); } /** -- 2.48.1
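[Editor's note] When `__LIBETH_WORD_ACCESS` is unavailable, idpf_xdp_get_qw2() stitches the 32-bit RSS hash together from three descriptor pieces (hash1: 16 bits, hash2: 8 bits, hash3: 8 bits) into the low half of qw2. A standalone sketch of that byte-access path (host-endian assumed, `le16_to_cpu` elided; names are mine):

```c
#include <assert.h>
#include <stdint.h>

/* Assemble qw2's low 32 bits from the three hash pieces, the way the
 * byte-access fallback of idpf_xdp_get_qw2() does:
 * hash3 -> bits [31:24], hash2 -> bits [23:16], hash1 -> bits [15:0]. */
static inline uint64_t sketch_assemble_qw2(uint16_t hash1, uint8_t hash2,
					   uint8_t hash3)
{
	return ((uint64_t)hash3 << 24) | ((uint64_t)hash2 << 16) | hash1;
}

#define SKETCH_RX_HASH_MASK 0xFFFFFFFFull /* GENMASK_ULL(31, 0) */

/* Equivalent of idpf_xdp_rx_hash(): the full hash is simply the low
 * 32 bits of qw2. */
static inline uint32_t sketch_rx_hash(uint64_t qw2)
{
	return (uint32_t)(qw2 & SKETCH_RX_HASH_MASK);
}
```

On 64-bit little-endian targets the word-access path copies the whole qword in one load instead, which is the point of declaring the descriptor as four u64s.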
Re: [Intel-wired-lan] [PATCH iwl-net] igc: Fix XSK queue NAPI ID mapping
On 05.03.25 19:09, Joe Damato wrote: In commit b65969856d4f ("igc: Link queues to NAPI instances"), the XSK queues were incorrectly unmapped from their NAPI instances. After discussion on the mailing list and the introduction of a test to codify the expected behavior, we can see that the unmapping causes the check_xsk test to fail: NETIF=enp86s0 ./tools/testing/selftests/drivers/net/queues.py [...] # Check| ksft_eq(q.get('xsk', None), {}, # Check failed None != {} xsk attr on queue we configured not ok 4 queues.check_xsk After this commit, the test passes: ok 4 queues.check_xsk Note that the test itself is only in net-next, so I tested this change by applying it to my local net-next tree, booting, and running the test. Cc: sta...@vger.kernel.org Fixes: b65969856d4f ("igc: Link queues to NAPI instances") Signed-off-by: Joe Damato --- drivers/net/ethernet/intel/igc/igc_xdp.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_xdp.c b/drivers/net/ethernet/intel/igc/igc_xdp.c index 13bbd3346e01..869815f48ac1 100644 --- a/drivers/net/ethernet/intel/igc/igc_xdp.c +++ b/drivers/net/ethernet/intel/igc/igc_xdp.c @@ -86,7 +86,6 @@ static int igc_xdp_enable_pool(struct igc_adapter *adapter, napi_disable(napi); } - igc_set_queue_napi(adapter, queue_id, NULL); set_bit(IGC_RING_FLAG_AF_XDP_ZC, &rx_ring->flags); set_bit(IGC_RING_FLAG_AF_XDP_ZC, &tx_ring->flags); @@ -136,7 +135,6 @@ static int igc_xdp_disable_pool(struct igc_adapter *adapter, u16 queue_id) xsk_pool_dma_unmap(pool, IGC_RX_DMA_ATTR); clear_bit(IGC_RING_FLAG_AF_XDP_ZC, &rx_ring->flags); clear_bit(IGC_RING_FLAG_AF_XDP_ZC, &tx_ring->flags); - igc_set_queue_napi(adapter, queue_id, napi); if (needs_reset) { napi_enable(napi); base-commit: 3c9231ea6497dfc50ac0ef69fff484da27d0df66 igc_set_queue_napi() could be made static as it is only used within igc_main.c after this change. Reviewed-by: Gerhard Engleder
Re: [Intel-wired-lan] [PATCH iwl-next v8 01/11] net: stmmac: move frag_size handling out of spin_lock
On Wed, Mar 05, 2025 at 08:00:16AM -0500, Faizal Rahim wrote: > The upcoming patch will extract verification logic into a new module, > MMSV (MAC Merge Software Verification). MMSV will handle most FPE fields, > except frag_size. It introduces its own lock (mmsv->lock), replacing > fpe_cfg->lock. > > Since frag_size handling remains in the driver, the existing rtnl_lock() > is sufficient. Move frag_size handling out of spin_lock_irq_save() to keep > the upcoming patch a pure refactoring without behavior changes. > > Signed-off-by: Faizal Rahim > --- Reviewed-by: Vladimir Oltean
Re: [Intel-wired-lan] [PATCH iwl-next v8 03/11] net: ethtool: mm: reset verification status when link is down
On Wed, Mar 05, 2025 at 08:00:18AM -0500, Faizal Rahim wrote: > When the link partner goes down, "ethtool --show-mm" still displays > "Verification status: SUCCEEDED," reflecting a previous state that is > no longer valid. > > Reset the verification status to ensure it reflects the current state. > > Signed-off-by: Faizal Rahim > --- > net/ethtool/mm.c | 5 +++-- > 1 file changed, 3 insertions(+), 2 deletions(-) > > diff --git a/net/ethtool/mm.c b/net/ethtool/mm.c > index aa43df2ecac0..ad9b40034003 100644 > --- a/net/ethtool/mm.c > +++ b/net/ethtool/mm.c > @@ -415,8 +415,9 @@ void ethtool_mmsv_link_state_handle(struct ethtool_mmsv > *mmsv, bool up) > /* New link => maybe new partner => new verification process */ > ethtool_mmsv_apply(mmsv); > } else { > - mmsv->status = ETHTOOL_MM_VERIFY_STATUS_INITIAL; > - mmsv->verify_retries = ETHTOOL_MM_MAX_VERIFY_RETRIES; This is not what I requested. The lines with "-" here should have never been introduced by patch 02/11 in the first place. > + /* Reset the reported verification state while the link is down > */ > + if (mmsv->verify_enabled) > + mmsv->status = ETHTOOL_MM_VERIFY_STATUS_INITIAL; > > /* No link or pMAC not enabled */ > ethtool_mmsv_configure_pmac(mmsv, false); > -- > 2.34.1 >
Re: [Intel-wired-lan] [PATCH iwl-next v8 05/11] igc: optimize the TX packet buffer utilization
On Wed, Mar 05, 2025 at 08:00:20AM -0500, Faizal Rahim wrote: > Packet buffers (RX + TX) total 64KB. Neither RX or TX buffers can be > larger than 34KB. So divide the buffer equally, 32KB for each. > > Co-developed-by: Vinicius Costa Gomes > Signed-off-by: Vinicius Costa Gomes > Signed-off-by: Faizal Rahim > --- > drivers/net/ethernet/intel/igc/igc_defines.h | 3 ++- > 1 file changed, 2 insertions(+), 1 deletion(-) > > diff --git a/drivers/net/ethernet/intel/igc/igc_defines.h > b/drivers/net/ethernet/intel/igc/igc_defines.h > index 8e449904aa7d..516ef70c98e9 100644 > --- a/drivers/net/ethernet/intel/igc/igc_defines.h > +++ b/drivers/net/ethernet/intel/igc/igc_defines.h > @@ -400,7 +400,8 @@ > #define I225_TXPBSIZE_DEFAULT0x0414 /* TXPBSIZE default */ > #define IGC_RXPBS_CFG_TS_EN 0x8000 /* Timestamp in Rx buffer */ > > -#define IGC_TXPBSIZE_TSN 0x04145145 /* 5k bytes buffer for each queue */ > + /* 7KB bytes buffer for each tx queue (total 4 queues) + 4KB for BMC*/ Strange formatting here: space before "/*" but no space before "*/"? > +#define IGC_TXPBSIZE_TSN 0x041c71c7 > > #define IGC_DTXMXPKTSZ_TSN 0x19 /* 1600 bytes of max TX DMA packet size */ > #define IGC_DTXMXPKTSZ_DEFAULT 0x98 /* 9728-byte Jumbo frames */ > -- > 2.34.1 >
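[Editor's note] The comments on IGC_TXPBSIZE_TSN suggest the register packs one size field per packet-buffer section, in KB. Under the assumption of 6-bit fields packed from bit 0 upward (my reading of the constants, not a statement of the i225/i226 register layout), both the new value 0x041c71c7 and the old 0x04145145 decode consistently with their comments:

```c
#include <assert.h>
#include <stdint.h>

/* Extract the n-th 6-bit packet-buffer-size field (in KB), assuming
 * fields are packed from bit 0 upward. This layout is inferred from
 * the constants in the patch, not taken from the datasheet. */
static inline uint32_t sketch_pbsize_field(uint32_t reg, unsigned int n)
{
	return (reg >> (6 * n)) & 0x3F;
}
```

With this decode, 0x041c71c7 yields 7, 7, 7, 7 for fields 0..3 and 4 for field 4, matching "7KB for each tx queue (total 4 queues) + 4KB for BMC"; the old 0x04145145 yields 5, 5, 5, 5 and 4, matching its "5k bytes buffer for each queue" comment. This does not settle the reviewer's question about RXPBSIZE's 0xf08f, whose field widths appear to differ.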
Re: [Intel-wired-lan] [PATCH iwl-next v8 06/11] igc: set the RX packet buffer size for TSN mode
On Wed, Mar 05, 2025 at 08:00:21AM -0500, Faizal Rahim wrote:
> In preparation for supporting frame preemption, when entering TSN mode
> set the receive packet buffer to 16KB for the Express MAC, 16KB for
> the Preemptible MAC and 2KB for the BMC, according to the datasheet
> section 7.1.3.2.
>
> Co-developed-by: Vinicius Costa Gomes
> Signed-off-by: Vinicius Costa Gomes
> Signed-off-by: Faizal Rahim
> ---
>  drivers/net/ethernet/intel/igc/igc_defines.h |  3 +++
>  drivers/net/ethernet/intel/igc/igc_tsn.c     | 13 +++--
>  2 files changed, 14 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/net/ethernet/intel/igc/igc_defines.h b/drivers/net/ethernet/intel/igc/igc_defines.h
> index 516ef70c98e9..b19ac6f30dac 100644
> --- a/drivers/net/ethernet/intel/igc/igc_defines.h
> +++ b/drivers/net/ethernet/intel/igc/igc_defines.h
> @@ -402,6 +402,9 @@
>
>  /* 7KB bytes buffer for each tx queue (total 4 queues) + 4KB for BMC*/
>  #define IGC_TXPBSIZE_TSN	0x041c71c7
> +/* 15KB for EXP + 15KB for BE + 2KB for BMC */
> +#define IGC_RXPBSIZE_TSN	0xf08f
> +#define IGC_RXPBSIZE_SIZE_MASK	0x0001

Does 0xf08f have any further meaning, does it represent anything bitwise?
(similar question for IGC_TXPBSIZE_TSN in the previous patch). I don't see
the correlation between the values mentioned in the comment and the magic
constant. If RXPBSIZE has a bitwise meaning, maybe you could rewrite the
magic value with macros for each field.
Re: [Intel-wired-lan] [PATCH iwl-next v1] ice: add E830 Earliest TxTime First Offload support
On 2/27/2025 3:13 AM, Paul Greenwalt wrote:

E830 supports Earliest TxTime First (ETF) hardware offload, which is
configured via the ETF Qdisc (see tc-etf(8)).

ETF introduces a new Tx flow mechanism that utilizes a timestamp ring
(tstamp_ring) alongside the standard Tx ring. This timestamp ring is used
to indicate when hardware will transmit a packet. The allocation and
initialization of the timestamp ring occur when the feature is enabled via
tc-etf. Since the timestamp ring and Tx ring are tightly coupled, both
must be configured simultaneously.

To support ETF, the following flags are introduced:

- ICE_F_TXTIME: Device feature flag set for E830 NICs, indicating ETF
  support.
- ICE_FLAG_TXTIME: PF-level flag indicating whether ETF is enabled on any
  Tx queue. It is checked during ring allocation to determine if timestamp
  rings should be allocated and is also referenced when modifying queue
  count via ethtool -G.
- ICE_TX_FLAGS_TXTIME: Per-ring flag set when ETF is enabled and cleared
  when disabled for a specific Tx queue. It helps determine ETF status
  when transmitting timestamped packets and is used by ice_is_txtime_ena()
  to check if ETF is enabled on any Tx queue.

Due to a hardware issue that can result in a malicious driver detection
event, additional timestamp descriptors are required when wrapping the
timestamp ring. Up to 64 additional timestamp descriptors are reserved,
reducing the available Tx descriptors. To accommodate this,
ICE_MAX_NUM_DESC_BY_MAC is introduced, defining:

- E830: Maximum Tx descriptor length of 8096 (8K - 32 - 64 for timestamp
  fetch descriptors).
- E810 and E82X: Maximum Tx descriptor length of 8160 (8K - 32).

This doesn't apply. Also, some comments from a glance over this.

...

+ice_setup_tx_ctx(struct ice_tx_ring *ring, struct ice_tlan_ctx *tlan_ctx, u16 pf_q)
+{
+	struct ice_vsi *vsi = ring->vsi;
+	struct ice_hw *hw = &vsi->back->hw;

RCT (reverse Christmas tree: order local declarations longest to shortest)

...
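For context, the user-space side referenced above (tc-etf(8)) enables the
offload roughly as follows. The device name, queue mapping, clockid and
delta are illustrative placeholders, not values from this patch:

```shell
# Attach an mqprio root qdisc so traffic classes map onto hardware
# queues, then hang an offloaded ETF qdisc off one Tx queue.
tc qdisc replace dev eth0 parent root handle 100 mqprio \
        num_tc 3 map 2 2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 \
        queues 1@0 1@1 2@2 hw 0

# "offload" requests the hardware launch-time offload this patch adds;
# "delta" is the lead time (ns) the stack gives the NIC per packet.
tc qdisc replace dev eth0 parent 100:1 etf \
        clockid CLOCK_TAI delta 300000 offload
```

Applications then set SO_TXTIME on their sockets and attach a launch time
to each packet via the SCM_TXTIME control message.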
 static int
-ice_vsi_cfg_txq(struct ice_vsi *vsi, struct ice_tx_ring *ring,
-		struct ice_aqc_add_tx_qgrp *qg_buf)
+ice_vsi_cfg_txq(const struct ice_vsi *vsi, struct ice_tx_ring *ring,
+		struct ice_tx_ring *tstamp_ring,
+		struct ice_aqc_add_tx_qgrp *qg_buf,
+		struct ice_aqc_set_txtime_qgrp *txtime_qg_buf)
 {
 	u8 buf_len = struct_size(qg_buf, txqs, 1);
 	struct ice_tlan_ctx tlan_ctx = { 0 };
@@ -947,6 +1047,27 @@ ice_vsi_cfg_txq(struct ice_vsi *vsi, struct ice_tx_ring *ring,
 	if (pf_q == le16_to_cpu(txq->txq_id))
 		ring->txq_teid = le32_to_cpu(txq->q_teid);

+	if (tstamp_ring) {
+		u8 txtime_buf_len = struct_size(txtime_qg_buf, txtimeqs, 1);
+		struct ice_txtime_ctx txtime_ctx = { 0 };

IIRC, preference is to initialize without the '0', {}

+
+		ice_setup_txtime_ctx(tstamp_ring, &txtime_ctx,
+				     !!(ring->flags & ICE_TX_FLAGS_TXTIME));
+		ice_pack_txtime_ctx(&txtime_ctx,
+				    &txtime_qg_buf->txtimeqs[0].txtime_ctx);
+
+		tstamp_ring->tail =
+			hw->hw_addr + E830_GLQTX_TXTIME_DBELL_LSB(pf_q);
+
+		status = ice_aq_set_txtimeq(hw, pf_q, 1, txtime_qg_buf,
+					    txtime_buf_len, NULL);
+		if (status) {
+			dev_err(ice_pf_to_dev(pf), "Failed to set Tx Time queue context, error: %d\n",
+				status);
+			return status;
+		}
+	}
+
 	return 0;
 }

...

+int
+ice_aq_set_txtimeq(struct ice_hw *hw, u16 txtimeq, u8 q_count,
+		   struct ice_aqc_set_txtime_qgrp *txtime_qg, u16 buf_size,
+		   struct ice_sq_cd *cd)
+{
+	struct ice_aqc_set_txtimeqs *cmd;
+	struct ice_aq_desc desc;
+	u16 size;
+
+	cmd = &desc.params.set_txtimeqs;
+
+	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_set_txtimeqs);
+
+	if (!txtime_qg)
+		return -EINVAL;
+
+	if (txtimeq > ICE_TXTIME_MAX_QUEUE || q_count < 1 ||
+	    q_count > ICE_SET_TXTIME_MAX_Q_AMOUNT)
+		return -EINVAL;

Should the bailout conditions be checked first?
+
+	size = struct_size(txtime_qg, txtimeqs, q_count);
+

No newline here

+	if (buf_size != size)
+		return -EINVAL;
+
+	desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD);
+
+	cmd->q_id = cpu_to_le16(txtimeq);
+	cmd->q_amount = cpu_to_le16(q_count);
+	return ice_aq_send_cmd(hw, &desc, txtime_qg, buf_size, cd);
+}
+
+/**
+ * ice_aq_ena_dis_txtimeq - enable/disable Tx time queue
+ * @hw: pointer to the hardware structure
+ * @txtimeq: first Tx time queue id to configure
+ * @q_count: number of queues to configure
+ * @q_ena: enable/disable Tx time queue
+ * @txtime_qg: holds the first Tx time queue that failed enable/disable on
+ * response
+ *
Re: [Intel-wired-lan] [PATCH net-next 01/16] libeth: convert to netmem
On Wed, Mar 5, 2025 at 8:23 AM Alexander Lobakin wrote:
>
> Back when the libeth Rx core was initially written, devmem was a draft
> and netmem_ref didn't exist in the mainline. Now that it's here, make
> libeth MP-agnostic before introducing any new code or any new library
> users.
>
> When it's known that the created PP/FQ is for header buffers, use faster
> "unsafe" underscored netmem <--> virt accessors as netmem_is_net_iov()
> is always false in that case, but consumes some cycles (bit test +
> true branch).
>
> Misc: replace explicit EXPORT_SYMBOL_NS_GPL("NS") with
> DEFAULT_SYMBOL_NAMESPACE.
>
> Signed-off-by: Alexander Lobakin
> ---
>  include/net/libeth/rx.h                       | 22 +++--
>  drivers/net/ethernet/intel/iavf/iavf_txrx.c   | 14
>  .../ethernet/intel/idpf/idpf_singleq_txrx.c   |  2 +-
>  drivers/net/ethernet/intel/idpf/idpf_txrx.c   | 33 +++
>  drivers/net/ethernet/intel/libeth/rx.c        | 20 ++-
>  5 files changed, 51 insertions(+), 40 deletions(-)
>
> diff --git a/include/net/libeth/rx.h b/include/net/libeth/rx.h
> index ab05024be518..7d5dc58984b1 100644
> --- a/include/net/libeth/rx.h
> +++ b/include/net/libeth/rx.h
> @@ -1,5 +1,5 @@
>  /* SPDX-License-Identifier: GPL-2.0-only */
> -/* Copyright (C) 2024 Intel Corporation */
> +/* Copyright (C) 2024-2025 Intel Corporation */
>
>  #ifndef __LIBETH_RX_H
>  #define __LIBETH_RX_H
> @@ -31,7 +31,7 @@
>
>  /**
>   * struct libeth_fqe - structure representing an Rx buffer (fill queue element)
> - * @page: page holding the buffer
> + * @netmem: network memory reference holding the buffer
>   * @offset: offset from the page start (to the headroom)
>   * @truesize: total space occupied by the buffer (w/ headroom and tailroom)
>   *
> @@ -40,7 +40,7 @@
>   * former, @offset is always 0 and @truesize is always ```PAGE_SIZE```.
>   */
>  struct libeth_fqe {
> -	struct page *page;
> +	netmem_ref netmem;
>  	u32 offset;
>  	u32 truesize;
>  } __aligned_largest;
> @@ -102,15 +102,16 @@ static inline dma_addr_t libeth_rx_alloc(const struct libeth_fq_fp *fq, u32 i)
>  	struct libeth_fqe *buf = &fq->fqes[i];
>
>  	buf->truesize = fq->truesize;
> -	buf->page = page_pool_dev_alloc(fq->pp, &buf->offset, &buf->truesize);
> -	if (unlikely(!buf->page))
> +	buf->netmem = page_pool_dev_alloc_netmem(fq->pp, &buf->offset,
> +						 &buf->truesize);
> +	if (unlikely(!buf->netmem))
>  		return DMA_MAPPING_ERROR;
>
> -	return page_pool_get_dma_addr(buf->page) + buf->offset +
> +	return page_pool_get_dma_addr_netmem(buf->netmem) + buf->offset +
>  	       fq->pp->p.offset;
>  }
>
> -void libeth_rx_recycle_slow(struct page *page);
> +void libeth_rx_recycle_slow(netmem_ref netmem);
>
> /**
>  * libeth_rx_sync_for_cpu - synchronize or recycle buffer post DMA
> @@ -126,18 +127,19 @@ void libeth_rx_recycle_slow(struct page *page);
>  static inline bool libeth_rx_sync_for_cpu(const struct libeth_fqe *fqe,
>  					  u32 len)
>  {
> -	struct page *page = fqe->page;
> +	netmem_ref netmem = fqe->netmem;
>
>  	/* Very rare, but possible case. The most common reason:
>  	 * the last fragment contained FCS only, which was then
>  	 * stripped by the HW.
>  	 */
>  	if (unlikely(!len)) {
> -		libeth_rx_recycle_slow(page);
> +		libeth_rx_recycle_slow(netmem);

I think before this patch this would have expanded to:

	page_pool_put_full_page(pool, page, true);

But now I think it expands to:

	page_pool_put_full_netmem(netmem_get_pp(netmem), netmem, false);

Is the switch from true to false intentional? Is this a slow path so it
doesn't matter?
>  		return false;
>  	}
>
> -	page_pool_dma_sync_for_cpu(page->pp, page, fqe->offset, len);
> +	page_pool_dma_sync_netmem_for_cpu(netmem_get_pp(netmem), netmem,
> +					  fqe->offset, len);
>
>  	return true;
>  }
> diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.c b/drivers/net/ethernet/intel/iavf/iavf_txrx.c
> index 422312b8b54a..35d353d38129 100644
> --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.c
> +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.c
> @@ -723,7 +723,7 @@ static void iavf_clean_rx_ring(struct iavf_ring *rx_ring)
>  	for (u32 i = rx_ring->next_to_clean; i != rx_ring->next_to_use; ) {
>  		const struct libeth_fqe *rx_fqes = &rx_ring->rx_fqes[i];
>
> -		page_pool_put_full_page(rx_ring->pp, rx_fqes->page, false);
> +		libeth_rx_recycle_slow(rx_fqes->netmem);
>
>  		if (unlikely(++i == rx_ring->count))
>  			i = 0;
> @@ -1197,10 +1197,11 @@ static void iavf_add_rx_frag(struct sk_buff *skb, const
Re: [Intel-wired-lan] MMIO write access to an invalid page in i40e_clear_hw()
On 25. 3. 5. 19:13, Loktionov, Aleksandr wrote:
>
>> -----Original Message-----
>> From: Intel-wired-lan On Behalf Of Kyungwook Boo
>> Sent: Monday, March 3, 2025 11:20 AM
>> To: Nguyen, Anthony L ; Kitszel, Przemyslaw
>> Cc: intel-wired-...@lists.osuosl.org; linux-ker...@vger.kernel.org
>> Subject: [Intel-wired-lan] MMIO write access to an invalid page in
>> i40e_clear_hw()
>>
> Please start the commit title with 'fix' to explicitly tell what your
> patch does, i.e.:
>   i40e: fix MMIO write access to an invalid page in i40e_clear_hw
>
> Please add a Fixes: tag:
> https://www.kernel.org/doc/html/latest/process/submitting-patches.html

Thanks for the guidance. I'll follow your advice and send the patch
accordingly. I'll also read the linked documentation and make sure the
patch is properly formatted.
Re: [Intel-wired-lan] [PATCH iwl-next v8 07/11] igc: add support for frame preemption verification
On Wed, Mar 05, 2025 at 08:00:22AM -0500, Faizal Rahim wrote:
> b) configure_pmac() -> not used
>    - this callback dynamically controls pmac_enabled at runtime. For
>      example, mmsv calls configure_pmac() and disables pmac_enabled when
>      the link partner goes down, even if the user previously enabled it.
>      The intention is to save power but it is not feasible in igc
>      because it causes an endless adapter reset loop:
>
>    1) Board A and Board B complete the verification handshake. Tx mode
>       register for both boards are in TSN mode.
>    2) Board B link goes down.
>
>    On Board A:
>    3) mmsv calls configure_pmac() with pmac_enabled = false.
>    4) configure_pmac() in igc updates a new field based on pmac_enabled.
>       Driver uses this field in igc_tsn_new_flags() to indicate that the
>       user enabled/disabled FPE.
>    5) configure_pmac() in igc calls igc_tsn_offload_apply() to check
>       whether an adapter reset is needed. Calls existing logic in
>       igc_tsn_will_tx_mode_change() and igc_tsn_new_flags().
>    6) Since pmac_enabled is now disabled and no other TSN feature is
>       active, igc_tsn_will_tx_mode_change() evaluates to true because Tx
>       mode will switch from TSN to Legacy.
>    7) Driver resets the adapter.
>    8) Registers are set, and Tx mode switches to Legacy.
>    9) When link partner is up, steps 3–8 repeat, but this time with
>       pmac_enabled = true, reactivating TSN.
>       igc_tsn_will_tx_mode_change() evaluates to true again, since Tx
>       mode will switch from Legacy to TSN.
>    10) Driver resets the adapter.
>    11) Rest adapter completes, registers are set, and Tx mode switches to

s/Rest adapter/Adapter reset/

>        TSN.
>
>    On Board B:
>    12) Adapter reset on Board A at step 10 causes it to detect its link
>        partner as down.
>    13) Repeats steps 3–8.
>    14) Once reset adapter on Board A is completed at step 11, it detects
>        its link partner as up.
>    15) Repeats steps 9–11.
>
>    - this cycle repeats indefinitely. To avoid this issue, igc only uses
>      mmsv.pmac_enabled to track whether FPE is enabled or disabled.
>
> Co-developed-by: Vinicius Costa Gomes
> Signed-off-by: Vinicius Costa Gomes
> Co-developed-by: Choong Yong Liang
> Signed-off-by: Choong Yong Liang
> Co-developed-by: Chwee-Lin Choong
> Signed-off-by: Chwee-Lin Choong
> Signed-off-by: Faizal Rahim
> ---
> +static inline bool igc_fpe_is_pmac_enabled(struct igc_adapter *adapter)
> +{
> +	return static_branch_unlikely(&igc_fpe_enabled) &&
> +	       adapter->fpe.mmsv.pmac_enabled;
> +}
> +
> +static inline bool igc_fpe_is_verify_or_response(union igc_adv_rx_desc *rx_desc,
> +						 unsigned int size, void *pktbuf)
> +{
> +	u32 status_error = le32_to_cpu(rx_desc->wb.upper.status_error);
> +	static const u8 zero_payload[SMD_FRAME_SIZE] = {0};
> +	int smd;
> +
> +	smd = FIELD_GET(IGC_RXDADV_STAT_SMD_TYPE_MASK, status_error);
> +
> +	return (smd == IGC_RXD_STAT_SMD_TYPE_V || smd == IGC_RXD_STAT_SMD_TYPE_R) &&
> +	       size == SMD_FRAME_SIZE &&
> +	       !memcmp(pktbuf, zero_payload, SMD_FRAME_SIZE); /* Buffer is all zeros */

Using this definition...

> +}
> +
> +static inline void igc_fpe_lp_event_status(union igc_adv_rx_desc *rx_desc,
> +					   struct ethtool_mmsv *mmsv)
> +{
> +	u32 status_error = le32_to_cpu(rx_desc->wb.upper.status_error);
> +	int smd;
> +
> +	smd = FIELD_GET(IGC_RXDADV_STAT_SMD_TYPE_MASK, status_error);
> +
> +	if (smd == IGC_RXD_STAT_SMD_TYPE_V)
> +		ethtool_mmsv_event_handle(mmsv, ETHTOOL_MMSV_LP_SENT_VERIFY_MPACKET);
> +	else if (smd == IGC_RXD_STAT_SMD_TYPE_R)
> +		ethtool_mmsv_event_handle(mmsv, ETHTOOL_MMSV_LP_SENT_RESPONSE_MPACKET);
> +}
> @@ -2617,6 +2617,15 @@ static int igc_clean_rx_irq(struct igc_q_vector *q_vector, const int budget)
>  			size -= IGC_TS_HDR_LEN;
>  		}
>
> +		if (igc_fpe_is_pmac_enabled(adapter) &&
> +		    igc_fpe_is_verify_or_response(rx_desc, size, pktbuf)) {

... invalid SMD-R and SMD-V frames will skip this code block altogether,
and will be passed up the network stack, and visible at least in tcpdump,
correct?
Essentially, if the link partner would craft an ICMP request packet with
an SMD-V or SMD-R, your station would respond to it, which is incorrect.
A bit strange, the behavior in this case seems a bit under-specified in
the standard, and I don't see any counter that should be incremented.

> +			igc_fpe_lp_event_status(rx_desc, &adapter->fpe.mmsv);
> +			/* Advance the ring next-to-clean */
> +			igc_is_non_eop(rx_ring, rx_desc);
> +			cleaned_count++;
> +			continue;
> +		}

To fix this, don't you want to merge the unnaturally split igc_f
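The check under discussion can be modeled as a pure predicate: a received
frame is consumed as a verify/response mPacket only when the descriptor's
SMD type matches AND the payload is an all-zero frame of exactly the
expected size. The encodings and the SMD_FRAME_SIZE value below are
placeholders (the excerpt does not show them), but the logic mirrors the
quoted igc_fpe_is_verify_or_response():

```python
# Placeholder encodings; the real values come from the
# IGC_RXD_STAT_SMD_TYPE_* definitions in igc_defines.h.
SMD_TYPE_V = 2
SMD_TYPE_R = 3

def is_verify_or_response(smd_type, frame, smd_frame_size=60):
    """Model of igc_fpe_is_verify_or_response(): SMD-V/SMD-R type,
    exact expected size, and an all-zero payload."""
    return (smd_type in (SMD_TYPE_V, SMD_TYPE_R)
            and len(frame) == smd_frame_size
            and frame == bytes(smd_frame_size))

# The reviewer's point: a crafted ICMP request tagged SMD-V fails the
# all-zero payload test, so it skips the mPacket branch and is passed
# up the stack, where the station would answer it.
```

This makes it easy to see which crafted frames slip past the branch: any
SMD-V/SMD-R descriptor whose payload is non-zero or of a different length.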
Re: [Intel-wired-lan] MMIO write access to an invalid page in i40e_clear_hw()
On 25. 3. 5. 21:11, Loktionov, Aleksandr wrote:
>
>> -----Original Message-----
>> From: Intel-wired-lan On Behalf Of Przemek Kitszel
>> Sent: Wednesday, March 5, 2025 11:27 AM
>> To: Kyungwook Boo
>> Cc: intel-wired-...@lists.osuosl.org; linux-ker...@vger.kernel.org;
>> Nguyen, Anthony L
>> Subject: Re: [Intel-wired-lan] MMIO write access to an invalid page in
>> i40e_clear_hw()
>>
>> On 3/3/25 11:19, Kyungwook Boo wrote:
>>> Hello,
>>>
>>> It seems that there are invalid page MMIO write accesses in
>>> i40e_clear_hw()
>>
>> Hi,
>>
>> is this something that actually occurred, or just a theoretical bug?
>> (depending on that we will apply it to a different tree)
>>
>> please send a proper patch anyway, as it looks legit to not go bananas
>> when HW gives you 0
>>
>> (and CC netdev instead of the generic kernel ML; perhaps that's the
>> reason this mail was tagged as spam for me)

Hi, thank you for your response.

If you're asking whether this happened on a physical device, it did not,
but it is something that can be triggered through a virtual device with
fuzzed input. Also, I'll follow your guidance when sending the proper
patch.

>>> diff --git a/drivers/net/ethernet/intel/i40e/i40e_common.c
>>> b/drivers/net/ethernet/intel/i40e/i40e_common.c
>>> index 370b4bddee44..97ef79be39b3 100644
>>> --- a/drivers/net/ethernet/intel/i40e/i40e_common.c
>>> +++ b/drivers/net/ethernet/intel/i40e/i40e_common.c
>>> @@ -848,19 +848,25 @@ void i40e_clear_hw(struct i40e_hw *hw)
>>>  	/* stop all the interrupts */
>>>  	wr32(hw, I40E_PFINT_ICR0_ENA, 0);
>>>  	val = 0x3 << I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT;
>>> -	for (i = 0; i < num_pf_int - 2; i++)
>>> -		wr32(hw, I40E_PFINT_DYN_CTLN(i), val);
>>> +	if (num_pf_int > 1) {
>>
>> instead of adding if conditions, I would simply change the type to be
>> signed

I'll incorporate the suggested approach when sending the patch.

Best,
Kyungwook Boo