On Thu, Jun 04, 2026 at 11:40:30AM -0700, Jacob Keller wrote:
> On 6/2/2026 1:24 PM, Dipayaan Roy wrote:
> > On some ARM64 platforms with 4K PAGE_SIZE, page_pool fragment
> > allocation in the RX refill path can cause 15-20% throughput
> > regression under high connection counts (>16 TCP streams).
> > 
> > Add an ethtool private flag "full-page-rx" that allows the user to
> > force one RX buffer per page, bypassing the page_pool fragment path.
> > This restores line-rate (180+ Gbps) performance on affected platforms.
> > 
> > Usage:
> >   ethtool --set-priv-flags eth0 full-page-rx on
> > 
> > There is no behavioral change by default. The flag must be explicitly
> > enabled by the user or udev rule.
> > 
> > The existing single-buffer-per-page logic for XDP and jumbo frames is
> > consolidated into a new helper mana_use_single_rxbuf_per_page() which
> > is now the single decision point for both the automatic and
> > user-controlled paths.
> > 
> > Signed-off-by: Dipayaan Roy <[email protected]>
> > ---
> 
> I had one or two minor nits, but nothing that I think really deserves a
> v11. The only real comment is a future "gotcha" that could happen if you
> ever added a second private flag, which seems unlikely and maybe not
> worth dealing with until it matters.
> 
> Reviewed-by: Jacob Keller <[email protected]>
>

Hi Jacob,

Thank you for the review.
I will keep this patch as is, since no plans for any new private flags.

Regards
Dipayaan Roy

> >  drivers/net/ethernet/microsoft/mana/mana_en.c |  22 +++-
> >  .../ethernet/microsoft/mana/mana_ethtool.c    | 103 ++++++++++++++++++
> >  include/net/mana/mana.h                       |   8 ++
> >  3 files changed, 131 insertions(+), 2 deletions(-)
> > 
> > diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c 
> > b/drivers/net/ethernet/microsoft/mana/mana_en.c
> > index db14357d3732..447cecfd3f67 100644
> > --- a/drivers/net/ethernet/microsoft/mana/mana_en.c
> > +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
> > @@ -744,6 +744,25 @@ static void *mana_get_rxbuf_pre(struct mana_rxq *rxq, 
> > dma_addr_t *da)
> >     return va;
> >  }
> >  
> > +static bool
> > +mana_use_single_rxbuf_per_page(struct mana_port_context *apc, u32 mtu)
> > +{
> > +   /* On some platforms with 4K PAGE_SIZE, page_pool fragment allocation
> > +    * in the RX refill path (~2kB buffer) can cause significant throughput
> > +    * regression under high connection counts. Allow user to force one RX
> > +    * buffer per page via ethtool private flag to bypass the fragment
> > +    * path.
> > +    */
> > +   if (apc->priv_flags & BIT(MANA_PRIV_FLAG_USE_FULL_PAGE_RXBUF))
> > +           return true;
> > +
> > +   /* For xdp and jumbo frames make sure only one packet fits per page. */
> > +   if (mtu + MANA_RXBUF_PAD > PAGE_SIZE / 2 || mana_xdp_get(apc))
> > +           return true;
> 
> Technically you could combine all three into one if, but I agree that
> clarity and space for the comment about why the private flag exists
> makes sense.
> 
> > +
> > +   return false;
> > +}
> > +
> >  /* Get RX buffer's data size, alloc size, XDP headroom based on MTU */
> >  static void mana_get_rxbuf_cfg(struct mana_port_context *apc,
> >                            int mtu, u32 *datasize, u32 *alloc_size,
> > @@ -754,8 +773,7 @@ static void mana_get_rxbuf_cfg(struct mana_port_context 
> > *apc,
> >     /* Calculate datasize first (consistent across all cases) */
> >     *datasize = mtu + ETH_HLEN;
> >  
> > -   /* For xdp and jumbo frames make sure only one packet fits per page */
> > -   if (mtu + MANA_RXBUF_PAD > PAGE_SIZE / 2 || mana_xdp_get(apc)) {
> > +   if (mana_use_single_rxbuf_per_page(apc, mtu)) {
> >             if (mana_xdp_get(apc)) {
> >                     *headroom = XDP_PACKET_HEADROOM;
> >                     *alloc_size = PAGE_SIZE;
> > diff --git a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c 
> > b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
> > index 7e79681634db..f22bbb325948 100644
> > --- a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
> > +++ b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
> > @@ -133,6 +133,10 @@ static const struct mana_stats_desc mana_phy_stats[] = 
> > {
> >     { "hc_tc7_tx_pause_phy", offsetof(struct mana_ethtool_phy_stats, 
> > tx_pause_tc7_phy) },
> >  };
> >  
> > +static const char mana_priv_flags[MANA_PRIV_FLAG_MAX][ETH_GSTRING_LEN] = {
> > +   [MANA_PRIV_FLAG_USE_FULL_PAGE_RXBUF] = "full-page-rx"
> > +};
> > +
> >  static int mana_get_sset_count(struct net_device *ndev, int stringset)
> >  {
> >     struct mana_port_context *apc = netdev_priv(ndev);
> > @@ -144,6 +148,10 @@ static int mana_get_sset_count(struct net_device 
> > *ndev, int stringset)
> >                    ARRAY_SIZE(mana_phy_stats) +
> >                    ARRAY_SIZE(mana_hc_stats)  +
> >                    num_queues * (MANA_STATS_RX_COUNT + MANA_STATS_TX_COUNT);
> > +
> > +   case ETH_SS_PRIV_FLAGS:
> > +           return MANA_PRIV_FLAG_MAX;
> > +
> >     default:
> >             return -EINVAL;
> >     }
> > @@ -192,6 +200,14 @@ static void mana_get_strings_stats(struct 
> > mana_port_context *apc, u8 **data)
> >     }
> >  }
> >  
> > +static void mana_get_strings_priv_flags(u8 **data)
> > +{
> > +   int i;
> > +
> > +   for (i = 0; i < MANA_PRIV_FLAG_MAX; i++)
> > +           ethtool_puts(data, mana_priv_flags[i]);
> > +}
> > +
> >  static void mana_get_strings(struct net_device *ndev, u32 stringset, u8 
> > *data)
> >  {
> >     struct mana_port_context *apc = netdev_priv(ndev);
> > @@ -200,6 +216,9 @@ static void mana_get_strings(struct net_device *ndev, 
> > u32 stringset, u8 *data)
> >     case ETH_SS_STATS:
> >             mana_get_strings_stats(apc, &data);
> >             break;
> > +   case ETH_SS_PRIV_FLAGS:
> > +           mana_get_strings_priv_flags(&data);
> > +           break;
> >     default:
> >             break;
> >     }
> > @@ -590,6 +609,88 @@ static int mana_get_link_ksettings(struct net_device 
> > *ndev,
> >     return 0;
> >  }
> >  
> > +static u32 mana_get_priv_flags(struct net_device *ndev)
> > +{
> > +   struct mana_port_context *apc = netdev_priv(ndev);
> > +
> > +   return apc->priv_flags;
> > +}
> > +
> > +static int mana_set_priv_flags(struct net_device *ndev, u32 priv_flags)
> > +{
> > +   struct mana_port_context *apc = netdev_priv(ndev);
> > +   u32 changed = apc->priv_flags ^ priv_flags;
> > +   u32 old_priv_flags = apc->priv_flags;
> > +   bool schedule_port_reset = false;
> > +   int err = 0;
> > +
> > +   if (!changed)
> > +           return 0;
> > +
> > +   /* Reject unknown bits */
> > +   if (priv_flags & ~GENMASK(MANA_PRIV_FLAG_MAX - 1, 0))
> > +           return -EINVAL;
> 
> Good. Explicit rejection ensures that there's no risk of bad value. I
> think this is only required for the legacy ioctl interface, and won't be
> able to have a bit set that isn't in your accepted list. However the
> legacy ioctl interface looks like it doesn't do that double checking, so
> its good to have this.
> 
> > +
> > +   if (changed & BIT(MANA_PRIV_FLAG_USE_FULL_PAGE_RXBUF)) {
> > +           apc->priv_flags = priv_flags;
> > +
> 
> In the (unlikely) event that you need another private flag in the
> future, this bit seems like it shouldn't be inside the if block here. It
> seems like you'd want to either do this at the end or up front. Of
> course it doesn't matter as long as this is the only private flag you have.
> 
> > +           if (!apc->port_is_up) {
> > +                   /* Port is down, flag updated to apply on next up
> > +                    * so just return.
> > +                    */
> > +                   return 0;
> > +           }
> > +
> > +           /* Pre-allocate buffers to prevent failure in mana_attach
> > +            * later
> > +            */
> > +           err = mana_pre_alloc_rxbufs(apc, ndev->mtu, apc->num_queues);
> > +           if (err) {
> > +                   netdev_err(ndev,
> > +                              "Insufficient memory for new allocations\n");
> > +                   apc->priv_flags = old_priv_flags;
> > +                   return err;
> > +           }
> > +
> > +           err = mana_detach(ndev, false);
> > +           if (err) {
> > +                   netdev_err(ndev, "mana_detach failed: %d\n", err);
> > +                   apc->priv_flags = old_priv_flags;
> > +
> > +                   /* Port is in an inconsistent state. Restore
> > +                    * 'port_is_up' so that queue reset work handler
> > +                    * can properly detach and re-attach.
> > +                    */
> > +                   apc->port_is_up = true;
> > +                   schedule_port_reset = true;
> > +                   goto out;
> > +           }
> > +
> > +           err = mana_attach(ndev);
> > +           if (err) {
> > +                   netdev_err(ndev, "mana_attach failed: %d\n", err);
> > +                   apc->priv_flags = old_priv_flags;
> > +
> > +                   /* Restore 'port_is_up' so the reset work handler
> > +                    * can properly detach/attach. Without this,
> > +                    * the handler sees port_is_up=false and skips
> > +                    * queue allocation, leaving the port dead.
> > +                    */
> > +                   apc->port_is_up = true;
> > +                   schedule_port_reset = true;
> > +           }
> 
> I might have made this bit a separate function, but that comes from
> history of working with older drivers which accumulated a larger number
> of private flags. Given that we frown on adding new ones except in more
> rare cases these days, this is probably fine.
> 
> > +   }
> > +
> > +out:
> > +   mana_pre_dealloc_rxbufs(apc);
> > +
> > +   if (schedule_port_reset)
> > +           queue_work(apc->ac->per_port_queue_reset_wq,
> > +                      &apc->queue_reset_work);
> > +
> > +   return err;
> > +}
> > +
> >  const struct ethtool_ops mana_ethtool_ops = {
> >     .supported_coalesce_params = ETHTOOL_COALESCE_RX_CQE_FRAMES,
> >     .get_ethtool_stats      = mana_get_ethtool_stats,
> > @@ -608,4 +709,6 @@ const struct ethtool_ops mana_ethtool_ops = {
> >     .set_ringparam          = mana_set_ringparam,
> >     .get_link_ksettings     = mana_get_link_ksettings,
> >     .get_link               = ethtool_op_get_link,
> > +   .get_priv_flags         = mana_get_priv_flags,
> > +   .set_priv_flags         = mana_set_priv_flags,
> >  };
> > diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h
> > index d9c27310fd04..26fd5e041a47 100644
> > --- a/include/net/mana/mana.h
> > +++ b/include/net/mana/mana.h
> > @@ -30,6 +30,12 @@ enum TRI_STATE {
> >     TRI_STATE_TRUE = 1
> >  };
> >  
> > +/* MANA ethtool private flag bit positions */
> > +enum mana_priv_flag_bits {
> > +   MANA_PRIV_FLAG_USE_FULL_PAGE_RXBUF = 0,
> > +   MANA_PRIV_FLAG_MAX,
> 
> For cases like this, I find it helpful to add a comment indicating this
> must be the last entry. (and in that case, drop the trailing comma).
> 
> > +};
> > +
> >  /* Number of entries for hardware indirection table must be in power of 2 
> > */
> >  #define MANA_INDIRECT_TABLE_MAX_SIZE 512
> >  #define MANA_INDIRECT_TABLE_DEF_SIZE 64
> > @@ -531,6 +537,8 @@ struct mana_port_context {
> >     u32 rxbpre_headroom;
> >     u32 rxbpre_frag_count;
> >  
> > +   u32 priv_flags;
> > +
> >     struct bpf_prog *bpf_prog;
> >  
> >     /* Create num_queues EQs, SQs, SQ-CQs, RQs and RQ-CQs, respectively. */
> 

Reply via email to