A common task for most drivers is to remember the user-set CPU affinity of their IRQs. On each netdev reset, the driver should re-assign the user's settings to the IRQs.
Add CPU affinity mask to napi_config. To delegate the CPU affinity management to the core, drivers must: 1 - set the new netdev flag "irq_affinity_auto": netif_enable_irq_affinity(netdev) 2 - create the napi with persistent config: netif_napi_add_config() 3 - bind an IRQ to the napi instance: netif_napi_set_irq() The core will then make sure to re-assign the affinity to the napi's IRQ. The default IRQ mask is set to one cpu starting from the closest NUMA node. Signed-off-by: Ahmed Zaki <ahmed.z...@intel.com> --- include/linux/netdevice.h | 14 +++++++++++- net/core/dev.c | 46 +++++++++++++++++++++++++++++++++------ 2 files changed, 52 insertions(+), 8 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 6f8b416aa32b..8b31fff8affa 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -351,6 +351,7 @@ struct napi_config { u64 gro_flush_timeout; u64 irq_suspend_timeout; u32 defer_hard_irqs; + cpumask_t affinity_mask; unsigned int napi_id; }; @@ -392,8 +393,8 @@ struct napi_struct { struct list_head dev_list; struct hlist_node napi_hash_node; int irq; -#ifdef CONFIG_RFS_ACCEL struct irq_affinity_notify notify; +#ifdef CONFIG_RFS_ACCEL int napi_rmap_idx; #endif int index; @@ -1995,6 +1996,11 @@ enum netdev_reg_state { * * @threaded: napi threaded mode is enabled * + * @irq_affinity_auto: driver wants the core to manage the IRQ affinity. + * Set by netif_enable_irq_affinity(), then driver must + * create persistent napi by netif_napi_add_config() + * and finally bind napi to IRQ (netif_napi_set_irq). + * * @rx_cpu_rmap_auto: driver wants the core to manage the ARFS rmap. * Set by calling netif_enable_cpu_rmap(). 
* @@ -2405,6 +2411,7 @@ struct net_device { struct lock_class_key *qdisc_tx_busylock; bool proto_down; bool threaded; + bool irq_affinity_auto; #ifdef CONFIG_RFS_ACCEL bool rx_cpu_rmap_auto; #endif @@ -2640,6 +2647,11 @@ static inline void netdev_set_ml_priv(struct net_device *dev, dev->ml_priv_type = type; } +static inline void netif_enable_irq_affinity(struct net_device *dev) +{ + dev->irq_affinity_auto = true; +} + /* * Net namespace inlines */ diff --git a/net/core/dev.c b/net/core/dev.c index c965d947b33d..1fb850322868 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6776,27 +6776,36 @@ int netif_enable_cpu_rmap(struct net_device *dev, unsigned int num_irqs) return 0; } EXPORT_SYMBOL(netif_enable_cpu_rmap); +#endif static void -netif_irq_cpu_rmap_notify(struct irq_affinity_notify *notify, - const cpumask_t *mask) +netif_napi_irq_notify(struct irq_affinity_notify *notify, + const cpumask_t *mask) { struct napi_struct *napi = container_of(notify, struct napi_struct, notify); +#ifdef CONFIG_RFS_ACCEL struct cpu_rmap *rmap = napi->dev->rx_cpu_rmap; int err; +#endif + if (napi->config && napi->dev->irq_affinity_auto) + cpumask_copy(&napi->config->affinity_mask, mask); + +#ifdef CONFIG_RFS_ACCEL if (rmap && napi->dev->rx_cpu_rmap_auto) { err = cpu_rmap_update(rmap, napi->napi_rmap_idx, mask); if (err) pr_warn("%s: RMAP update failed (%d)\n", __func__, err); } +#endif } static void netif_napi_affinity_release(struct kref *ref) { +#ifdef CONFIG_RFS_ACCEL struct napi_struct *napi = container_of(ref, struct napi_struct, notify.kref); struct cpu_rmap *rmap = napi->dev->rx_cpu_rmap; @@ -6806,8 +6815,10 @@ netif_napi_affinity_release(struct kref *ref) rmap->obj[napi->napi_rmap_idx] = NULL; cpu_rmap_put(rmap); +#endif } +#ifdef CONFIG_RFS_ACCEL static int napi_irq_cpu_rmap_add(struct napi_struct *napi, int irq) { struct cpu_rmap *rmap = napi->dev->rx_cpu_rmap; @@ -6816,7 +6827,7 @@ static int napi_irq_cpu_rmap_add(struct napi_struct *napi, int irq) if (!rmap) return 
-EINVAL; - napi->notify.notify = netif_irq_cpu_rmap_notify; + napi->notify.notify = netif_napi_irq_notify; napi->notify.release = netif_napi_affinity_release; cpu_rmap_get(rmap); rc = cpu_rmap_add(rmap, napi); @@ -6840,9 +6851,8 @@ static int napi_irq_cpu_rmap_add(struct napi_struct *napi, int irq) void netif_napi_set_irq(struct napi_struct *napi, int irq) { -#ifdef CONFIG_RFS_ACCEL int rc; -#endif + napi->irq = irq; #ifdef CONFIG_RFS_ACCEL @@ -6853,8 +6863,18 @@ void netif_napi_set_irq(struct napi_struct *napi, int irq) rc); netif_disable_cpu_rmap(napi->dev); } - } + } else if (irq > 0 && napi->config && napi->dev->irq_affinity_auto) { +#else + if (irq > 0 && napi->config && napi->dev->irq_affinity_auto) { #endif + napi->notify.notify = netif_napi_irq_notify; + napi->notify.release = netif_napi_affinity_release; + + rc = irq_set_affinity_notifier(irq, &napi->notify); + if (rc) + netdev_warn(napi->dev, "Unable to set IRQ notifier (%d)\n", + rc); + } } EXPORT_SYMBOL(netif_napi_set_irq); @@ -6863,6 +6883,10 @@ static void napi_restore_config(struct napi_struct *n) n->defer_hard_irqs = n->config->defer_hard_irqs; n->gro_flush_timeout = n->config->gro_flush_timeout; n->irq_suspend_timeout = n->config->irq_suspend_timeout; + + if (n->irq > 0 && n->dev->irq_affinity_auto) + irq_set_affinity(n->irq, &n->config->affinity_mask); + /* a NAPI ID might be stored in the config, if so use it. if not, use * napi_hash_add to generate one for us. 
*/ @@ -6879,6 +6903,10 @@ static void napi_save_config(struct napi_struct *n) n->config->defer_hard_irqs = n->defer_hard_irqs; n->config->gro_flush_timeout = n->gro_flush_timeout; n->config->irq_suspend_timeout = n->irq_suspend_timeout; + + if (n->irq > 0 && n->dev->irq_affinity_auto) + irq_set_affinity_notifier(n->irq, NULL); + napi_hash_del(n); } @@ -11377,7 +11405,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, { struct net_device *dev; size_t napi_config_sz; - unsigned int maxqs; + unsigned int maxqs, i, numa; BUG_ON(strlen(name) >= sizeof(dev->name)); @@ -11473,6 +11501,10 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev->napi_config = kvzalloc(napi_config_sz, GFP_KERNEL_ACCOUNT); if (!dev->napi_config) goto free_all; + numa = dev_to_node(&dev->dev); + for (i = 0; i < maxqs; i++) + cpumask_set_cpu(cpumask_local_spread(i, numa), + &dev->napi_config[i].affinity_mask); strscpy(dev->name, name); dev->name_assign_type = name_assign_type; -- 2.43.0