Two race conditions can be triggered in xps, resulting in various
oopses and invalid memory accesses:

1. Calling netdev_set_num_tc while netif_set_xps_queue is running:

   - netdev_set_num_tc sets dev->num_tc.
   - netif_set_xps_queue uses dev->num_tc as one of the parameters to
     compute the size of new_dev_maps when allocating it. dev->num_tc
     is also used to access the map, and the compiler may generate
     code retrieving this field multiple times in the function.

   If new_dev_maps is allocated using one value of dev->num_tc and
   dev->num_tc is then set to a higher value through
   netdev_set_num_tc, later accesses to new_dev_maps in
   netif_set_xps_queue can end up outside of new_dev_maps, triggering
   an oops.

   One way of triggering this is to set an interface up (with a
   driver calling netdev_set_num_tc in its open path, such as bnx2x)
   while writing to xps_cpus or xps_rxqs from a concurrent thread;
   with the right timing an oops is triggered. (A sketch of such a
   reproducer is included below.)

2. Calling netif_set_xps_queue while netdev_set_num_tc is running:

   2.1. netdev_set_num_tc starts by resetting the xps queues;
        dev->num_tc isn't updated yet.
   2.2. netif_set_xps_queue is called, setting up the maps with the
        *old* dev->num_tc.
   2.3. dev->num_tc is updated.
   2.4. Later accesses to the maps lead to out-of-bound accesses and
        oopses.

   A similar issue can be found with netdev_reset_tc.

   The fix here can't be to only tie the size of the maps to the maps
   themselves, as an invalid configuration could still result: the
   reset-then-set logic in both netdev_set_num_tc and netdev_reset_tc
   must be protected by a lock.

Both issues have the same fix: netif_set_xps_queue, netdev_set_num_tc
and netdev_reset_tc must be mutually exclusive. This patch fixes
those races by:

- Reworking netif_set_xps_queue, moving the xps_map_mutex up so that
  dev->num_tc is accessed under the lock.

- Using xps_map_mutex in both netdev_set_num_tc and netdev_reset_tc
  around the reset-and-set logic:

  + As xps_map_mutex is now taken in the reset path,
    netif_reset_xps_queues had to be reworked to offer an unlocked
    version (and so did netdev_unbind_all_sb_channels, which calls
    it).

  + cpus_read_lock is taken in the reset path as well, and is always
    taken before xps_map_mutex; it too had to be moved out of the
    unlocked version.

  This is why the patch is a little longer than expected, and moves
  netdev_unbind_sb_channel up in the file. (A condensed sketch of the
  resulting locking pattern is shown after the reproducer below.)
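Not part of the patch: a minimal userspace sketch of a reproducer for
race 1, assuming an interface named "eth0" whose driver calls
netdev_set_num_tc in its open path (e.g. bnx2x). The file name,
interface name and loop structure are illustrative only; any driver
with that property should do. Build with "gcc -o xps-race xps-race.c
-lpthread" and run as root until the box oopses:

  /* xps-race.c: hypothetical reproducer sketch, not part of the patch. */
  #include <fcntl.h>
  #include <pthread.h>
  #include <stdlib.h>
  #include <string.h>
  #include <unistd.h>

  /* Thread 1: bounce the link so the driver re-runs netdev_set_num_tc(). */
  static void *flip_link(void *arg)
  {
  	for (;;) {
  		system("ip link set dev eth0 up");
  		system("ip link set dev eth0 down");
  	}
  	return NULL;
  }

  /* Thread 2: concurrently rewrite the map through netif_set_xps_queue(). */
  static void *write_xps(void *arg)
  {
  	const char *path = "/sys/class/net/eth0/queues/tx-0/xps_cpus";

  	for (;;) {
  		int fd = open(path, O_WRONLY);

  		if (fd < 0)
  			continue;
  		write(fd, "1", strlen("1"));
  		close(fd);
  	}
  	return NULL;
  }

  int main(void)
  {
  	pthread_t t1, t2;

  	pthread_create(&t1, NULL, flip_link, NULL);
  	pthread_create(&t2, NULL, write_xps, NULL);
  	pthread_join(t1, NULL);	/* runs until the race is hit */
  	return 0;
  }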
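For reviewers, the locking pattern introduced by the diff condenses
to the sketch below (function bodies elided; the authoritative code
is in the diff itself). The same cpus_read_lock -> xps_map_mutex
ordering already used around __netif_set_xps_queue is kept:

  /* Unlocked worker: callers must hold cpus_read_lock() and
   * xps_map_mutex, in that order.
   */
  static void __netif_reset_xps_queues(struct net_device *dev, u16 offset,
  				     u16 count)
  {
  	/* ... map teardown, unchanged, no locking ... */
  }

  /* Locked wrapper; netdev_set_num_tc and netdev_reset_tc follow the
   * same pattern inline around their reset-and-set logic.
   */
  static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
  				   u16 count)
  {
  	cpus_read_lock();
  	mutex_lock(&xps_map_mutex);

  	__netif_reset_xps_queues(dev, offset, count);

  	mutex_unlock(&xps_map_mutex);
  	cpus_read_unlock();
  }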
Fixes: 184c449f91fe ("net: Add support for XPS with QoS via traffic classes")
Signed-off-by: Antoine Tenart <aten...@kernel.org>
---
 net/core/dev.c | 122 ++++++++++++++++++++++++++++++++-----------------
 1 file changed, 81 insertions(+), 41 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 8fa739259041..effdb7fee9df 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2527,8 +2527,8 @@ static void clean_xps_maps(struct net_device *dev, const unsigned long *mask,
 	}
 }
 
-static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
-				   u16 count)
+static void __netif_reset_xps_queues(struct net_device *dev, u16 offset,
+				     u16 count)
 {
 	const unsigned long *possible_mask = NULL;
 	struct xps_dev_maps *dev_maps;
@@ -2537,9 +2537,6 @@ static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
 	if (!static_key_false(&xps_needed))
 		return;
 
-	cpus_read_lock();
-	mutex_lock(&xps_map_mutex);
-
 	if (static_key_false(&xps_rxqs_needed)) {
 		dev_maps = xmap_dereference(dev->xps_rxqs_map);
 		if (dev_maps) {
@@ -2551,15 +2548,23 @@ static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
 
 	dev_maps = xmap_dereference(dev->xps_cpus_map);
 	if (!dev_maps)
-		goto out_no_maps;
+		return;
 
 	if (num_possible_cpus() > 1)
 		possible_mask = cpumask_bits(cpu_possible_mask);
 	nr_ids = nr_cpu_ids;
 
 	clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, offset, count,
 		       false);
+}
+
+static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
+				   u16 count)
+{
+	cpus_read_lock();
+	mutex_lock(&xps_map_mutex);
+
+	__netif_reset_xps_queues(dev, offset, count);
 
-out_no_maps:
 	mutex_unlock(&xps_map_mutex);
 	cpus_read_unlock();
 }
@@ -2615,27 +2620,32 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
 {
 	const unsigned long *online_mask = NULL, *possible_mask = NULL;
 	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
-	int i, j, tci, numa_node_id = -2;
+	int i, j, tci, numa_node_id = -2, ret = 0;
 	int maps_sz, num_tc = 1, tc = 0;
 	struct xps_map *map, *new_map;
 	bool active = false;
 	unsigned int nr_ids;
 
+	mutex_lock(&xps_map_mutex);
+
 	if (dev->num_tc) {
 		/* Do not allow XPS on subordinate device directly */
 		num_tc = dev->num_tc;
-		if (num_tc < 0)
-			return -EINVAL;
+		if (num_tc < 0) {
+			ret = -EINVAL;
+			goto unlock;
+		}
 
 		/* If queue belongs to subordinate dev use its map */
 		dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
 
 		tc = netdev_txq_to_tc(dev, index);
-		if (tc < 0)
-			return -EINVAL;
+		if (tc < 0) {
+			ret = -EINVAL;
+			goto unlock;
+		}
 	}
 
-	mutex_lock(&xps_map_mutex);
 	if (is_rxqs_map) {
 		maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues);
 		dev_maps = xmap_dereference(dev->xps_rxqs_map);
@@ -2659,8 +2669,8 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
 		if (!new_dev_maps)
 			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
 		if (!new_dev_maps) {
-			mutex_unlock(&xps_map_mutex);
-			return -ENOMEM;
+			ret = -ENOMEM;
+			goto unlock;
 		}
 
 		tci = j * num_tc + tc;
@@ -2765,7 +2775,7 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
 	}
 
 	if (!dev_maps)
-		goto out_no_maps;
+		goto unlock;
 
 	/* removes tx-queue from unused CPUs/rx-queues */
 	for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
@@ -2783,10 +2793,10 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
 	if (!active)
 		reset_xps_maps(dev, dev_maps, is_rxqs_map);
 
-out_no_maps:
+unlock:
 	mutex_unlock(&xps_map_mutex);
 
-	return 0;
+	return ret;
 error:
 	/* remove any maps that we added */
 	for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
@@ -2822,28 +2832,68 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
 EXPORT_SYMBOL(netif_set_xps_queue);
 #endif
 
-static void netdev_unbind_all_sb_channels(struct net_device *dev)
+
+static void __netdev_unbind_sb_channel(struct net_device *dev,
+				       struct net_device *sb_dev)
+{
+	struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
+
+#ifdef CONFIG_XPS
+	__netif_reset_xps_queues(sb_dev, 0, dev->num_tx_queues);
+#endif
+
+	memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq));
+	memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map));
+
+	while (txq-- != &dev->_tx[0]) {
+		if (txq->sb_dev == sb_dev)
+			txq->sb_dev = NULL;
+	}
+}
+
+void netdev_unbind_sb_channel(struct net_device *dev,
+			      struct net_device *sb_dev)
+{
+	cpus_read_lock();
+	mutex_lock(&xps_map_mutex);
+
+	__netdev_unbind_sb_channel(dev, sb_dev);
+
+	mutex_unlock(&xps_map_mutex);
+	cpus_read_unlock();
+}
+EXPORT_SYMBOL(netdev_unbind_sb_channel);
+
+static void __netdev_unbind_all_sb_channels(struct net_device *dev)
 {
 	struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
 
 	/* Unbind any subordinate channels */
 	while (txq-- != &dev->_tx[0]) {
 		if (txq->sb_dev)
-			netdev_unbind_sb_channel(dev, txq->sb_dev);
+			__netdev_unbind_sb_channel(dev, txq->sb_dev);
 	}
 }
 
 void netdev_reset_tc(struct net_device *dev)
 {
 #ifdef CONFIG_XPS
-	netif_reset_xps_queues_gt(dev, 0);
+	cpus_read_lock();
+	mutex_lock(&xps_map_mutex);
+
+	__netif_reset_xps_queues(dev, 0, dev->num_tx_queues);
 #endif
-	netdev_unbind_all_sb_channels(dev);
+	__netdev_unbind_all_sb_channels(dev);
 
 	/* Reset TC configuration of device */
 	dev->num_tc = 0;
 	memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
 	memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
+
+#ifdef CONFIG_XPS
+	mutex_unlock(&xps_map_mutex);
+	cpus_read_unlock();
+#endif
 }
 EXPORT_SYMBOL(netdev_reset_tc);
 
@@ -2867,32 +2917,22 @@ int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
 		return -EINVAL;
 
 #ifdef CONFIG_XPS
-	netif_reset_xps_queues_gt(dev, 0);
+	cpus_read_lock();
+	mutex_lock(&xps_map_mutex);
+
+	__netif_reset_xps_queues(dev, 0, dev->num_tx_queues);
 #endif
-	netdev_unbind_all_sb_channels(dev);
+	__netdev_unbind_all_sb_channels(dev);
 
 	dev->num_tc = num_tc;
-	return 0;
-}
-EXPORT_SYMBOL(netdev_set_num_tc);
-
-void netdev_unbind_sb_channel(struct net_device *dev,
-			      struct net_device *sb_dev)
-{
-	struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
 #ifdef CONFIG_XPS
-	netif_reset_xps_queues_gt(sb_dev, 0);
+	mutex_unlock(&xps_map_mutex);
+	cpus_read_unlock();
 #endif
 
-	memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq));
-	memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map));
-
-	while (txq-- != &dev->_tx[0]) {
-		if (txq->sb_dev == sb_dev)
-			txq->sb_dev = NULL;
-	}
+	return 0;
 }
-EXPORT_SYMBOL(netdev_unbind_sb_channel);
+EXPORT_SYMBOL(netdev_set_num_tc);
 
 int netdev_bind_sb_channel_queue(struct net_device *dev,
 				 struct net_device *sb_dev,
-- 
2.29.2