On 06/12/2018 08:18 AM, Alexander Duyck wrote: > This patch is meant to provide the basic tools needed to allow us to create > subordinate device traffic classes. The general idea here is to allow > subdividing the queues of a device into queue groups accessible through an > upper device such as a macvlan. > > The idea here is to enforce the idea that an upper device has to be a > single queue device, ideally with IFF_NO_QUEUE set. With that being the > case we can pretty much guarantee that the tc_to_txq mappings and XPS maps > for the upper device are unused. As such we could reuse those in order to > support subdividing the lower device and distributing those queues between > the subordinate devices.
This is not necessarily a valid paradigm to work with. For instance, in DSA we have IFF_NO_QUEUE devices, but we still expose multiple egress queues because that is how an application can choose how it wants to get packets transmitted at the switch level. We have a 1:1 mapping between a queue at the net_device level and an egress queue at the switch level, so that things like buffer reservation etc. can be configured per queue. I think you should consider that an upper device might want to have a 1:1 mapping to the lower device's queues and make that permissible. Thoughts? > > In order to distinguish between a regular set of traffic classes and if a > device is carrying subordinate traffic classes I changed num_tc from a u8 > to a s16 value and use the negative values to represent the subordinate > pool values. So starting at -1 and running to -32768 we can encode those as > pool values, and the existing values of 0 to 15 can be maintained. > > Signed-off-by: Alexander Duyck <alexander.h.du...@intel.com> > --- > include/linux/netdevice.h | 16 ++++++++ > net/core/dev.c | 89 > +++++++++++++++++++++++++++++++++++++++++++++ > net/core/net-sysfs.c | 21 ++++++++++- > 3 files changed, 124 insertions(+), 2 deletions(-) > > diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h > index 3ec9850..41b4660 100644 > --- a/include/linux/netdevice.h > +++ b/include/linux/netdevice.h > @@ -569,6 +569,9 @@ struct netdev_queue { > * (/sys/class/net/DEV/Q/trans_timeout) > */ > unsigned long trans_timeout; > + > + /* Suboordinate device that the queue has been assigned to */ > + struct net_device *sb_dev; > /* > * write-mostly part > */ > @@ -1978,7 +1981,7 @@ struct net_device { > #ifdef CONFIG_DCB > const struct dcbnl_rtnl_ops *dcbnl_ops; > #endif > - u8 num_tc; > + s16 num_tc; > struct netdev_tc_txq tc_to_txq[TC_MAX_QUEUE]; > u8 prio_tc_map[TC_BITMASK + 1]; > > @@ -2032,6 +2035,17 @@ int netdev_get_num_tc(struct net_device *dev) > return dev->num_tc; > } > > +void 
netdev_unbind_sb_channel(struct net_device *dev, > + struct net_device *sb_dev); > +int netdev_bind_sb_channel_queue(struct net_device *dev, > + struct net_device *sb_dev, > + u8 tc, u16 count, u16 offset); > +int netdev_set_sb_channel(struct net_device *dev, u16 channel); > +static inline int netdev_get_sb_channel(struct net_device *dev) > +{ > + return max_t(int, -dev->num_tc, 0); > +} > + > static inline > struct netdev_queue *netdev_get_tx_queue(const struct net_device *dev, > unsigned int index) > diff --git a/net/core/dev.c b/net/core/dev.c > index 6e18242..27fe4f2 100644 > --- a/net/core/dev.c > +++ b/net/core/dev.c > @@ -2068,11 +2068,13 @@ int netdev_txq_to_tc(struct net_device *dev, unsigned > int txq) > struct netdev_tc_txq *tc = &dev->tc_to_txq[0]; > int i; > > + /* walk through the TCs and see if it falls into any of them */ > for (i = 0; i < TC_MAX_QUEUE; i++, tc++) { > if ((txq - tc->offset) < tc->count) > return i; > } > > + /* didn't find it, just return -1 to indicate no match */ > return -1; > } > > @@ -2215,7 +2217,14 @@ int netif_set_xps_queue(struct net_device *dev, const > struct cpumask *mask, > bool active = false; > > if (dev->num_tc) { > + /* Do not allow XPS on subordinate device directly */ > num_tc = dev->num_tc; > + if (num_tc < 0) > + return -EINVAL; > + > + /* If queue belongs to subordinate dev use its map */ > + dev = netdev_get_tx_queue(dev, index)->sb_dev ? 
: dev; > + > tc = netdev_txq_to_tc(dev, index); > if (tc < 0) > return -EINVAL; > @@ -2366,11 +2375,25 @@ int netif_set_xps_queue(struct net_device *dev, const > struct cpumask *mask, > EXPORT_SYMBOL(netif_set_xps_queue); > > #endif > +static void netdev_unbind_all_sb_channels(struct net_device *dev) > +{ > + struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues]; > + > + /* Unbind any subordinate channels */ > + while (txq-- != &dev->_tx[0]) { > + if (txq->sb_dev) > + netdev_unbind_sb_channel(dev, txq->sb_dev); > + } > +} > + > void netdev_reset_tc(struct net_device *dev) > { > #ifdef CONFIG_XPS > netif_reset_xps_queues_gt(dev, 0); > #endif > + netdev_unbind_all_sb_channels(dev); > + > + /* Reset TC configuration of device */ > dev->num_tc = 0; > memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq)); > memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map)); > @@ -2399,11 +2422,77 @@ int netdev_set_num_tc(struct net_device *dev, u8 > num_tc) > #ifdef CONFIG_XPS > netif_reset_xps_queues_gt(dev, 0); > #endif > + netdev_unbind_all_sb_channels(dev); > + > dev->num_tc = num_tc; > return 0; > } > EXPORT_SYMBOL(netdev_set_num_tc); > > +void netdev_unbind_sb_channel(struct net_device *dev, > + struct net_device *sb_dev) > +{ > + struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues]; > + > +#ifdef CONFIG_XPS > + netif_reset_xps_queues_gt(sb_dev, 0); > +#endif > + memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq)); > + memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map)); > + > + while (txq-- != &dev->_tx[0]) { > + if (txq->sb_dev == sb_dev) > + txq->sb_dev = NULL; > + } > +} > +EXPORT_SYMBOL(netdev_unbind_sb_channel); > + > +int netdev_bind_sb_channel_queue(struct net_device *dev, > + struct net_device *sb_dev, > + u8 tc, u16 count, u16 offset) > +{ > + /* Make certain the sb_dev and dev are already configured */ > + if (sb_dev->num_tc >= 0 || tc >= dev->num_tc) > + return -EINVAL; > + > + /* We cannot hand out queues we don't have */ > + if ((offset + count) > 
dev->real_num_tx_queues) > + return -EINVAL; > + > + /* Record the mapping */ > + sb_dev->tc_to_txq[tc].count = count; > + sb_dev->tc_to_txq[tc].offset = offset; > + > + /* Provide a way for Tx queue to find the tc_to_txq map or > + * XPS map for itself. > + */ > + while (count--) > + netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev; > + > + return 0; > +} > +EXPORT_SYMBOL(netdev_bind_sb_channel_queue); > + > +int netdev_set_sb_channel(struct net_device *dev, u16 channel) > +{ > + /* Do not use a multiqueue device to represent a subordinate channel */ > + if (netif_is_multiqueue(dev)) > + return -ENODEV; > + > + /* We allow channels 1 - 32767 to be used for subordinate channels. > + * Channel 0 is meant to be "native" mode and used only to represent > + * the main root device. We allow writing 0 to reset the device back > + * to normal mode after being used as a subordinate channel. > + */ > + if (channel > S16_MAX) > + return -EINVAL; > + > + dev->num_tc = -channel; > + > + return 0; > +} > +EXPORT_SYMBOL(netdev_set_sb_channel); > + > /* > * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues > * greater than real_num_tx_queues stale skbs on the qdisc must be flushed. > diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c > index 335c6a4..bd067b1 100644 > --- a/net/core/net-sysfs.c > +++ b/net/core/net-sysfs.c > @@ -1054,11 +1054,23 @@ static ssize_t traffic_class_show(struct netdev_queue > *queue, > return -ENOENT; > > index = get_netdev_queue_index(queue); > + > + /* If queue belongs to subordinate dev use its tc mapping */ > + dev = netdev_get_tx_queue(dev, index)->sb_dev ? 
: dev; > + > tc = netdev_txq_to_tc(dev, index); > if (tc < 0) > return -EINVAL; > > - return sprintf(buf, "%u\n", tc); > + /* We can report the traffic class one of two ways: > + * Subordinate device traffic classes are reported with the traffic > + * class first, and then the subordinate class so for example TC0 on > + * subordinate device 2 will be reported as "0-2". If the queue > + * belongs to the root device it will be reported with just the > + * traffic class, so just "0" for TC 0 for example. > + */ > + return dev->num_tc < 0 ? sprintf(buf, "%u%d\n", tc, dev->num_tc) : > + sprintf(buf, "%u\n", tc); > } > > #ifdef CONFIG_XPS > @@ -1225,7 +1237,14 @@ static ssize_t xps_cpus_show(struct netdev_queue > *queue, > index = get_netdev_queue_index(queue); > > if (dev->num_tc) { > + /* Do not allow XPS on subordinate device directly */ > num_tc = dev->num_tc; > + if (num_tc < 0) > + return -EINVAL; > + > + /* If queue belongs to subordinate dev use its map */ > + dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev; > + > tc = netdev_txq_to_tc(dev, index); > if (tc < 0) > return -EINVAL; > -- Florian