[PATCH net-next v10 0/4] ethtool: provide the dim profile fine-tuning channel
The NetDIM library provides excellent acceleration for many modern network cards. However, the default DIM profiles limit its maximum capabilities on different NICs, so a way for each NIC to be custom-configured is necessary. Currently, this is done through the commonly used "ethtool -C". Please review, thank you very much! Changelog = v9->v10: - Collect DIM-related flags/mode/work into one place. - Use rx_profile + tx_profile instead of four profiles. - Add several helpers. - Update commit logs. v8->v9: - Fix the compilation error caused by conflicting names of rx_profile in dim.h and the ice driver: in dim.h, rx_profile is replaced with dim_rx_profile. The same applies to tx_profile. v7->v8: - Use kmemdup() instead of kzalloc()/memcpy() in dev_dim_profile_init(). v6->v7: - A new wrapper struct pointer is used in struct net_device. - Add IS_ENABLED(CONFIG_DIMLIB) to avoid compiler warnings. - Profile fields changed from u16 to u32. v5->v6: - Place the profile in netdevice to bypass the driver. The ethtool <-> kernel interaction code has not changed at all; only the kernel <-> driver interaction has changed. v4->v5: - Update some snippets from Kuba, thanks. v3->v4: - Some tiny updates; patch 1 only adds a new comment. v2->v3: - Break up the attributes to avoid the use of raw C structs. - Use a per-device profile instead of a global profile in the driver. 
v1->v2: - Use ethtool tool instead of net-sysfs Heng Qi (4): linux/dim: move useful macros to .h file ethtool: provide customized dim profile management dim: add new interfaces for initialization and getting results virtio-net: support dim profile fine-tuning Documentation/netlink/specs/ethtool.yaml | 23 ++ Documentation/networking/ethtool-netlink.rst | 4 + drivers/net/virtio_net.c | 44 +++- include/linux/dim.h | 115 include/linux/ethtool.h | 7 +- include/linux/netdevice.h| 5 + include/uapi/linux/ethtool_netlink.h | 20 ++ lib/dim/net_dim.c| 139 ++ net/ethtool/coalesce.c | 264 ++- 9 files changed, 612 insertions(+), 9 deletions(-) -- 2.32.0.3.g01195cf9f
[PATCH net-next v10 3/4] dim: add new interfaces for initialization and getting results
DIM-related mode and work have been collected in one place, so add new helper interfaces for convenience. Signed-off-by: Heng Qi --- include/linux/dim.h | 48 + lib/dim/net_dim.c | 66 + 2 files changed, 114 insertions(+) diff --git a/include/linux/dim.h b/include/linux/dim.h index af01389fcf39..ea7551bbc599 100644 --- a/include/linux/dim.h +++ b/include/linux/dim.h @@ -258,6 +258,54 @@ int net_dim_init_irq_moder(struct net_device *dev, u8 profile_flags, */ void net_dim_free_irq_moder(struct net_device *dev); +/** + * net_dim_setting - initialize DIM's cq mode and schedule worker + * @dev: target network device + * @dim: DIM context + * @is_tx: true indicates the tx direction, false indicates the rx direction + */ +void net_dim_setting(struct net_device *dev, struct dim *dim, bool is_tx); + +/** + * net_dim_work_cancel - synchronously cancel dim's worker + * @dim: DIM context + */ +void net_dim_work_cancel(struct dim *dim); + +/** + * net_dim_get_rx_irq_moder - get DIM rx results based on profile_ix + * @dev: target network device + * @dim: DIM context + * + * Return: DIM irq moderation + */ +struct dim_cq_moder +net_dim_get_rx_irq_moder(struct net_device *dev, struct dim *dim); + +/** + * net_dim_get_tx_irq_moder - get DIM tx results based on profile_ix + * @dev: target network device + * @dim: DIM context + * + * Return: DIM irq moderation + */ +struct dim_cq_moder +net_dim_get_tx_irq_moder(struct net_device *dev, struct dim *dim); + +/** + * net_dim_set_rx_mode - set DIM rx cq mode + * @dev: target network device + * @rx_mode: target rx cq mode + */ +void net_dim_set_rx_mode(struct net_device *dev, u8 rx_mode); + +/** + * net_dim_set_tx_mode - set DIM tx cq mode + * @dev: target network device + * @tx_mode: target tx cq mode + */ +void net_dim_set_tx_mode(struct net_device *dev, u8 tx_mode); + /** * dim_on_top - check if current state is a good place to stop (top location) * @dim: DIM context diff --git a/lib/dim/net_dim.c b/lib/dim/net_dim.c index 
ec0dc21793c0..8c4543a628e1 100644 --- a/lib/dim/net_dim.c +++ b/lib/dim/net_dim.c @@ -174,6 +174,72 @@ void net_dim_free_irq_moder(struct net_device *dev) } EXPORT_SYMBOL(net_dim_free_irq_moder); +void net_dim_setting(struct net_device *dev, struct dim *dim, bool is_tx) +{ + struct dim_irq_moder *irq_moder = dev->irq_moder; + + if (!irq_moder) + return; + + if (is_tx) { + INIT_WORK(&dim->work, irq_moder->tx_dim_work); + dim->mode = irq_moder->dim_tx_mode; + return; + } + + INIT_WORK(&dim->work, irq_moder->rx_dim_work); + dim->mode = irq_moder->dim_rx_mode; +} +EXPORT_SYMBOL(net_dim_setting); + +void net_dim_work_cancel(struct dim *dim) +{ + cancel_work_sync(&dim->work); +} +EXPORT_SYMBOL(net_dim_work_cancel); + +struct dim_cq_moder net_dim_get_rx_irq_moder(struct net_device *dev, +struct dim *dim) +{ + struct dim_cq_moder res, *profile; + + rcu_read_lock(); + profile = rcu_dereference(dev->irq_moder->rx_profile); + res = profile[dim->profile_ix]; + rcu_read_unlock(); + + dim->mode = READ_ONCE(dev->irq_moder->dim_rx_mode); + + return res; +} +EXPORT_SYMBOL(net_dim_get_rx_irq_moder); + +struct dim_cq_moder net_dim_get_tx_irq_moder(struct net_device *dev, +struct dim *dim) +{ + struct dim_cq_moder res, *profile; + + rcu_read_lock(); + profile = rcu_dereference(dev->irq_moder->tx_profile); + res = profile[dim->profile_ix]; + rcu_read_unlock(); + + dim->mode = READ_ONCE(dev->irq_moder->dim_tx_mode); + + return res; +} +EXPORT_SYMBOL(net_dim_get_tx_irq_moder); + +void net_dim_set_rx_mode(struct net_device *dev, u8 rx_mode) +{ + WRITE_ONCE(dev->irq_moder->dim_rx_mode, rx_mode); +} + +void net_dim_set_tx_mode(struct net_device *dev, u8 tx_mode) +{ + WRITE_ONCE(dev->irq_moder->dim_tx_mode, tx_mode); +} + static int net_dim_step(struct dim *dim) { if (dim->tired == (NET_DIM_PARAMS_NUM_PROFILES * 2)) -- 2.32.0.3.g01195cf9f
[PATCH net-next v10 1/4] linux/dim: move useful macros to .h file
Move these useful macros to the .h file so that they can be used elsewhere. They will be utilized in subsequent patches. Signed-off-by: Heng Qi --- include/linux/dim.h | 7 +++ 1 file changed, 7 insertions(+) diff --git a/include/linux/dim.h b/include/linux/dim.h index f343bc9aa2ec..43398f5eade2 100644 --- a/include/linux/dim.h +++ b/include/linux/dim.h @@ -10,6 +10,13 @@ #include #include +/* Number of DIM profiles and period mode. */ +#define NET_DIM_PARAMS_NUM_PROFILES 5 +#define NET_DIM_DEFAULT_RX_CQ_PKTS_FROM_EQE 256 +#define NET_DIM_DEFAULT_TX_CQ_PKTS_FROM_EQE 128 +#define NET_DIM_DEF_PROFILE_CQE 1 +#define NET_DIM_DEF_PROFILE_EQE 1 + /* * Number of events between DIM iterations. * Causes a moderation of the algorithm run. -- 2.32.0.3.g01195cf9f
[PATCH net-next v10 2/4] ethtool: provide customized dim profile management
The NetDIM library, currently leveraged by an array of NICs, delivers excellent acceleration benefits. Nevertheless, NICs vary significantly in their dim profile list prerequisites. Specifically, virtio-net backends may present diverse sw or hw device implementations, making a one-size-fits-all parameter list impractical. On Alibaba Cloud, the virtio DPU's performance under the default DIM profile falls short of expectations, partly due to a mismatch in parameter configuration. I also noticed that ice/idpf/ena and other NICs have customized profile lists or placed some restrictions on dim capabilities. Motivated by this, I tried adding new params for "ethtool -C" that provide a per-device control to modify and access a device's interrupt parameters. Usage The target NIC is named ethx. Assume that ethx only declares support for rx profile setting (with DIM_PROFILE_RX flag set in profile_flags) and supports modification of usec and pkts fields. 1. Query the currently customized list of the device $ ethtool -c ethx ... rx-profile: {.usec = 1, .pkts = 256, .comps = n/a,}, {.usec = 8, .pkts = 256, .comps = n/a,}, {.usec = 64, .pkts = 256, .comps = n/a,}, {.usec = 128, .pkts = 256, .comps = n/a,}, {.usec = 256, .pkts = 256, .comps = n/a,} tx-profile: n/a 2. Tune $ ethtool -C ethx rx-profile 1,1,n_2,n,n_3,3,n_4,4,n_n,5,n "n" means do not modify this field. $ ethtool -c ethx ... rx-profile: {.usec = 1, .pkts = 1, .comps = n/a,}, {.usec = 2, .pkts = 256, .comps = n/a,}, {.usec = 3, .pkts = 3, .comps = n/a,}, {.usec = 4, .pkts = 4, .comps = n/a,}, {.usec = 256, .pkts = 5, .comps = n/a,} tx-profile: n/a 3. Hint If the device does not support some type of customized dim profiles, the corresponding "n/a" will be displayed. If an attempt is made to modify an "n/a" field, -EOPNOTSUPP will be reported. 
Signed-off-by: Heng Qi --- Documentation/netlink/specs/ethtool.yaml | 23 ++ Documentation/networking/ethtool-netlink.rst | 4 + include/linux/dim.h | 60 + include/linux/ethtool.h | 7 +- include/linux/netdevice.h| 5 + include/uapi/linux/ethtool_netlink.h | 20 ++ lib/dim/net_dim.c| 73 + net/ethtool/coalesce.c | 264 ++- 8 files changed, 454 insertions(+), 2 deletions(-) diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml index 87ae7b397984..3c51a1a0b5d9 100644 --- a/Documentation/netlink/specs/ethtool.yaml +++ b/Documentation/netlink/specs/ethtool.yaml @@ -413,6 +413,18 @@ attribute-sets: - name: combined-count type: u32 + - +name: irq-moderation +attributes: + - +name: usec +type: u32 + - +name: pkts +type: u32 + - +name: comps +type: u32 - name: coalesce @@ -502,6 +514,15 @@ attribute-sets: - name: tx-aggr-time-usecs type: u32 + - +name: rx-profile +type: nest +nested-attributes: irq-moderation + - +name: tx-profile +type: nest +nested-attributes: irq-moderation + - name: pause-stat attributes: @@ -1313,6 +1334,8 @@ operations: - tx-aggr-max-bytes - tx-aggr-max-frames - tx-aggr-time-usecs +- rx-profile +- tx-profile dump: *coalesce-get-op - name: coalesce-set diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index 4e63d3708ed9..78ee25081498 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -1040,6 +1040,8 @@ Kernel response contents: ``ETHTOOL_A_COALESCE_TX_AGGR_MAX_BYTES`` u32 max aggr size, Tx ``ETHTOOL_A_COALESCE_TX_AGGR_MAX_FRAMES``u32 max aggr packets, Tx ``ETHTOOL_A_COALESCE_TX_AGGR_TIME_USECS``u32 time (us), aggr, Tx + ``ETHTOOL_A_COALESCE_RX_PROFILE``nested profile of DIM, Rx + ``ETHTOOL_A_COALESCE_TX_PROFILE``nested profile of DIM, Tx === == === Attributes are only included in reply if their value is not zero or the @@ -1105,6 +1107,8 @@ Request contents: ``ETHTOOL_A_COALESCE_TX_AGGR_MAX_BYTES`` u32 
max aggr size, Tx ``ETHTOOL_A_COALESCE_TX_AGGR_MAX_FRAMES``u32 max aggr packets, Tx ``ETHTOOL_A_COALESCE_TX_AGGR_TIME_USECS``u32 time (us), aggr, Tx + ``ETHTOOL_A_COALESCE_RX_PROFILE``nested profile of DIM, Rx + ``ETHTOOL_A_COALESCE_TX_PROFILE``nested profile of DIM, Tx === == === Request is rejected if it attributes declared as unsupported by driver (i.e. diff --git a/include/linux/dim.h b/include/linux/dim.h index 43398f5e
[PATCH net-next v10 4/4] virtio-net: support dim profile fine-tuning
Virtio-net has different types of back-end device implementations. In order to effectively optimize the dim library's gains for different device implementations, let's use the new interface params to initialize and query dim results from a customized profile list. Signed-off-by: Heng Qi --- drivers/net/virtio_net.c | 44 +--- 1 file changed, 37 insertions(+), 7 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 115c3c5414f2..555e6c9761da 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -2300,7 +2300,7 @@ static int virtnet_open(struct net_device *dev) for (i--; i >= 0; i--) { virtnet_disable_queue_pair(vi, i); - cancel_work_sync(&vi->rq[i].dim.work); + net_dim_work_cancel(&vi->rq[i].dim); } return err; @@ -2466,7 +2466,7 @@ static int virtnet_rx_resize(struct virtnet_info *vi, if (running) { napi_disable(&rq->napi); - cancel_work_sync(&rq->dim.work); + net_dim_work_cancel(&rq->dim); } err = virtqueue_resize(rq->vq, ring_num, virtnet_rq_unmap_free_buf); @@ -2718,7 +2718,7 @@ static int virtnet_close(struct net_device *dev) for (i = 0; i < vi->max_queue_pairs; i++) { virtnet_disable_queue_pair(vi, i); - cancel_work_sync(&vi->rq[i].dim.work); + net_dim_work_cancel(&vi->rq[i].dim); } return 0; @@ -3580,7 +3580,7 @@ static void virtnet_rx_dim_work(struct work_struct *work) if (!rq->dim_enabled) continue; - update_moder = net_dim_get_rx_moderation(dim->mode, dim->profile_ix); + update_moder = net_dim_get_rx_irq_moder(dev, dim); if (update_moder.usec != rq->intr_coal.max_usecs || update_moder.pkts != rq->intr_coal.max_packets) { err = virtnet_send_rx_ctrl_coal_vq_cmd(vi, qnum, @@ -4182,6 +4182,33 @@ static void virtnet_tx_timeout(struct net_device *dev, unsigned int txqueue) jiffies_to_usecs(jiffies - READ_ONCE(txq->trans_start))); } +static int virtnet_init_irq_moder(struct virtnet_info *vi) +{ + u8 profile_flags = 0, coal_flags = 0; + struct net_device *dev = vi->dev; + int ret, i; + + profile_flags |= 
DIM_PROFILE_RX; + coal_flags |= DIM_COALESCE_USEC | DIM_COALESCE_PKTS; + ret = net_dim_init_irq_moder(dev, profile_flags, coal_flags, +DIM_CQ_PERIOD_MODE_START_FROM_EQE, +0, virtnet_rx_dim_work, NULL); + + if (ret) + return ret; + + for (i = 0; i < vi->max_queue_pairs; i++) + net_dim_setting(vi->dev, &vi->rq[i].dim, false); + + return 0; +} + +static void virtnet_free_irq_moder(struct virtnet_info *vi) +{ + if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_VQ_NOTF_COAL)) + net_dim_free_irq_moder(vi->dev); +} + static const struct net_device_ops virtnet_netdev = { .ndo_open= virtnet_open, .ndo_stop= virtnet_close, @@ -4461,9 +4488,6 @@ static int virtnet_alloc_queues(struct virtnet_info *vi) virtnet_poll_tx, napi_tx ? napi_weight : 0); - INIT_WORK(&vi->rq[i].dim.work, virtnet_rx_dim_work); - vi->rq[i].dim.mode = DIM_CQ_PERIOD_MODE_START_FROM_EQE; - sg_init_table(vi->rq[i].sg, ARRAY_SIZE(vi->rq[i].sg)); ewma_pkt_len_init(&vi->rq[i].mrg_avg_pkt_len); sg_init_table(vi->sq[i].sg, ARRAY_SIZE(vi->sq[i].sg)); @@ -4837,6 +4861,10 @@ static int virtnet_probe(struct virtio_device *vdev) for (i = 0; i < vi->max_queue_pairs; i++) if (vi->sq[i].napi.weight) vi->sq[i].intr_coal.max_packets = 1; + + err = virtnet_init_irq_moder(vi); + if (err) + goto free; } #ifdef CONFIG_SYSFS @@ -4961,6 +4989,8 @@ static void virtnet_remove(struct virtio_device *vdev) disable_rx_mode_work(vi); flush_work(&vi->rx_mode_work); + virtnet_free_irq_moder(vi); + unregister_netdev(vi->dev); net_failover_destroy(vi->failover); -- 2.32.0.3.g01195cf9f