[PATCH net-next v10 0/4] ethtool: provide the dim profile fine-tuning channel

2024-04-25 Thread Heng Qi
The NetDIM library provides excellent acceleration for many modern
network cards. However, the default DIM profiles limit its maximum
capabilities for different NICs, so it is necessary to provide a way
for the NIC to be custom configured.

Currently, this is done through the commonly used "ethtool -C".

Please review, thank you very much!

Changelog
=========
v9->v10:
  - Collect dim related flags/mode/work into one place.
  - Use rx_profile + tx_profile instead of four profiles.
  - Add several helpers.
  - Update commit logs.

v8->v9:
  - Fix the compilation error caused by the name rx_profile conflicting
    between dim.h and the ice driver: in dim.h, rx_profile is renamed to
    dim_rx_profile. The same applies to tx_profile.

v7->v8:
  - Use kmemdup() instead of kzalloc()/memcpy() in dev_dim_profile_init().

v6->v7:
  - A new wrapper struct pointer is used in struct net_device.
  - Add IS_ENABLED(CONFIG_DIMLIB) to avoid compiler warnings.
  - Profile fields changed from u16 to u32.

v5->v6:
  - Place the profile in netdevice to bypass the driver.
The interaction code of ethtool <-> kernel has not changed at all,
only the interaction part of kernel <-> driver has changed.

v4->v5:
  - Update some snippets from Kuba, Thanks.

v3->v4:
  - Some tiny updates; patch 1 only adds a new comment.

v2->v3:
  - Break up the attributes to avoid the use of raw c structs.
  - Use per-device profile instead of global profile in the driver.

v1->v2:
  - Use ethtool tool instead of net-sysfs

Heng Qi (4):
  linux/dim: move useful macros to .h file
  ethtool: provide customized dim profile management
  dim: add new interfaces for initialization and getting results
  virtio-net: support dim profile fine-tuning

 Documentation/netlink/specs/ethtool.yaml |  23 ++
 Documentation/networking/ethtool-netlink.rst |   4 +
 drivers/net/virtio_net.c |  44 +++-
 include/linux/dim.h  | 115 
 include/linux/ethtool.h  |   7 +-
 include/linux/netdevice.h|   5 +
 include/uapi/linux/ethtool_netlink.h |  20 ++
 lib/dim/net_dim.c| 139 ++
 net/ethtool/coalesce.c   | 264 ++-
 9 files changed, 612 insertions(+), 9 deletions(-)

-- 
2.32.0.3.g01195cf9f




[PATCH net-next v10 3/4] dim: add new interfaces for initialization and getting results

2024-04-25 Thread Heng Qi
The DIM-related mode and work have been collected in one place,
so new interfaces are added for convenience.

Signed-off-by: Heng Qi 
---
 include/linux/dim.h | 48 +
 lib/dim/net_dim.c   | 66 +
 2 files changed, 114 insertions(+)

diff --git a/include/linux/dim.h b/include/linux/dim.h
index af01389fcf39..ea7551bbc599 100644
--- a/include/linux/dim.h
+++ b/include/linux/dim.h
@@ -258,6 +258,54 @@ int net_dim_init_irq_moder(struct net_device *dev, u8 
profile_flags,
  */
 void net_dim_free_irq_moder(struct net_device *dev);
 
+/**
+ * net_dim_setting - initialize DIM's cq mode and schedule worker
+ * @dev: target network device
+ * @dim: DIM context
+ * @is_tx: true indicates the tx direction, false indicates the rx direction
+ */
+void net_dim_setting(struct net_device *dev, struct dim *dim, bool is_tx);
+
+/**
+ * net_dim_work_cancel - synchronously cancel dim's worker
+ * @dim: DIM context
+ */
+void net_dim_work_cancel(struct dim *dim);
+
+/**
+ * net_dim_get_rx_irq_moder - get DIM rx results based on profile_ix
+ * @dev: target network device
+ * @dim: DIM context
+ *
+ * Return: DIM irq moderation
+ */
+struct dim_cq_moder
+net_dim_get_rx_irq_moder(struct net_device *dev, struct dim *dim);
+
+/**
+ * net_dim_get_tx_irq_moder - get DIM tx results based on profile_ix
+ * @dev: target network device
+ * @dim: DIM context
+ *
+ * Return: DIM irq moderation
+ */
+struct dim_cq_moder
+net_dim_get_tx_irq_moder(struct net_device *dev, struct dim *dim);
+
+/**
+ * net_dim_set_rx_mode - set DIM rx cq mode
+ * @dev: target network device
+ * @rx_mode: target rx cq mode
+ */
+void net_dim_set_rx_mode(struct net_device *dev, u8 rx_mode);
+
+/**
+ * net_dim_set_tx_mode - set DIM tx cq mode
+ * @dev: target network device
+ * @tx_mode: target tx cq mode
+ */
+void net_dim_set_tx_mode(struct net_device *dev, u8 tx_mode);
+
 /**
  * dim_on_top - check if current state is a good place to stop (top 
location)
  * @dim: DIM context
diff --git a/lib/dim/net_dim.c b/lib/dim/net_dim.c
index ec0dc21793c0..8c4543a628e1 100644
--- a/lib/dim/net_dim.c
+++ b/lib/dim/net_dim.c
@@ -174,6 +174,72 @@ void net_dim_free_irq_moder(struct net_device *dev)
 }
 EXPORT_SYMBOL(net_dim_free_irq_moder);
 
+void net_dim_setting(struct net_device *dev, struct dim *dim, bool is_tx)
+{
+   struct dim_irq_moder *irq_moder = dev->irq_moder;
+
+   if (!irq_moder)
+   return;
+
+   if (is_tx) {
+   INIT_WORK(&dim->work, irq_moder->tx_dim_work);
+   dim->mode = irq_moder->dim_tx_mode;
+   return;
+   }
+
+   INIT_WORK(&dim->work, irq_moder->rx_dim_work);
+   dim->mode = irq_moder->dim_rx_mode;
+}
+EXPORT_SYMBOL(net_dim_setting);
+
+void net_dim_work_cancel(struct dim *dim)
+{
+   cancel_work_sync(&dim->work);
+}
+EXPORT_SYMBOL(net_dim_work_cancel);
+
+struct dim_cq_moder net_dim_get_rx_irq_moder(struct net_device *dev,
+struct dim *dim)
+{
+   struct dim_cq_moder res, *profile;
+
+   rcu_read_lock();
+   profile = rcu_dereference(dev->irq_moder->rx_profile);
+   res = profile[dim->profile_ix];
+   rcu_read_unlock();
+
+   dim->mode = READ_ONCE(dev->irq_moder->dim_rx_mode);
+
+   return res;
+}
+EXPORT_SYMBOL(net_dim_get_rx_irq_moder);
+
+struct dim_cq_moder net_dim_get_tx_irq_moder(struct net_device *dev,
+struct dim *dim)
+{
+   struct dim_cq_moder res, *profile;
+
+   rcu_read_lock();
+   profile = rcu_dereference(dev->irq_moder->tx_profile);
+   res = profile[dim->profile_ix];
+   rcu_read_unlock();
+
+   dim->mode = READ_ONCE(dev->irq_moder->dim_tx_mode);
+
+   return res;
+}
+EXPORT_SYMBOL(net_dim_get_tx_irq_moder);
+
+void net_dim_set_rx_mode(struct net_device *dev, u8 rx_mode)
+{
+   WRITE_ONCE(dev->irq_moder->dim_rx_mode, rx_mode);
+}
+
+void net_dim_set_tx_mode(struct net_device *dev, u8 tx_mode)
+{
+   WRITE_ONCE(dev->irq_moder->dim_tx_mode, tx_mode);
+}
+
 static int net_dim_step(struct dim *dim)
 {
if (dim->tired == (NET_DIM_PARAMS_NUM_PROFILES * 2))
-- 
2.32.0.3.g01195cf9f




[PATCH net-next v10 1/4] linux/dim: move useful macros to .h file

2024-04-25 Thread Heng Qi
Move useful macros to the header so that they can be used elsewhere.
They will be utilized in subsequent patches.

Signed-off-by: Heng Qi 
---
 include/linux/dim.h | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/include/linux/dim.h b/include/linux/dim.h
index f343bc9aa2ec..43398f5eade2 100644
--- a/include/linux/dim.h
+++ b/include/linux/dim.h
@@ -10,6 +10,13 @@
 #include 
 #include 
 
+/* Number of DIM profiles and period mode. */
+#define NET_DIM_PARAMS_NUM_PROFILES 5
+#define NET_DIM_DEFAULT_RX_CQ_PKTS_FROM_EQE 256
+#define NET_DIM_DEFAULT_TX_CQ_PKTS_FROM_EQE 128
+#define NET_DIM_DEF_PROFILE_CQE 1
+#define NET_DIM_DEF_PROFILE_EQE 1
+
 /*
  * Number of events between DIM iterations.
  * Causes a moderation of the algorithm run.
-- 
2.32.0.3.g01195cf9f




[PATCH net-next v10 2/4] ethtool: provide customized dim profile management

2024-04-25 Thread Heng Qi
The NetDIM library, currently leveraged by an array of NICs, delivers
excellent acceleration benefits. Nevertheless, NICs vary significantly
in their dim profile list prerequisites.

Specifically, virtio-net backends may present diverse sw or hw device
implementations, making a one-size-fits-all parameter list impractical.
On Alibaba Cloud, the virtio DPU's performance under the default DIM
profile falls short of expectations, partly due to a mismatch in
parameter configuration.

I also noticed that ice/idpf/ena and other NICs have customized
profile lists or placed some restrictions on dim capabilities.

Motivated by this, I tried adding new params for "ethtool -C" that provide
a per-device control to modify and access a device's interrupt parameters.

Usage

The target NIC is named ethx.

Assume that ethx only declares support for rx profile setting
(with DIM_PROFILE_RX flag set in profile_flags) and supports modification
of usec and pkt fields.

1. Query the currently customized list of the device

$ ethtool -c ethx
...
rx-profile:
{.usec =   1, .pkts = 256, .comps = n/a,},
{.usec =   8, .pkts = 256, .comps = n/a,},
{.usec =  64, .pkts = 256, .comps = n/a,},
{.usec = 128, .pkts = 256, .comps = n/a,},
{.usec = 256, .pkts = 256, .comps = n/a,}
tx-profile:   n/a

2. Tune
$ ethtool -C ethx rx-profile 1,1,n_2,n,n_3,3,n_4,4,n_n,5,n
"n" means do not modify this field.
$ ethtool -c ethx
...
rx-profile:
{.usec =   1, .pkts =   1, .comps = n/a,},
{.usec =   2, .pkts = 256, .comps = n/a,},
{.usec =   3, .pkts =   3, .comps = n/a,},
{.usec =   4, .pkts =   4, .comps = n/a,},
{.usec = 256, .pkts =   5, .comps = n/a,}
tx-profile:   n/a

3. Hint
If the device does not support some type of customized dim profiles,
"n/a" will be displayed for the corresponding field.

If an "n/a" field is modified, -EOPNOTSUPP will be reported.

Signed-off-by: Heng Qi 
---
 Documentation/netlink/specs/ethtool.yaml |  23 ++
 Documentation/networking/ethtool-netlink.rst |   4 +
 include/linux/dim.h  |  60 +
 include/linux/ethtool.h  |   7 +-
 include/linux/netdevice.h|   5 +
 include/uapi/linux/ethtool_netlink.h |  20 ++
 lib/dim/net_dim.c|  73 +
 net/ethtool/coalesce.c   | 264 ++-
 8 files changed, 454 insertions(+), 2 deletions(-)

diff --git a/Documentation/netlink/specs/ethtool.yaml 
b/Documentation/netlink/specs/ethtool.yaml
index 87ae7b397984..3c51a1a0b5d9 100644
--- a/Documentation/netlink/specs/ethtool.yaml
+++ b/Documentation/netlink/specs/ethtool.yaml
@@ -413,6 +413,18 @@ attribute-sets:
   -
 name: combined-count
 type: u32
+  -
+name: irq-moderation
+attributes:
+  -
+name: usec
+type: u32
+  -
+name: pkts
+type: u32
+  -
+name: comps
+type: u32
 
   -
 name: coalesce
@@ -502,6 +514,15 @@ attribute-sets:
   -
 name: tx-aggr-time-usecs
 type: u32
+  -
+name: rx-profile
+type: nest
+nested-attributes: irq-moderation
+  -
+name: tx-profile
+type: nest
+nested-attributes: irq-moderation
+
   -
 name: pause-stat
 attributes:
@@ -1313,6 +1334,8 @@ operations:
 - tx-aggr-max-bytes
 - tx-aggr-max-frames
 - tx-aggr-time-usecs
+- rx-profile
+- tx-profile
   dump: *coalesce-get-op
 -
   name: coalesce-set
diff --git a/Documentation/networking/ethtool-netlink.rst 
b/Documentation/networking/ethtool-netlink.rst
index 4e63d3708ed9..78ee25081498 100644
--- a/Documentation/networking/ethtool-netlink.rst
+++ b/Documentation/networking/ethtool-netlink.rst
@@ -1040,6 +1040,8 @@ Kernel response contents:
   ``ETHTOOL_A_COALESCE_TX_AGGR_MAX_BYTES`` u32 max aggr size, Tx
   ``ETHTOOL_A_COALESCE_TX_AGGR_MAX_FRAMES``u32 max aggr packets, Tx
   ``ETHTOOL_A_COALESCE_TX_AGGR_TIME_USECS``u32 time (us), aggr, Tx
+  ``ETHTOOL_A_COALESCE_RX_PROFILE``nested  profile of DIM, Rx
+  ``ETHTOOL_A_COALESCE_TX_PROFILE``nested  profile of DIM, Tx
   ===  ==  ===
 
 Attributes are only included in reply if their value is not zero or the
@@ -1105,6 +1107,8 @@ Request contents:
   ``ETHTOOL_A_COALESCE_TX_AGGR_MAX_BYTES`` u32 max aggr size, Tx
   ``ETHTOOL_A_COALESCE_TX_AGGR_MAX_FRAMES``u32 max aggr packets, Tx
   ``ETHTOOL_A_COALESCE_TX_AGGR_TIME_USECS``u32 time (us), aggr, Tx
+  ``ETHTOOL_A_COALESCE_RX_PROFILE``nested  profile of DIM, Rx
+  ``ETHTOOL_A_COALESCE_TX_PROFILE``nested  profile of DIM, Tx
   ===  ==  ===
 
 Request is rejected if it attributes declared as unsupported by driver (i.e.
diff --git a/include/linux/dim.h b/include/linux/dim.h
index 43398f5e

[PATCH net-next v10 4/4] virtio-net: support dim profile fine-tuning

2024-04-25 Thread Heng Qi
Virtio-net has different types of back-end device implementations.
In order to effectively optimize the dim library's gains for different
device implementations, let's use the new interface params to
initialize and query dim results from a customized profile list.

Signed-off-by: Heng Qi 
---
 drivers/net/virtio_net.c | 44 +---
 1 file changed, 37 insertions(+), 7 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 115c3c5414f2..555e6c9761da 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -2300,7 +2300,7 @@ static int virtnet_open(struct net_device *dev)
 
for (i--; i >= 0; i--) {
virtnet_disable_queue_pair(vi, i);
-   cancel_work_sync(&vi->rq[i].dim.work);
+   net_dim_work_cancel(&vi->rq[i].dim);
}
 
return err;
@@ -2466,7 +2466,7 @@ static int virtnet_rx_resize(struct virtnet_info *vi,
 
if (running) {
napi_disable(&rq->napi);
-   cancel_work_sync(&rq->dim.work);
+   net_dim_work_cancel(&rq->dim);
}
 
err = virtqueue_resize(rq->vq, ring_num, virtnet_rq_unmap_free_buf);
@@ -2718,7 +2718,7 @@ static int virtnet_close(struct net_device *dev)
 
for (i = 0; i < vi->max_queue_pairs; i++) {
virtnet_disable_queue_pair(vi, i);
-   cancel_work_sync(&vi->rq[i].dim.work);
+   net_dim_work_cancel(&vi->rq[i].dim);
}
 
return 0;
@@ -3580,7 +3580,7 @@ static void virtnet_rx_dim_work(struct work_struct *work)
if (!rq->dim_enabled)
continue;
 
-   update_moder = net_dim_get_rx_moderation(dim->mode, 
dim->profile_ix);
+   update_moder = net_dim_get_rx_irq_moder(dev, dim);
if (update_moder.usec != rq->intr_coal.max_usecs ||
update_moder.pkts != rq->intr_coal.max_packets) {
err = virtnet_send_rx_ctrl_coal_vq_cmd(vi, qnum,
@@ -4182,6 +4182,33 @@ static void virtnet_tx_timeout(struct net_device *dev, 
unsigned int txqueue)
   jiffies_to_usecs(jiffies - READ_ONCE(txq->trans_start)));
 }
 
+static int virtnet_init_irq_moder(struct virtnet_info *vi)
+{
+   u8 profile_flags = 0, coal_flags = 0;
+   struct net_device *dev = vi->dev;
+   int ret, i;
+
+   profile_flags |= DIM_PROFILE_RX;
+   coal_flags |= DIM_COALESCE_USEC | DIM_COALESCE_PKTS;
+   ret = net_dim_init_irq_moder(dev, profile_flags, coal_flags,
+DIM_CQ_PERIOD_MODE_START_FROM_EQE,
+0, virtnet_rx_dim_work, NULL);
+
+   if (ret)
+   return ret;
+
+   for (i = 0; i < vi->max_queue_pairs; i++)
+   net_dim_setting(vi->dev, &vi->rq[i].dim, false);
+
+   return 0;
+}
+
+static void virtnet_free_irq_moder(struct virtnet_info *vi)
+{
+   if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_VQ_NOTF_COAL))
+   net_dim_free_irq_moder(vi->dev);
+}
+
 static const struct net_device_ops virtnet_netdev = {
.ndo_open= virtnet_open,
.ndo_stop= virtnet_close,
@@ -4461,9 +4488,6 @@ static int virtnet_alloc_queues(struct virtnet_info *vi)
 virtnet_poll_tx,
 napi_tx ? napi_weight : 0);
 
-   INIT_WORK(&vi->rq[i].dim.work, virtnet_rx_dim_work);
-   vi->rq[i].dim.mode = DIM_CQ_PERIOD_MODE_START_FROM_EQE;
-
sg_init_table(vi->rq[i].sg, ARRAY_SIZE(vi->rq[i].sg));
ewma_pkt_len_init(&vi->rq[i].mrg_avg_pkt_len);
sg_init_table(vi->sq[i].sg, ARRAY_SIZE(vi->sq[i].sg));
@@ -4837,6 +4861,10 @@ static int virtnet_probe(struct virtio_device *vdev)
for (i = 0; i < vi->max_queue_pairs; i++)
if (vi->sq[i].napi.weight)
vi->sq[i].intr_coal.max_packets = 1;
+
+   err = virtnet_init_irq_moder(vi);
+   if (err)
+   goto free;
}
 
 #ifdef CONFIG_SYSFS
@@ -4961,6 +4989,8 @@ static void virtnet_remove(struct virtio_device *vdev)
disable_rx_mode_work(vi);
flush_work(&vi->rx_mode_work);
 
+   virtnet_free_irq_moder(vi);
+
unregister_netdev(vi->dev);
 
net_failover_destroy(vi->failover);
-- 
2.32.0.3.g01195cf9f