date:20240914

[PATCH RFC v3 0/9] tun: Introduce virtio-net hashing feature

2024-09-14 Thread Akihiko Odaki

virtio-net has two uses for hashes: one is RSS and the other is hash
reporting. Conventionally the hash calculation was done by the VMM.
However, computing the hash after the queue was chosen defeats the
purpose of RSS.

Another approach is to use an eBPF steering program. This approach has
another downside: it cannot report the calculated hash due to the
restrictive nature of eBPF.

Introduce code to compute hashes in the kernel in order to overcome
these challenges.

An alternative solution is to extend the eBPF steering program so that it
will be able to report to userspace, but it is based on context
rewrites, which is in feature freeze. We can adopt kfuncs, but they will
not be UAPIs. We opt for an ioctl to align with other relevant UAPIs (KVM
and vhost_net).

QEMU patched to use this new feature is available at:
https://github.com/daynix/qemu/tree/akihikodaki/rss2

The QEMU patches will soon be submitted upstream as an RFC too.

This work will be presented at LPC 2024:
https://lpc.events/event/18/contributions/1963/

V1 -> V2:
  Changed to introduce a new BPF program type.

Signed-off-by: Akihiko Odaki 
---
Changes in v3:
- Reverted back to add ioctl.
- Split patch "tun: Introduce virtio-net hashing feature" into
  "tun: Introduce virtio-net hash reporting feature" and
  "tun: Introduce virtio-net RSS".
- Changed to reuse hash values computed for automq instead of performing
  RSS hashing when hash reporting is requested but RSS is not.
- Extracted relevant data from struct tun_struct to keep it minimal.
- Added kernel-doc.
- Changed to allow calling TUNGETVNETHASHCAP before TUNSETIFF.
- Initialized num_buffers with 1.
- Added a test case for unclassified packets.
- Fixed error handling in tests.
- Changed tests to verify that the queue index will not overflow.
- Rebased.
- Link to v2: 
https://lore.kernel.org/r/20231015141644.260646-1-akihiko.od...@daynix.com

---
Akihiko Odaki (9):
  skbuff: Introduce SKB_EXT_TUN_VNET_HASH
  virtio_net: Add functions for hashing
  net: flow_dissector: Export flow_keys_dissector_symmetric
  tap: Pad virtio header with zero
  tun: Pad virtio header with zero
  tun: Introduce virtio-net hash reporting feature
  tun: Introduce virtio-net RSS
  selftest: tun: Add tests for virtio-net hashing
  vhost/net: Support VIRTIO_NET_F_HASH_REPORT

 Documentation/networking/tuntap.rst  |   7 +
 drivers/net/Kconfig  |   1 +
 drivers/net/tap.c|   2 +-
 drivers/net/tun.c| 255 --
 drivers/vhost/net.c  |  16 +-
 include/linux/skbuff.h   |  10 +
 include/linux/virtio_net.h   | 198 +++
 include/net/flow_dissector.h |   1 +
 include/uapi/linux/if_tun.h  |  71 
 net/core/flow_dissector.c|   3 +-
 net/core/skbuff.c|   3 +
 tools/testing/selftests/net/Makefile |   2 +-
 tools/testing/selftests/net/tun.c| 666 ++-
 13 files changed, 1195 insertions(+), 40 deletions(-)
---
base-commit: 46a0057a5853cbdb58211c19e89badc6fd50
change-id: 20240403-rss-e737d89efa77

Best regards,
-- 
Akihiko Odaki

[PATCH RFC v3 1/9] skbuff: Introduce SKB_EXT_TUN_VNET_HASH

2024-09-14 Thread Akihiko Odaki

This new extension will be used by tun to carry the hash values and
types to report with virtio-net headers.

Signed-off-by: Akihiko Odaki 
---
 include/linux/skbuff.h | 10 ++
 net/core/skbuff.c  |  3 +++
 2 files changed, 13 insertions(+)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 29c3ea5b6e93..17cee21c 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -334,6 +334,13 @@ struct tc_skb_ext {
 };
 #endif
 
+#if IS_ENABLED(CONFIG_TUN)
+struct tun_vnet_hash_ext {
+   u32 value;
+   u16 report;
+};
+#endif
+
 struct sk_buff_head {
/* These two members must be first to match sk_buff. */
struct_group_tagged(sk_buff_list, list,
@@ -4718,6 +4725,9 @@ enum skb_ext_id {
 #endif
 #if IS_ENABLED(CONFIG_MCTP_FLOWS)
SKB_EXT_MCTP,
+#endif
+#if IS_ENABLED(CONFIG_TUN)
+   SKB_EXT_TUN_VNET_HASH,
 #endif
SKB_EXT_NUM, /* must be last */
 };
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 83f8cd8aa2d1..ce34523fd8de 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -4979,6 +4979,9 @@ static const u8 skb_ext_type_len[] = {
 #if IS_ENABLED(CONFIG_MCTP_FLOWS)
[SKB_EXT_MCTP] = SKB_EXT_CHUNKSIZEOF(struct mctp_flow),
 #endif
+#if IS_ENABLED(CONFIG_TUN)
+   [SKB_EXT_TUN_VNET_HASH] = SKB_EXT_CHUNKSIZEOF(struct tun_vnet_hash_ext),
+#endif
 };
 
 static __always_inline unsigned int skb_ext_total_length(void)

-- 
2.46.0

[PATCH RFC v3 2/9] virtio_net: Add functions for hashing

2024-09-14 Thread Akihiko Odaki

They are useful to implement VIRTIO_NET_F_RSS and
VIRTIO_NET_F_HASH_REPORT.

Signed-off-by: Akihiko Odaki 
---
 include/linux/virtio_net.h | 198 +
 1 file changed, 198 insertions(+)

diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h
index 6c395a2600e8..7ee2e2f2625a 100644
--- a/include/linux/virtio_net.h
+++ b/include/linux/virtio_net.h
@@ -9,6 +9,183 @@
 #include 
 #include 
 
+struct virtio_net_hash {
+   u32 value;
+   u16 report;
+};
+
+struct virtio_net_toeplitz_state {
+   u32 hash;
+   u32 key_buffer;
+   const __be32 *key;
+};
+
+#define VIRTIO_NET_SUPPORTED_HASH_TYPES (VIRTIO_NET_RSS_HASH_TYPE_IPv4 | \
+VIRTIO_NET_RSS_HASH_TYPE_TCPv4 | \
+VIRTIO_NET_RSS_HASH_TYPE_UDPv4 | \
+VIRTIO_NET_RSS_HASH_TYPE_IPv6 | \
+VIRTIO_NET_RSS_HASH_TYPE_TCPv6 | \
+VIRTIO_NET_RSS_HASH_TYPE_UDPv6)
+
+#define VIRTIO_NET_RSS_MAX_KEY_SIZE 40
+
+static inline void virtio_net_toeplitz(struct virtio_net_toeplitz_state *state,
+  const __be32 *input, size_t len)
+{
+   u32 key;
+
+   while (len) {
+   state->key++;
+   key = be32_to_cpu(*state->key);
+
+   for (u32 bit = BIT(31); bit; bit >>= 1) {
+   if (be32_to_cpu(*input) & bit)
+   state->hash ^= state->key_buffer;
+
+   state->key_buffer =
+   (state->key_buffer << 1) | !!(key & bit);
+   }
+
+   input++;
+   len--;
+   }
+}
+
+static inline u8 virtio_net_hash_key_length(u32 types)
+{
+   size_t len = 0;
+
+   if (types & VIRTIO_NET_HASH_REPORT_IPv4)
+   len = max(len,
+ sizeof(struct flow_dissector_key_ipv4_addrs));
+
+   if (types &
+   (VIRTIO_NET_HASH_REPORT_TCPv4 | VIRTIO_NET_HASH_REPORT_UDPv4))
+   len = max(len,
+ sizeof(struct flow_dissector_key_ipv4_addrs) +
+ sizeof(struct flow_dissector_key_ports));
+
+   if (types & VIRTIO_NET_HASH_REPORT_IPv6)
+   len = max(len,
+ sizeof(struct flow_dissector_key_ipv6_addrs));
+
+   if (types &
+   (VIRTIO_NET_HASH_REPORT_TCPv6 | VIRTIO_NET_HASH_REPORT_UDPv6))
+   len = max(len,
+ sizeof(struct flow_dissector_key_ipv6_addrs) +
+ sizeof(struct flow_dissector_key_ports));
+
+   return 4 + len;
+}
+
+static inline u32 virtio_net_hash_report(u32 types,
+struct flow_dissector_key_basic key)
+{
+   switch (key.n_proto) {
+   case htons(ETH_P_IP):
+   if (key.ip_proto == IPPROTO_TCP &&
+   (types & VIRTIO_NET_RSS_HASH_TYPE_TCPv4))
+   return VIRTIO_NET_HASH_REPORT_TCPv4;
+
+   if (key.ip_proto == IPPROTO_UDP &&
+   (types & VIRTIO_NET_RSS_HASH_TYPE_UDPv4))
+   return VIRTIO_NET_HASH_REPORT_UDPv4;
+
+   if (types & VIRTIO_NET_RSS_HASH_TYPE_IPv4)
+   return VIRTIO_NET_HASH_REPORT_IPv4;
+
+   return VIRTIO_NET_HASH_REPORT_NONE;
+
+   case htons(ETH_P_IPV6):
+   if (key.ip_proto == IPPROTO_TCP &&
+   (types & VIRTIO_NET_RSS_HASH_TYPE_TCPv6))
+   return VIRTIO_NET_HASH_REPORT_TCPv6;
+
+   if (key.ip_proto == IPPROTO_UDP &&
+   (types & VIRTIO_NET_RSS_HASH_TYPE_UDPv6))
+   return VIRTIO_NET_HASH_REPORT_UDPv6;
+
+   if (types & VIRTIO_NET_RSS_HASH_TYPE_IPv6)
+   return VIRTIO_NET_HASH_REPORT_IPv6;
+
+   return VIRTIO_NET_HASH_REPORT_NONE;
+
+   default:
+   return VIRTIO_NET_HASH_REPORT_NONE;
+   }
+}
+
+static inline bool virtio_net_hash_rss(const struct sk_buff *skb,
+  u32 types, const __be32 *key,
+  struct virtio_net_hash *hash)
+{
+   u16 report;
+   struct virtio_net_toeplitz_state toeplitz_state = {
+   .key_buffer = be32_to_cpu(*key),
+   .key = key
+   };
+   struct flow_keys flow;
+
+   if (!skb_flow_dissect_flow_keys(skb, &flow, 0))
+   return false;
+
+   report = virtio_net_hash_report(types, flow.basic);
+
+   switch (report) {
+   case VIRTIO_NET_HASH_REPORT_IPv4:
+   virtio_net_toeplitz(&toeplitz_state,
+   (__be32 *)&flow.addrs.v4addrs,
+   sizeof(flow.addrs.v4addrs) / 4);
+   break;
+
+   case VIRTIO_NET_HASH_REPORT_TCPv4:
+

[PATCH RFC v3 3/9] net: flow_dissector: Export flow_keys_dissector_symmetric

2024-09-14 Thread Akihiko Odaki

flow_keys_dissector_symmetric is useful to derive a symmetric hash
and to know its source such as IPv4, IPv6, TCP, and UDP.

Signed-off-by: Akihiko Odaki 
---
 include/net/flow_dissector.h | 1 +
 net/core/flow_dissector.c| 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index ced79dc8e856..d01c1ec77b7d 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -423,6 +423,7 @@ __be32 flow_get_u32_src(const struct flow_keys *flow);
 __be32 flow_get_u32_dst(const struct flow_keys *flow);
 
 extern struct flow_dissector flow_keys_dissector;
+extern struct flow_dissector flow_keys_dissector_symmetric;
 extern struct flow_dissector flow_keys_basic_dissector;
 
 /* struct flow_keys_digest:
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 0e638a37aa09..9822988f2d49 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -1852,7 +1852,8 @@ void make_flow_keys_digest(struct flow_keys_digest 
*digest,
 }
 EXPORT_SYMBOL(make_flow_keys_digest);
 
-static struct flow_dissector flow_keys_dissector_symmetric __read_mostly;
+struct flow_dissector flow_keys_dissector_symmetric __read_mostly;
+EXPORT_SYMBOL(flow_keys_dissector_symmetric);
 
 u32 __skb_get_hash_symmetric_net(const struct net *net, const struct sk_buff 
*skb)
 {

-- 
2.46.0

[PATCH RFC v3 4/9] tap: Pad virtio header with zero

2024-09-14 Thread Akihiko Odaki

tap used to simply advance the iov_iter when it needed to pad the virtio
header. This leaves garbage in the buffer and makes it impossible to tell
whether the header is padded or contains real data.

In theory, a user of tap can fill the buffer with zero before calling
read() to avoid such a problem, but leaving the garbage in the buffer is
awkward anyway so fill the buffer in tap.

Signed-off-by: Akihiko Odaki 
---
 drivers/net/tap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index 77574f7a3bd4..ba044302ccc6 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -813,7 +813,7 @@ static ssize_t tap_put_user(struct tap_queue *q,
sizeof(vnet_hdr))
return -EFAULT;
 
-   iov_iter_advance(iter, vnet_hdr_len - sizeof(vnet_hdr));
+   iov_iter_zero(vnet_hdr_len - sizeof(vnet_hdr), iter);
}
total = vnet_hdr_len;
total += skb->len;

-- 
2.46.0

[PATCH RFC v3 5/9] tun: Pad virtio header with zero

2024-09-14 Thread Akihiko Odaki

tun used to simply advance the iov_iter when it needed to pad the virtio
header. This leaves garbage in the buffer and makes it impossible to tell
whether the header is padded or contains real data.

In theory, a user of tun can fill the buffer with zero before calling
read() to avoid such a problem, but leaving the garbage in the buffer is
awkward anyway so fill the buffer in tun.

Signed-off-by: Akihiko Odaki 
---
 drivers/net/tun.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 1d06c560c5e6..9d93ab9ee58f 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -2073,7 +2073,7 @@ static ssize_t tun_put_user_xdp(struct tun_struct *tun,
if (unlikely(copy_to_iter(&gso, sizeof(gso), iter) !=
 sizeof(gso)))
return -EFAULT;
-   iov_iter_advance(iter, vnet_hdr_sz - sizeof(gso));
+   iov_iter_zero(vnet_hdr_sz - sizeof(gso), iter);
}
 
ret = copy_to_iter(xdp_frame->data, size, iter) + vnet_hdr_sz;
@@ -2146,7 +2146,7 @@ static ssize_t tun_put_user(struct tun_struct *tun,
if (copy_to_iter(&gso, sizeof(gso), iter) != sizeof(gso))
return -EFAULT;
 
-   iov_iter_advance(iter, vnet_hdr_sz - sizeof(gso));
+   iov_iter_zero(vnet_hdr_sz - sizeof(gso), iter);
}
 
if (vlan_hlen) {

-- 
2.46.0

[PATCH RFC v3 6/9] tun: Introduce virtio-net hash reporting feature

2024-09-14 Thread Akihiko Odaki

Allow the guest to reuse the hash value to make receive steering
consistent between the host and guest, and to save hash computation.

Signed-off-by: Akihiko Odaki 
---
 Documentation/networking/tuntap.rst |   7 ++
 drivers/net/Kconfig |   1 +
 drivers/net/tun.c   | 146 +++-
 include/uapi/linux/if_tun.h |  44 +++
 4 files changed, 180 insertions(+), 18 deletions(-)

diff --git a/Documentation/networking/tuntap.rst 
b/Documentation/networking/tuntap.rst
index 4d7087f727be..86b4ae8caa8a 100644
--- a/Documentation/networking/tuntap.rst
+++ b/Documentation/networking/tuntap.rst
@@ -206,6 +206,13 @@ enable is true we enable it, otherwise we disable it::
   return ioctl(fd, TUNSETQUEUE, (void *)&ifr);
   }
 
+3.4 Reference
+-------------
+
+``linux/if_tun.h`` defines the interface described below:
+
+.. kernel-doc:: include/uapi/linux/if_tun.h
+
 Universal TUN/TAP device driver Frequently Asked Question
 =========================================================
 
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 9920b3a68ed1..e2a7bd703550 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -395,6 +395,7 @@ config TUN
tristate "Universal TUN/TAP device driver support"
depends on INET
select CRC32
+   select SKB_EXTENSIONS
help
  TUN/TAP provides packet reception and transmission for user space
  programs.  It can be viewed as a simple Point-to-Point or Ethernet
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 9d93ab9ee58f..b8fcd71becac 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -173,6 +173,10 @@ struct tun_prog {
struct bpf_prog *prog;
 };
 
+struct tun_vnet_hash_container {
+   struct tun_vnet_hash common;
+};
+
 /* Since the socket were moved to tun_file, to preserve the behavior of persist
  * device, socket filter, sndbuf and vnet header size were restore when the
  * file were attached to a persist device.
@@ -210,6 +214,7 @@ struct tun_struct {
struct bpf_prog __rcu *xdp_prog;
struct tun_prog __rcu *steering_prog;
struct tun_prog __rcu *filter_prog;
+   struct tun_vnet_hash_container __rcu *vnet_hash;
struct ethtool_link_ksettings link_ksettings;
/* init args */
struct file *file;
@@ -221,6 +226,11 @@ struct veth {
__be16 h_vlan_TCI;
 };
 
+static const struct tun_vnet_hash tun_vnet_hash_cap = {
+   .flags = TUN_VNET_HASH_REPORT,
+   .types = VIRTIO_NET_SUPPORTED_HASH_TYPES
+};
+
 static void tun_flow_init(struct tun_struct *tun);
 static void tun_flow_uninit(struct tun_struct *tun);
 
@@ -322,10 +332,17 @@ static long tun_set_vnet_be(struct tun_struct *tun, int 
__user *argp)
if (get_user(be, argp))
return -EFAULT;
 
-   if (be)
+   if (be) {
+   struct tun_vnet_hash_container *vnet_hash = 
rtnl_dereference(tun->vnet_hash);
+
+   if (!(tun->flags & TUN_VNET_LE) &&
+   vnet_hash && (vnet_hash->flags & TUN_VNET_HASH_REPORT))
+   return -EBUSY;
+
tun->flags |= TUN_VNET_BE;
-   else
+   } else {
tun->flags &= ~TUN_VNET_BE;
+   }
 
return 0;
 }
@@ -522,14 +539,20 @@ static inline void tun_flow_save_rps_rxhash(struct 
tun_flow_entry *e, u32 hash)
  * the userspace application move between processors, we may get a
  * different rxq no. here.
  */
-static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb)
+static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb,
+  const struct tun_vnet_hash_container 
*vnet_hash)
 {
+   struct tun_vnet_hash_ext *ext;
+   struct flow_keys keys;
struct tun_flow_entry *e;
u32 txq, numqueues;
 
numqueues = READ_ONCE(tun->numqueues);
 
-   txq = __skb_get_hash_symmetric(skb);
+   memset(&keys, 0, sizeof(keys));
+   skb_flow_dissect(skb, &flow_keys_dissector_symmetric, &keys, 0);
+
+   txq = flow_hash_from_keys(&keys);
e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq);
if (e) {
tun_flow_save_rps_rxhash(e, txq);
@@ -538,6 +561,16 @@ static u16 tun_automq_select_queue(struct tun_struct *tun, 
struct sk_buff *skb)
txq = reciprocal_scale(txq, numqueues);
}
 
+   if (vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_REPORT)) {
+   ext = skb_ext_add(skb, SKB_EXT_TUN_VNET_HASH);
+   if (ext) {
+   u32 types = vnet_hash->common.types;
+
+   ext->report = virtio_net_hash_report(types, keys.basic);
+   ext->value = skb->l4_hash ? skb->hash : txq;
+   }
+   }
+
return txq;
 }
 
@@ -565,10 +598,13 @@ static u16 tun_select_queue(struct net_device *dev, 
struct sk_buff *skb,
u16 ret;
 
rcu_rea

[PATCH RFC v3 7/9] tun: Introduce virtio-net RSS

2024-09-14 Thread Akihiko Odaki

RSS is a receive steering algorithm that can be negotiated to use with
virtio_net. Conventionally the hash calculation was done by the VMM.
However, computing the hash after the queue was chosen defeats the
purpose of RSS.

Another approach is to use an eBPF steering program. This approach has
another downside: it cannot report the calculated hash due to the
restrictive nature of the eBPF steering program.

Introduce code to perform RSS in the kernel in order to overcome
these challenges. An alternative solution is to extend the eBPF steering
program so that it will be able to report to userspace, but I didn't
opt for it because the current mechanism of the eBPF steering program
relies on legacy context rewriting, and introducing kfunc-based eBPF
would result in a non-UAPI dependency while the other relevant
virtualization APIs such as KVM and vhost_net are UAPIs.

Signed-off-by: Akihiko Odaki 
---
 drivers/net/tun.c   | 119 +++-
 include/uapi/linux/if_tun.h |  27 ++
 2 files changed, 133 insertions(+), 13 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index b8fcd71becac..5a429b391144 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -175,6 +175,9 @@ struct tun_prog {
 
 struct tun_vnet_hash_container {
struct tun_vnet_hash common;
+   struct tun_vnet_hash_rss rss;
+   __be32 rss_key[VIRTIO_NET_RSS_MAX_KEY_SIZE];
+   u16 rss_indirection_table[];
 };
 
 /* Since the socket were moved to tun_file, to preserve the behavior of persist
@@ -227,7 +230,7 @@ struct veth {
 };
 
 static const struct tun_vnet_hash tun_vnet_hash_cap = {
-   .flags = TUN_VNET_HASH_REPORT,
+   .flags = TUN_VNET_HASH_REPORT | TUN_VNET_HASH_RSS,
.types = VIRTIO_NET_SUPPORTED_HASH_TYPES
 };
 
@@ -591,6 +594,36 @@ static u16 tun_ebpf_select_queue(struct tun_struct *tun, 
struct sk_buff *skb)
return ret % numqueues;
 }
 
+static u16 tun_vnet_rss_select_queue(struct tun_struct *tun,
+struct sk_buff *skb,
+const struct tun_vnet_hash_container 
*vnet_hash)
+{
+   struct tun_vnet_hash_ext *ext;
+   struct virtio_net_hash hash;
+   u32 numqueues = READ_ONCE(tun->numqueues);
+   u16 txq, index;
+
+   if (!numqueues)
+   return 0;
+
+   if (!virtio_net_hash_rss(skb, vnet_hash->common.types, 
vnet_hash->rss_key,
+&hash))
+   return vnet_hash->rss.unclassified_queue % numqueues;
+
+   if (vnet_hash->common.flags & TUN_VNET_HASH_REPORT) {
+   ext = skb_ext_add(skb, SKB_EXT_TUN_VNET_HASH);
+   if (ext) {
+   ext->value = hash.value;
+   ext->report = hash.report;
+   }
+   }
+
+   index = hash.value & vnet_hash->rss.indirection_table_mask;
+   txq = READ_ONCE(vnet_hash->rss_indirection_table[index]);
+
+   return txq % numqueues;
+}
+
 static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb,
struct net_device *sb_dev)
 {
@@ -603,7 +636,10 @@ static u16 tun_select_queue(struct net_device *dev, struct 
sk_buff *skb,
} else {
struct tun_vnet_hash_container *vnet_hash = 
rcu_dereference(tun->vnet_hash);
 
-   ret = tun_automq_select_queue(tun, skb, vnet_hash);
+   if (vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_RSS))
+   ret = tun_vnet_rss_select_queue(tun, skb, vnet_hash);
+   else
+   ret = tun_automq_select_queue(tun, skb, vnet_hash);
}
rcu_read_unlock();
 
@@ -3085,13 +3121,9 @@ static int tun_set_queue(struct file *file, struct ifreq 
*ifr)
 }
 
 static int tun_set_ebpf(struct tun_struct *tun, struct tun_prog __rcu **prog_p,
-   void __user *data)
+   int fd)
 {
struct bpf_prog *prog;
-   int fd;
-
-   if (copy_from_user(&fd, data, sizeof(fd)))
-   return -EFAULT;
 
if (fd == -1) {
prog = NULL;
@@ -3157,6 +3189,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned 
int cmd,
int ifindex;
int sndbuf;
int vnet_hdr_sz;
+   int fd;
int le;
int ret;
bool do_notify = false;
@@ -3460,11 +3493,27 @@ static long __tun_chr_ioctl(struct file *file, unsigned 
int cmd,
break;
 
case TUNSETSTEERINGEBPF:
-   ret = tun_set_ebpf(tun, &tun->steering_prog, argp);
+   if (get_user(fd, (int __user *)argp)) {
+   ret = -EFAULT;
+   break;
+   }
+
+   vnet_hash = rtnl_dereference(tun->vnet_hash);
+   if (fd != -1 && vnet_hash && (vnet_hash->common.flags & 
TUN_VNET_HASH_RSS)) {
+   ret = -EBUSY;
+   break;
+

[PATCH RFC v3 8/9] selftest: tun: Add tests for virtio-net hashing

2024-09-14 Thread Akihiko Odaki

The added tests confirm tun can perform RSS and hash reporting, and
reject invalid configurations for them.

Signed-off-by: Akihiko Odaki 
---
 tools/testing/selftests/net/Makefile |   2 +-
 tools/testing/selftests/net/tun.c| 666 ++-
 2 files changed, 660 insertions(+), 8 deletions(-)

diff --git a/tools/testing/selftests/net/Makefile 
b/tools/testing/selftests/net/Makefile
index 8eaffd7a641c..5629e68bf69d 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -109,6 +109,6 @@ $(OUTPUT)/reuseport_bpf_numa: LDLIBS += -lnuma
 $(OUTPUT)/tcp_mmap: LDLIBS += -lpthread -lcrypto
 $(OUTPUT)/tcp_inq: LDLIBS += -lpthread
 $(OUTPUT)/bind_bhash: LDLIBS += -lpthread
-$(OUTPUT)/io_uring_zerocopy_tx: CFLAGS += -I../../../include/
+$(OUTPUT)/io_uring_zerocopy_tx $(OUTPUT)/tun: CFLAGS += -I../../../include/
 
 include bpf.mk
diff --git a/tools/testing/selftests/net/tun.c 
b/tools/testing/selftests/net/tun.c
index fa83918b62d1..f46affa39d5c 100644
--- a/tools/testing/selftests/net/tun.c
+++ b/tools/testing/selftests/net/tun.c
@@ -2,21 +2,37 @@
 
 #define _GNU_SOURCE
 
+#include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
 #include 
-#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
 #include 
+#include 
 #include 
 #include 
-#include 
-#include 
+#include 
+#include 
+#include 
+#include 
 
 #include "../kselftest_harness.h"
 
+#define TUN_HWADDR_SOURCE { 0x02, 0x00, 0x00, 0x00, 0x00, 0x00 }
+#define TUN_HWADDR_DEST { 0x02, 0x00, 0x00, 0x00, 0x00, 0x01 }
+#define TUN_IPADDR_SOURCE htonl((172 << 24) | (17 << 16) | 0)
+#define TUN_IPADDR_DEST htonl((172 << 24) | (17 << 16) | 1)
+
 static int tun_attach(int fd, char *dev)
 {
struct ifreq ifr;
@@ -39,7 +55,7 @@ static int tun_detach(int fd, char *dev)
return ioctl(fd, TUNSETQUEUE, (void *) &ifr);
 }
 
-static int tun_alloc(char *dev)
+static int tun_alloc(char *dev, short flags)
 {
struct ifreq ifr;
int fd, err;
@@ -52,7 +68,8 @@ static int tun_alloc(char *dev)
 
memset(&ifr, 0, sizeof(ifr));
strcpy(ifr.ifr_name, dev);
-   ifr.ifr_flags = IFF_TAP | IFF_NAPI | IFF_MULTI_QUEUE;
+   ifr.ifr_flags = flags | IFF_TAP | IFF_NAPI | IFF_NO_PI |
+   IFF_MULTI_QUEUE;
 
err = ioctl(fd, TUNSETIFF, (void *) &ifr);
if (err < 0) {
@@ -64,6 +81,40 @@ static int tun_alloc(char *dev)
return fd;
 }
 
+static bool tun_add_to_bridge(int local_fd, const char *name)
+{
+   struct ifreq ifreq = {
+   .ifr_name = "xbridge",
+   .ifr_ifindex = if_nametoindex(name)
+   };
+
+   if (!ifreq.ifr_ifindex) {
+   perror("if_nametoindex");
+   return false;
+   }
+
+   if (ioctl(local_fd, SIOCBRADDIF, &ifreq)) {
+   perror("SIOCBRADDIF");
+   return false;
+   }
+
+   return true;
+}
+
+static bool tun_set_flags(int local_fd, const char *name, short flags)
+{
+   struct ifreq ifreq = { .ifr_flags = flags };
+
+   strcpy(ifreq.ifr_name, name);
+
+   if (ioctl(local_fd, SIOCSIFFLAGS, &ifreq)) {
+   perror("SIOCSIFFLAGS");
+   return false;
+   }
+
+   return true;
+}
+
 static int tun_delete(char *dev)
 {
struct {
@@ -102,6 +153,159 @@ static int tun_delete(char *dev)
return ret;
 }
 
+static uint32_t tun_sum(const void *buf, size_t len)
+{
+   const uint16_t *sbuf = buf;
+   uint32_t sum = 0;
+
+   while (len > 1) {
+   sum += *sbuf++;
+   len -= 2;
+   }
+
+   if (len)
+   sum += *(uint8_t *)sbuf;
+
+   return sum;
+}
+
+static uint16_t tun_build_ip_check(uint32_t sum)
+{
+   return ~((sum & 0x) + (sum >> 16));
+}
+
+static uint32_t tun_build_ip_pseudo_sum(const void *iphdr)
+{
+   uint16_t tot_len = ntohs(((struct iphdr *)iphdr)->tot_len);
+
+   return tun_sum((char *)iphdr + offsetof(struct iphdr, saddr), 8) +
+  htons(((struct iphdr *)iphdr)->protocol) +
+  htons(tot_len - sizeof(struct iphdr));
+}
+
+static uint32_t tun_build_ipv6_pseudo_sum(const void *ipv6hdr)
+{
+   return tun_sum((char *)ipv6hdr + offsetof(struct ipv6hdr, saddr), 32) +
+  ((struct ipv6hdr *)ipv6hdr)->payload_len +
+  htons(((struct ipv6hdr *)ipv6hdr)->nexthdr);
+}
+
+static void tun_build_ethhdr(struct ethhdr *ethhdr, uint16_t proto)
+{
+   *ethhdr = (struct ethhdr) {
+   .h_dest = TUN_HWADDR_DEST,
+   .h_source = TUN_HWADDR_SOURCE,
+   .h_proto = htons(proto)
+   };
+}
+
+static void tun_build_iphdr(void *dest, uint16_t len, uint8_t protocol)
+{
+   struct iphdr iphdr = {
+   .ihl = sizeof(iphdr) / 4,
+   .version = 4,
+   .tot_len = htons(sizeof(iphdr) + len),
+   .ttl = 255,
+   .protocol = protocol,
+

[PATCH RFC v3 9/9] vhost/net: Support VIRTIO_NET_F_HASH_REPORT

2024-09-14 Thread Akihiko Odaki

VIRTIO_NET_F_HASH_REPORT allows reporting hash values calculated on the
host. When VHOST_NET_F_VIRTIO_NET_HDR is employed, it will report no
hash values (i.e., the hash_report member is always set to
VIRTIO_NET_HASH_REPORT_NONE). Otherwise, the values reported by the
underlying socket will be reported.

VIRTIO_NET_F_HASH_REPORT requires VIRTIO_F_VERSION_1.

Signed-off-by: Akihiko Odaki 
---
 drivers/vhost/net.c | 16 
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index f16279351db5..ec1167a782ec 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -73,6 +73,7 @@ enum {
VHOST_NET_FEATURES = VHOST_FEATURES |
 (1ULL << VHOST_NET_F_VIRTIO_NET_HDR) |
 (1ULL << VIRTIO_NET_F_MRG_RXBUF) |
+(1ULL << VIRTIO_NET_F_HASH_REPORT) |
 (1ULL << VIRTIO_F_ACCESS_PLATFORM) |
 (1ULL << VIRTIO_F_RING_RESET)
 };
@@ -1604,10 +1605,13 @@ static int vhost_net_set_features(struct vhost_net *n, 
u64 features)
size_t vhost_hlen, sock_hlen, hdr_len;
int i;
 
-   hdr_len = (features & ((1ULL << VIRTIO_NET_F_MRG_RXBUF) |
-  (1ULL << VIRTIO_F_VERSION_1))) ?
-   sizeof(struct virtio_net_hdr_mrg_rxbuf) :
-   sizeof(struct virtio_net_hdr);
+   if (features & (1ULL << VIRTIO_NET_F_HASH_REPORT))
+   hdr_len = sizeof(struct virtio_net_hdr_v1_hash);
+   else if (features & ((1ULL << VIRTIO_NET_F_MRG_RXBUF) |
+(1ULL << VIRTIO_F_VERSION_1)))
+   hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
+   else
+   hdr_len = sizeof(struct virtio_net_hdr);
if (features & (1 << VHOST_NET_F_VIRTIO_NET_HDR)) {
/* vhost provides vnet_hdr */
vhost_hlen = hdr_len;
@@ -1688,6 +1692,10 @@ static long vhost_net_ioctl(struct file *f, unsigned int 
ioctl,
return -EFAULT;
if (features & ~VHOST_NET_FEATURES)
return -EOPNOTSUPP;
+   if ((features & ((1ULL << VIRTIO_F_VERSION_1) |
+(1ULL << VIRTIO_NET_F_HASH_REPORT))) ==
+   (1ULL << VIRTIO_NET_F_HASH_REPORT))
+   return -EINVAL;
return vhost_net_set_features(n, features);
case VHOST_GET_BACKEND_FEATURES:
features = VHOST_NET_BACKEND_FEATURES;

-- 
2.46.0

[PATCH RFC v3 0/9] tun: Introduce virtio-net hashing feature

[PATCH RFC v3 1/9] skbuff: Introduce SKB_EXT_TUN_VNET_HASH

[PATCH RFC v3 2/9] virtio_net: Add functions for hashing

[PATCH RFC v3 3/9] net: flow_dissector: Export flow_keys_dissector_symmetric

[PATCH RFC v3 4/9] tap: Pad virtio header with zero

[PATCH RFC v3 5/9] tun: Pad virtio header with zero

[PATCH RFC v3 6/9] tun: Introduce virtio-net hash reporting feature

[PATCH RFC v3 7/9] tun: Introduce virtio-net RSS

[PATCH RFC v3 8/9] selftest: tun: Add tests for virtio-net hashing

[PATCH RFC v3 9/9] vhost/net: Support VIRTIO_NET_F_HASH_REPORT

10 matches

Site Navigation

Mail list logo

Footer information