Re: [PATCH net-next v9 3/6] tun: Introduce virtio-net hash feature

Willem de Bruijn Sat, 08 Mar 2025 11:32:43 -0800

Akihiko Odaki wrote:
> Hash reporting
> ==============
> 
> Allow the guest to reuse the hash value to make receive steering
> consistent between the host and guest, and to save hash computation.
> 
> RSS
> ===
> 
> RSS is a receive steering algorithm that can be negotiated to use with
> virtio_net. Conventionally the hash calculation was done by the VMM.
> However, computing the hash after the queue was chosen defeats the
> purpose of RSS.
> 
> Another approach is to use eBPF steering program. This approach has
> another downside: it cannot report the calculated hash due to the
> restrictive nature of eBPF steering program.
> 
> Introduce the code to perform RSS to the kernel in order to overcome
> these challenges. An alternative solution is to extend the eBPF steering
> program so that it will be able to report to the userspace, but I didn't
> opt for it because extending the current mechanism of eBPF steering
> program as is would be problematic: it relies on legacy context rewriting, and
> introducing kfunc-based eBPF will result in non-UAPI dependency while
> the other relevant virtualization APIs such as KVM and vhost_net are
> UAPIs.
> 
> Signed-off-by: Akihiko Odaki <[email protected]>
> Tested-by: Lei Yang <[email protected]>
> ---
>  Documentation/networking/tuntap.rst |   7 ++
>  drivers/net/Kconfig                 |   1 +
>  drivers/net/tap.c                   |  68 ++++++++++++++-
>  drivers/net/tun.c                   |  98 +++++++++++++++++-----
>  drivers/net/tun_vnet.h              | 159 ++++++++++++++++++++++++++++++++++--
>  include/linux/if_tap.h              |   2 +
>  include/linux/skbuff.h              |   3 +
>  include/uapi/linux/if_tun.h         |  75 +++++++++++++++++
>  net/core/skbuff.c                   |   4 +
>  9 files changed, 386 insertions(+), 31 deletions(-)


This is arguably still doing too much in a single patch.

Can you split tap from tun? Move ioctl control operations out to their
own patch?

> 
> diff --git a/Documentation/networking/tuntap.rst 
> b/Documentation/networking/tuntap.rst
> index 
> 4d7087f727be5e37dfbf5066a9e9c872cc98898d..86b4ae8caa8ad062c1e558920be42ce0d4217465
>  100644
> --- a/Documentation/networking/tuntap.rst
> +++ b/Documentation/networking/tuntap.rst
> @@ -206,6 +206,13 @@ enable is true we enable it, otherwise we disable it::
>        return ioctl(fd, TUNSETQUEUE, (void *)&ifr);
>    }
>  
> +3.4 Reference
> +-------------
> +
> +``linux/if_tun.h`` defines the interface described below:
> +
> +.. kernel-doc:: include/uapi/linux/if_tun.h
> +
>  Universal TUN/TAP device driver Frequently Asked Question
>  =========================================================
>  
> diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
> index 
> 1fd5acdc73c6af0e1a861867039c3624fc618e25..aecfd244dd83585fea2c5b815dcd787c58166c28
>  100644
> --- a/drivers/net/Kconfig
> +++ b/drivers/net/Kconfig
> @@ -395,6 +395,7 @@ config TUN
>       tristate "Universal TUN/TAP device driver support"
>       depends on INET
>       select CRC32
> +     select SKB_EXTENSIONS
>       help
>         TUN/TAP provides packet reception and transmission for user space
>         programs.  It can be viewed as a simple Point-to-Point or Ethernet
> diff --git a/drivers/net/tap.c b/drivers/net/tap.c
> index 
> d4ece538f1b23789ca60caa6232690e4d0a4d14a..9428b63ec27e7f92e78a78afcb5e24383862c00d
>  100644
> --- a/drivers/net/tap.c
> +++ b/drivers/net/tap.c
> @@ -49,6 +49,10 @@ struct major_info {
>       struct list_head next;
>  };
>  
> +struct tap_skb_cb {
> +     struct virtio_net_hash hash;
> +};
> +
>  #define GOODCOPY_LEN 128
>  
>  static const struct proto_ops tap_socket_ops;
> @@ -179,6 +183,22 @@ static void tap_put_queue(struct tap_queue *q)
>       sock_put(&q->sk);
>  }
>  
> +static struct tap_skb_cb *tap_skb_cb(const struct sk_buff *skb)
> +{
> +     BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct tap_skb_cb));
> +     return (struct tap_skb_cb *)skb->cb;
> +}
> +
> +static struct virtio_net_hash *tap_add_hash(struct sk_buff *skb)
> +{
> +     return &tap_skb_cb(skb)->hash;
> +}
> +
> +static const struct virtio_net_hash *tap_find_hash(const struct sk_buff *skb)
> +{
> +     return &tap_skb_cb(skb)->hash;
> +}
> +

These two helpers do the same thing.

>  /*
>   * Select a queue based on the rxq of the device on which this packet
>   * arrived. If the incoming device is not mq, calculate a flow hash
> @@ -189,6 +209,7 @@ static void tap_put_queue(struct tap_queue *q)
>  static struct tap_queue *tap_get_queue(struct tap_dev *tap,
>                                      struct sk_buff *skb)
>  {
> +     struct flow_keys_basic keys_basic;
>       struct tap_queue *queue = NULL;
>       /* Access to taps array is protected by rcu, but access to numvtaps
>        * isn't. Below we use it to lookup a queue, but treat it as a hint
> @@ -196,17 +217,47 @@ static struct tap_queue *tap_get_queue(struct tap_dev 
> *tap,
>        * racing against queue removal.
>        */
>       int numvtaps = READ_ONCE(tap->numvtaps);
> +     struct tun_vnet_hash_container *vnet_hash = 
> rcu_dereference(tap->vnet_hash);
>       __u32 rxq;
>  
> +     *tap_skb_cb(skb) = (struct tap_skb_cb) {
> +             .hash = { .report = VIRTIO_NET_HASH_REPORT_NONE }
> +     };
> +
>       if (!numvtaps)
>               goto out;
>  
>       if (numvtaps == 1)
>               goto single;
>  
> +     if (vnet_hash) {
> +             if ((vnet_hash->common.flags & TUN_VNET_HASH_RSS)) {
> +                     rxq = tun_vnet_rss_select_queue(numvtaps, vnet_hash, 
> skb, tap_add_hash);
> +                     queue = rcu_dereference(tap->taps[rxq]);
> +                     goto out;

so tun_vnet_hash_report does not work in this case?

> +             }
> +
> +             if (!skb->l4_hash && !skb->sw_hash) {
> +                     struct flow_keys keys;
> +
> +                     skb_flow_dissect_flow_keys(skb, &keys, 
> FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
> +                     rxq = flow_hash_from_keys(&keys);
> +                     keys_basic = (struct flow_keys_basic) {
> +                             .control = keys.control,
> +                             .basic = keys.basic
> +                     };
> +             } else {
> +                     skb_flow_dissect_flow_keys_basic(NULL, skb, 
> &keys_basic, NULL, 0, 0, 0,
> +                                                      
> FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
> +                     rxq = skb->hash;
> +             }
> +     } else {
> +             rxq = skb_get_hash(skb);
> +     }
> +
>       /* Check if we can use flow to select a queue */
> -     rxq = skb_get_hash(skb);
>       if (rxq) {
> +             tun_vnet_hash_report(vnet_hash, skb, &keys_basic, rxq, 
> tap_add_hash);
>               queue = rcu_dereference(tap->taps[rxq % numvtaps]);
>               goto out;
>       }
> @@ -711,11 +762,12 @@ static ssize_t tap_put_user(struct tap_queue *q,
>       int total;
>  
>       if (q->flags & IFF_VNET_HDR) {
> -             struct virtio_net_hdr vnet_hdr;
> +             struct virtio_net_hdr_v1_hash vnet_hdr;
>  
>               vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz);
>  
> -             ret = tun_vnet_hdr_from_skb(q->flags, NULL, skb, &vnet_hdr);
> +             ret = tun_vnet_hdr_from_skb(vnet_hdr_len, q->flags, NULL, skb,
> +                                         tap_find_hash, &vnet_hdr);
>               if (ret)
>                       return ret;
>  
> @@ -992,6 +1044,16 @@ static long tap_ioctl(struct file *file, unsigned int 
> cmd,
>               rtnl_unlock();
>               return ret;
>  
> +     case TUNGETVNETHASHCAP:
> +             return tun_vnet_ioctl_gethashcap(argp);
> +
> +     case TUNSETVNETHASH:
> +             rtnl_lock();
> +             tap = rtnl_dereference(q->tap);
> +             ret = tap ? tun_vnet_ioctl_sethash(&tap->vnet_hash, true, argp) 
> : -EBADFD;
> +             rtnl_unlock();
> +             return ret;
> +
>       case SIOCGIFHWADDR:
>               rtnl_lock();
>               tap = tap_get_tap_dev(q);
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index 
> d8f4d3e996a7a81d1f8b04635054081671a14f07..520013df416e93d3a50b46be9b53ae9ab410eab4
>  100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -209,6 +209,7 @@ struct tun_struct {
>       struct bpf_prog __rcu *xdp_prog;
>       struct tun_prog __rcu *steering_prog;
>       struct tun_prog __rcu *filter_prog;
> +     struct tun_vnet_hash_container __rcu *vnet_hash;
>       struct ethtool_link_ksettings link_ksettings;
>       /* init args */
>       struct file *file;
> @@ -451,20 +452,37 @@ static inline void tun_flow_save_rps_rxhash(struct 
> tun_flow_entry *e, u32 hash)
>               e->rps_rxhash = hash;
>  }
>  
> +static struct virtio_net_hash *tun_add_hash(struct sk_buff *skb)
> +{
> +     return skb_ext_add(skb, SKB_EXT_TUN_VNET_HASH);
> +}
> +
> +static const struct virtio_net_hash *tun_find_hash(const struct sk_buff *skb)
> +{
> +     return skb_ext_find(skb, SKB_EXT_TUN_VNET_HASH);
> +}
> +
>  /* We try to identify a flow through its rxhash. The reason that
>   * we do not check rxq no. is because some cards(e.g 82599), chooses
>   * the rxq based on the txq where the last packet of the flow comes. As
>   * the userspace application move between processors, we may get a
>   * different rxq no. here.
>   */
> -static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff 
> *skb)
> +static u16 tun_automq_select_queue(struct tun_struct *tun,
> +                                const struct tun_vnet_hash_container 
> *vnet_hash,
> +                                struct sk_buff *skb)
>  {
> +     struct flow_keys keys;
> +     struct flow_keys_basic keys_basic;
>       struct tun_flow_entry *e;
>       u32 txq, numqueues;
>  
>       numqueues = READ_ONCE(tun->numqueues);
>  
> -     txq = __skb_get_hash_symmetric(skb);
> +     memset(&keys, 0, sizeof(keys));
> +     skb_flow_dissect(skb, &flow_keys_dissector_symmetric, &keys, 0);
> +
> +     txq = flow_hash_from_keys(&keys);
>       e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq);
>       if (e) {
>               tun_flow_save_rps_rxhash(e, txq);
> @@ -473,6 +491,13 @@ static u16 tun_automq_select_queue(struct tun_struct 
> *tun, struct sk_buff *skb)
>               txq = reciprocal_scale(txq, numqueues);
>       }
>  
> +     keys_basic = (struct flow_keys_basic) {
> +             .control = keys.control,
> +             .basic = keys.basic
> +     };
> +     tun_vnet_hash_report(vnet_hash, skb, &keys_basic, skb->l4_hash ? 
> skb->hash : txq,
> +                          tun_add_hash);
> +
>       return txq;
>  }
>  
> @@ -500,10 +525,17 @@ static u16 tun_select_queue(struct net_device *dev, 
> struct sk_buff *skb,
>       u16 ret;
>  
>       rcu_read_lock();
> -     if (rcu_dereference(tun->steering_prog))
> +     if (rcu_dereference(tun->steering_prog)) {
>               ret = tun_ebpf_select_queue(tun, skb);
> -     else
> -             ret = tun_automq_select_queue(tun, skb);
> +     } else {
> +             struct tun_vnet_hash_container *vnet_hash = 
> rcu_dereference(tun->vnet_hash);
> +
> +             if (vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_RSS))
> +                     ret = 
> tun_vnet_rss_select_queue(READ_ONCE(tun->numqueues), vnet_hash,
> +                                                     skb, tun_add_hash);
> +             else
> +                     ret = tun_automq_select_queue(tun, vnet_hash, skb);
> +     }
>       rcu_read_unlock();
>  
>       return ret;
> @@ -1987,7 +2019,7 @@ static ssize_t tun_put_user_xdp(struct tun_struct *tun,
>       ssize_t ret;
>  
>       if (tun->flags & IFF_VNET_HDR) {
> -             struct virtio_net_hdr gso = { 0 };
> +             struct virtio_net_hdr_v1_hash gso = { 0 };
>  
>               vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz);
>               ret = tun_vnet_hdr_put(vnet_hdr_sz, iter, &gso);
> @@ -2040,9 +2072,10 @@ static ssize_t tun_put_user(struct tun_struct *tun,
>       }
>  
>       if (vnet_hdr_sz) {
> -             struct virtio_net_hdr gso;
> +             struct virtio_net_hdr_v1_hash gso;
>  
> -             ret = tun_vnet_hdr_from_skb(tun->flags, tun->dev, skb, &gso);
> +             ret = tun_vnet_hdr_from_skb(vnet_hdr_sz, tun->flags, tun->dev,
> +                                         skb, tun_find_hash, &gso);
>               if (ret)
>                       return ret;
>  
> @@ -2223,6 +2256,7 @@ static void tun_free_netdev(struct net_device *dev)
>       security_tun_dev_free_security(tun->security);
>       __tun_set_ebpf(tun, &tun->steering_prog, NULL);
>       __tun_set_ebpf(tun, &tun->filter_prog, NULL);
> +     kfree_rcu_mightsleep(rcu_access_pointer(tun->vnet_hash));
>  }
>  
>  static void tun_setup(struct net_device *dev)
> @@ -2921,13 +2955,9 @@ static int tun_set_queue(struct file *file, struct 
> ifreq *ifr)
>  }
>  
>  static int tun_set_ebpf(struct tun_struct *tun, struct tun_prog __rcu 
> **prog_p,
> -                     void __user *data)
> +                     int fd)
>  {
>       struct bpf_prog *prog;
> -     int fd;
> -
> -     if (copy_from_user(&fd, data, sizeof(fd)))
> -             return -EFAULT;
>  
>       if (fd == -1) {
>               prog = NULL;
> @@ -2993,7 +3023,9 @@ static long __tun_chr_ioctl(struct file *file, unsigned 
> int cmd,
>       int ifindex;
>       int sndbuf;
>       int ret;
> +     int fd;
>       bool do_notify = false;
> +     struct tun_vnet_hash_container *vnet_hash;
>  
>       if (cmd == TUNSETIFF || cmd == TUNSETQUEUE ||
>           (_IOC_TYPE(cmd) == SOCK_IOC_TYPE && cmd != SIOCGSKNS)) {
> @@ -3020,7 +3052,8 @@ static long __tun_chr_ioctl(struct file *file, unsigned 
> int cmd,
>       rtnl_lock();
>  
>       tun = tun_get(tfile);
> -     if (cmd == TUNSETIFF) {
> +     switch (cmd) {
> +     case TUNSETIFF:
>               ret = -EEXIST;
>               if (tun)
>                       goto unlock;
> @@ -3035,8 +3068,8 @@ static long __tun_chr_ioctl(struct file *file, unsigned 
> int cmd,
>               if (copy_to_user(argp, &ifr, ifreq_len))
>                       ret = -EFAULT;
>               goto unlock;
> -     }
> -     if (cmd == TUNSETIFINDEX) {
> +
> +     case TUNSETIFINDEX:
>               ret = -EPERM;
>               if (tun)
>                       goto unlock;
> @@ -3050,6 +3083,10 @@ static long __tun_chr_ioctl(struct file *file, 
> unsigned int cmd,
>               ret = 0;
>               tfile->ifindex = ifindex;
>               goto unlock;
> +
> +     case TUNGETVNETHASHCAP:
> +             ret = tun_vnet_ioctl_gethashcap(argp);
> +             goto unlock;
>       }
>  
>       ret = -EBADFD;
> @@ -3230,11 +3267,27 @@ static long __tun_chr_ioctl(struct file *file, 
> unsigned int cmd,
>               break;
>  
>       case TUNSETSTEERINGEBPF:
> -             ret = tun_set_ebpf(tun, &tun->steering_prog, argp);
> +             if (get_user(fd, (int __user *)argp)) {
> +                     ret = -EFAULT;
> +                     break;
> +             }
> +
> +             vnet_hash = rtnl_dereference(tun->vnet_hash);
> +             if (fd != -1 && vnet_hash && (vnet_hash->common.flags & 
> TUN_VNET_HASH_RSS)) {
> +                     ret = -EBUSY;
> +                     break;
> +             }
> +
> +             ret = tun_set_ebpf(tun, &tun->steering_prog, fd);
>               break;
>  
>       case TUNSETFILTEREBPF:
> -             ret = tun_set_ebpf(tun, &tun->filter_prog, argp);
> +             if (get_user(fd, (int __user *)argp)) {
> +                     ret = -EFAULT;
> +                     break;
> +             }
> +
> +             ret = tun_set_ebpf(tun, &tun->filter_prog, fd);
>               break;
>  
>       case TUNSETCARRIER:
> @@ -3252,8 +3305,15 @@ static long __tun_chr_ioctl(struct file *file, 
> unsigned int cmd,
>               ret = open_related_ns(&net->ns, get_net_ns);
>               break;
>  
> +     case TUNSETVNETHASH:
> +             ret = tun_vnet_ioctl_sethash(&tun->vnet_hash,
> +                                          
> !rtnl_dereference(tun->steering_prog),
> +                                          argp);
> +             break;
> +
>       default:
> -             ret = tun_vnet_ioctl(&tun->vnet_hdr_sz, &tun->flags, cmd, argp);
> +             ret = tun_vnet_ioctl(&tun->vnet_hdr_sz, &tun->flags,
> +                                  cmd, argp);

no need to touch this

>               break;
>       }
>  
> diff --git a/drivers/net/tun_vnet.h b/drivers/net/tun_vnet.h
> index 
> 58b9ac7a5fc4084c789fe94fe36b5f8631bf1fa4..8e7d51fb0b4742cef56e7c5ad778b156cc654bed
>  100644
> --- a/drivers/net/tun_vnet.h
> +++ b/drivers/net/tun_vnet.h
> @@ -6,6 +6,16 @@
>  #define TUN_VNET_LE     0x80000000
>  #define TUN_VNET_BE     0x40000000
>  
> +typedef struct virtio_net_hash *(*tun_vnet_hash_add)(struct sk_buff *);
> +typedef const struct virtio_net_hash *(*tun_vnet_hash_find)(const struct 
> sk_buff *);
> +
> +struct tun_vnet_hash_container {
> +     struct tun_vnet_hash common;
> +     struct tun_vnet_hash_rss rss;
> +     u32 rss_key[VIRTIO_NET_RSS_MAX_KEY_SIZE];
> +     u16 rss_indirection_table[];
> +};
> +
>  static inline bool tun_vnet_legacy_is_little_endian(unsigned int flags)
>  {
>       bool be = IS_ENABLED(CONFIG_TUN_VNET_CROSS_LE) &&
> @@ -107,6 +117,123 @@ static inline long tun_vnet_ioctl(int *vnet_hdr_sz, 
> unsigned int *flags,
>       }
>  }
>  
> +static inline long tun_vnet_ioctl_gethashcap(void __user *argp)
> +{
> +     static const struct tun_vnet_hash cap = {
> +             .flags = TUN_VNET_HASH_REPORT | TUN_VNET_HASH_RSS,
> +             .types = VIRTIO_NET_SUPPORTED_HASH_TYPES
> +     };
> +
> +     return copy_to_user(argp, &cap, sizeof(cap)) ? -EFAULT : 0;
> +}
> +
> +static inline long tun_vnet_ioctl_sethash(struct tun_vnet_hash_container 
> __rcu **hashp,
> +                                       bool can_rss, void __user *argp)
> +{
> +     struct tun_vnet_hash hash_buf;
> +     struct tun_vnet_hash_container *hash;
> +
> +     if (copy_from_user(&hash_buf, argp, sizeof(hash_buf)))
> +             return -EFAULT;
> +     argp = (struct tun_vnet_hash __user *)argp + 1;
> +
> +     if (hash_buf.flags & TUN_VNET_HASH_RSS) {
> +             struct tun_vnet_hash_rss rss;
> +             size_t indirection_table_size;
> +             size_t key_size;
> +             size_t size;
> +
> +             if (!can_rss)
> +                     return -EBUSY;
> +
> +             if (copy_from_user(&rss, argp, sizeof(rss)))
> +                     return -EFAULT;
> +             argp = (struct tun_vnet_hash_rss __user *)argp + 1;
> +
> +             indirection_table_size = ((size_t)rss.indirection_table_mask + 
> 1) * 2;
> +             key_size = virtio_net_hash_key_length(hash_buf.types);
> +             size = struct_size(hash, rss_indirection_table,
> +                                (size_t)rss.indirection_table_mask + 1);
> +
> +             hash = kmalloc(size, GFP_KERNEL);
> +             if (!hash)
> +                     return -ENOMEM;
> +
> +             if (copy_from_user(hash->rss_indirection_table,
> +                                argp, indirection_table_size)) {
> +                     kfree(hash);
> +                     return -EFAULT;
> +             }
> +             argp = (u16 __user *)argp + rss.indirection_table_mask + 1;
> +
> +             if (copy_from_user(hash->rss_key, argp, key_size)) {
> +                     kfree(hash);
> +                     return -EFAULT;
> +             }
> +
> +             virtio_net_toeplitz_convert_key(hash->rss_key, key_size);
> +             hash->rss = rss;
> +     } else {
> +             hash = kmalloc(sizeof(hash->common), GFP_KERNEL);
> +             if (!hash)
> +                     return -ENOMEM;
> +     }
> +
> +     hash->common = hash_buf;
> +     kfree_rcu_mightsleep(rcu_replace_pointer_rtnl(*hashp, hash));
> +     return 0;
> +}
> +
> +static void tun_vnet_hash_report(const struct tun_vnet_hash_container *hash,
> +                              struct sk_buff *skb,
> +                              const struct flow_keys_basic *keys,
> +                              u32 value,
> +                              tun_vnet_hash_add vnet_hash_add)
> +{
> +     struct virtio_net_hash *report;
> +
> +     if (!hash || !(hash->common.flags & TUN_VNET_HASH_REPORT))
> +             return;
> +
> +     report = vnet_hash_add(skb);
> +     if (!report)
> +             return;
> +
> +     *report = (struct virtio_net_hash) {
> +             .report = virtio_net_hash_report(hash->common.types, keys),
> +             .value = value
> +     };
> +}
> +
> +static u16 tun_vnet_rss_select_queue(u32 numqueues,
> +                                  const struct tun_vnet_hash_container *hash,
> +                                  struct sk_buff *skb,
> +                                  tun_vnet_hash_add vnet_hash_add)
> +{
> +     struct virtio_net_hash *report;
> +     struct virtio_net_hash ret;
> +     u16 txq, index;
> +
> +     if (!numqueues)
> +             return 0;
> +
> +     virtio_net_hash_rss(skb, hash->common.types, hash->rss_key, &ret);
> +
> +     if (!ret.report)
> +             return hash->rss.unclassified_queue % numqueues;
> +
> +     if (hash->common.flags & TUN_VNET_HASH_REPORT) {
> +             report = vnet_hash_add(skb);
> +             if (report)
> +                     *report = ret;
> +     }
> +
> +     index = ret.value & hash->rss.indirection_table_mask;
> +     txq = READ_ONCE(hash->rss_indirection_table[index]);
> +
> +     return txq % numqueues;
> +}
> +
>  static inline int tun_vnet_hdr_get(int sz, unsigned int flags,
>                                  struct iov_iter *from,
>                                  struct virtio_net_hdr *hdr)
> @@ -135,15 +262,17 @@ static inline int tun_vnet_hdr_get(int sz, unsigned int 
> flags,
>  }
>  
>  static inline int tun_vnet_hdr_put(int sz, struct iov_iter *iter,
> -                                const struct virtio_net_hdr *hdr)
> +                                const struct virtio_net_hdr_v1_hash *hdr)
>  {
> +     int content_sz = MIN(sizeof(*hdr), sz);
> +
>       if (unlikely(iov_iter_count(iter) < sz))
>               return -EINVAL;
>  
> -     if (unlikely(copy_to_iter(hdr, sizeof(*hdr), iter) != sizeof(*hdr)))
> +     if (unlikely(copy_to_iter(hdr, content_sz, iter) != content_sz))
>               return -EFAULT;
>  
> -     if (iov_iter_zero(sz - sizeof(*hdr), iter) != sz - sizeof(*hdr))
> +     if (iov_iter_zero(sz - content_sz, iter) != sz - content_sz)
>               return -EFAULT;
>  
>       return 0;
> @@ -155,26 +284,38 @@ static inline int tun_vnet_hdr_to_skb(unsigned int 
> flags, struct sk_buff *skb,
>       return virtio_net_hdr_to_skb(skb, hdr, 
> tun_vnet_is_little_endian(flags));
>  }
>  
> -static inline int tun_vnet_hdr_from_skb(unsigned int flags,
> +static inline int tun_vnet_hdr_from_skb(int sz, unsigned int flags,
>                                       const struct net_device *dev,
>                                       const struct sk_buff *skb,
> -                                     struct virtio_net_hdr *hdr)
> +                                     tun_vnet_hash_find vnet_hash_find,
> +                                     struct virtio_net_hdr_v1_hash *hdr)
>  {
>       int vlan_hlen = skb_vlan_tag_present(skb) ? VLAN_HLEN : 0;
> +     const struct virtio_net_hash *report = sz < sizeof(struct 
> virtio_net_hdr_v1_hash) ?
> +                                            NULL : vnet_hash_find(skb);
> +
> +     *hdr = (struct virtio_net_hdr_v1_hash) {
> +             .hash_report = VIRTIO_NET_HASH_REPORT_NONE
> +     };
> +
> +     if (report) {
> +             hdr->hash_value = cpu_to_le32(report->value);
> +             hdr->hash_report = cpu_to_le16(report->report);
> +     }
>  
> -     if (virtio_net_hdr_from_skb(skb, hdr,
> +     if (virtio_net_hdr_from_skb(skb, (struct virtio_net_hdr *)hdr,
>                                   tun_vnet_is_little_endian(flags), true,
>                                   vlan_hlen)) {
>               struct skb_shared_info *sinfo = skb_shinfo(skb);
>  
>               if (net_ratelimit()) {
>                       netdev_err(dev, "unexpected GSO type: 0x%x, gso_size 
> %d, hdr_len %d\n",
> -                                sinfo->gso_type, tun_vnet16_to_cpu(flags, 
> hdr->gso_size),
> -                                tun_vnet16_to_cpu(flags, hdr->hdr_len));
> +                                sinfo->gso_type, tun_vnet16_to_cpu(flags, 
> hdr->hdr.gso_size),
> +                                tun_vnet16_to_cpu(flags, hdr->hdr.hdr_len));
>                       print_hex_dump(KERN_ERR, "tun: ",
>                                      DUMP_PREFIX_NONE,
>                                      16, 1, skb->head,
> -                                    min(tun_vnet16_to_cpu(flags, 
> hdr->hdr_len), 64), true);
> +                                    min(tun_vnet16_to_cpu(flags, 
> hdr->hdr.hdr_len), 64), true);
>               }
>               WARN_ON_ONCE(1);
>               return -EINVAL;
> diff --git a/include/linux/if_tap.h b/include/linux/if_tap.h
> index 
> 553552fa635c3e1e53d1a63c203d32e4c4fd5a4f..7334c46a3f101675a0d4e5a036987cfe18842f9f
>  100644
> --- a/include/linux/if_tap.h
> +++ b/include/linux/if_tap.h
> @@ -31,6 +31,7 @@ static inline struct ptr_ring *tap_get_ptr_ring(struct file 
> *f)
>  #define MAX_TAP_QUEUES 256
>  
>  struct tap_queue;
> +struct tun_vnet_hash_container;
>  
>  struct tap_dev {
>       struct net_device       *dev;
> @@ -43,6 +44,7 @@ struct tap_dev {
>       int                     numqueues;
>       netdev_features_t       tap_features;
>       int                     minor;
> +     struct tun_vnet_hash_container __rcu *vnet_hash;
>  
>       void (*update_features)(struct tap_dev *tap, netdev_features_t 
> features);
>       void (*count_tx_dropped)(struct tap_dev *tap);
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index 
> bb2b751d274acff931281a72e8b4b0c699b4e8af..cdd793f1c360ad5f63fcc4cbf67d845f5e2ccf6f
>  100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -4842,6 +4842,9 @@ enum skb_ext_id {
>  #endif
>  #if IS_ENABLED(CONFIG_MCTP_FLOWS)
>       SKB_EXT_MCTP,
> +#endif
> +#if IS_ENABLED(CONFIG_TUN)
> +     SKB_EXT_TUN_VNET_HASH,
>  #endif
>       SKB_EXT_NUM, /* must be last */
>  };
> diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h
> index 
> 287cdc81c9390c289a30545aa7ed23d81c3329d3..4887f97500a870c7ef3c96a5837b2d0a5a225040
>  100644
> --- a/include/uapi/linux/if_tun.h
> +++ b/include/uapi/linux/if_tun.h
> @@ -62,6 +62,42 @@
>  #define TUNSETCARRIER _IOW('T', 226, int)
>  #define TUNGETDEVNETNS _IO('T', 227)
>  
> +/**
> + * define TUNGETVNETHASHCAP - ioctl to get virtio_net hashing capability.
> + *
> + * The argument is a pointer to &struct tun_vnet_hash which will store the
> + * maximal virtio_net hashing configuration.
> + */
> +#define TUNGETVNETHASHCAP _IOR('T', 228, struct tun_vnet_hash)
> +
> +/**
> + * define TUNSETVNETHASH - ioctl to configure virtio_net hashing
> + *
> + * The argument is a pointer to &struct tun_vnet_hash.
> + *
> + * The argument is a pointer to the compound of the following in order if
> + * %TUN_VNET_HASH_RSS is set:
> + *
> + * 1. &struct tun_vnet_hash
> + * 2. &struct tun_vnet_hash_rss
> + * 3. Indirection table
> + * 4. Key
> + *
> + * The %TUN_VNET_HASH_REPORT flag set with this ioctl will be effective only
> + * after calling the %TUNSETVNETHDRSZ ioctl with a number greater than or 
> equal
> + * to the size of &struct virtio_net_hdr_v1_hash.
> + *
> + * The members added to the legacy header by %TUN_VNET_HASH_REPORT flag will
> + * always be little-endian.
> + *
> + * This ioctl results in %EBADFD if the underlying device is deleted. It 
> affects
> + * all queues attached to the same device.
> + *
> + * This ioctl currently has no effect on XDP packets and packets with
> + * queue_mapping set by TC.
> + */
> +#define TUNSETVNETHASH _IOW('T', 229, struct tun_vnet_hash)
> +
>  /* TUNSETIFF ifr flags */
>  #define IFF_TUN              0x0001
>  #define IFF_TAP              0x0002
> @@ -115,4 +151,43 @@ struct tun_filter {
>       __u8   addr[][ETH_ALEN];
>  };
>  
> +/**
> + * define TUN_VNET_HASH_REPORT - Request virtio_net hash reporting for vhost
> + */
> +#define TUN_VNET_HASH_REPORT 0x0001
> +
> +/**
> + * define TUN_VNET_HASH_RSS - Request virtio_net RSS
> + *
> + * This is mutually exclusive with eBPF steering program.
> + */
> +#define TUN_VNET_HASH_RSS    0x0002
> +
> +/**
> + * struct tun_vnet_hash - virtio_net hashing configuration
> + * @flags:
> + *           Bitmask consists of %TUN_VNET_HASH_REPORT and %TUN_VNET_HASH_RSS
> + * @pad:
> + *           Should be filled with zero before passing to %TUNSETVNETHASH
> + * @types:
> + *           Bitmask of allowed hash types
> + */
> +struct tun_vnet_hash {
> +     __u16 flags;
> +     __u8 pad[2];
> +     __u32 types;
> +};
> +
> +/**
> + * struct tun_vnet_hash_rss - virtio_net RSS configuration
> + * @indirection_table_mask:
> + *           Bitmask to be applied to the indirection table index
> + * @unclassified_queue:
> + *           The index of the queue to place unclassified packets in
> + */
> +struct tun_vnet_hash_rss {
> +     __u16 indirection_table_mask;
> +     __u16 unclassified_queue;
> +};
> +
>  #endif /* _UAPI__IF_TUN_H */
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index 
> 7b03b64fdcb276f68ce881d1d8da8e4c6b897efc..aa2a091b649f0c9d6e0196f34f345ba78b5498fb
>  100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -64,6 +64,7 @@
>  #include <linux/mpls.h>
>  #include <linux/kcov.h>
>  #include <linux/iov_iter.h>
> +#include <linux/virtio_net.h>
>  
>  #include <net/protocol.h>
>  #include <net/dst.h>
> @@ -4969,6 +4970,9 @@ static const u8 skb_ext_type_len[] = {
>  #if IS_ENABLED(CONFIG_MCTP_FLOWS)
>       [SKB_EXT_MCTP] = SKB_EXT_CHUNKSIZEOF(struct mctp_flow),
>  #endif
> +#if IS_ENABLED(CONFIG_TUN)
> +     [SKB_EXT_TUN_VNET_HASH] = SKB_EXT_CHUNKSIZEOF(struct virtio_net_hash),
> +#endif
>  };
>  
>  static __always_inline unsigned int skb_ext_total_length(void)
> 
> -- 
> 2.48.1
>

Re: [PATCH net-next v9 3/6] tun: Introduce virtio-net hash feature

Reply via email to