Akihiko Odaki wrote:
> RSS is a receive steering algorithm that can be negotiated to use with
> virtio_net. Conventionally the hash calculation was done by the VMM.
> However, computing the hash after the queue was chosen defeats the
> purpose of RSS.
> 
> Another approach is to use eBPF steering program. This approach has
> another downside: it cannot report the calculated hash due to the
> restrictive nature of eBPF steering program.
> 
> Introduce the code to perform RSS to the kernel in order to overcome
> thse challenges. An alternative solution is to extend the eBPF steering
> program so that it will be able to report to the userspace, but I didn't
> opt for it because extending the current mechanism of eBPF steering
> program as is because it relies on legacy context rewriting, and
> introducing kfunc-based eBPF will result in non-UAPI dependency while
> the other relevant virtualization APIs such as KVM and vhost_net are
> UAPIs.
> 
> Signed-off-by: Akihiko Odaki <akihiko.od...@daynix.com>
> ---
>  drivers/net/tun.c           | 119 
> +++++++++++++++++++++++++++++++++++++++-----
>  include/uapi/linux/if_tun.h |  27 ++++++++++
>  2 files changed, 133 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index b8fcd71becac..5a429b391144 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -175,6 +175,9 @@ struct tun_prog {
>  
>  struct tun_vnet_hash_container {
>       struct tun_vnet_hash common;
> +     struct tun_vnet_hash_rss rss;
> +     __be32 rss_key[VIRTIO_NET_RSS_MAX_KEY_SIZE];
> +     u16 rss_indirection_table[];
>  };
>  
>  /* Since the socket were moved to tun_file, to preserve the behavior of 
> persist
> @@ -227,7 +230,7 @@ struct veth {
>  };
>  
>  static const struct tun_vnet_hash tun_vnet_hash_cap = {
> -     .flags = TUN_VNET_HASH_REPORT,
> +     .flags = TUN_VNET_HASH_REPORT | TUN_VNET_HASH_RSS,
>       .types = VIRTIO_NET_SUPPORTED_HASH_TYPES
>  };
>  
> @@ -591,6 +594,36 @@ static u16 tun_ebpf_select_queue(struct tun_struct *tun, 
> struct sk_buff *skb)
>       return ret % numqueues;
>  }
>  
> +static u16 tun_vnet_rss_select_queue(struct tun_struct *tun,
> +                                  struct sk_buff *skb,
> +                                  const struct tun_vnet_hash_container 
> *vnet_hash)
> +{
> +     struct tun_vnet_hash_ext *ext;
> +     struct virtio_net_hash hash;
> +     u32 numqueues = READ_ONCE(tun->numqueues);
> +     u16 txq, index;
> +
> +     if (!numqueues)
> +             return 0;
> +
> +     if (!virtio_net_hash_rss(skb, vnet_hash->common.types, 
> vnet_hash->rss_key,
> +                              &hash))
> +             return vnet_hash->rss.unclassified_queue % numqueues;
> +
> +     if (vnet_hash->common.flags & TUN_VNET_HASH_REPORT) {
> +             ext = skb_ext_add(skb, SKB_EXT_TUN_VNET_HASH);
> +             if (ext) {
> +                     ext->value = hash.value;
> +                     ext->report = hash.report;
> +             }
> +     }
> +
> +     index = hash.value & vnet_hash->rss.indirection_table_mask;
> +     txq = READ_ONCE(vnet_hash->rss_indirection_table[index]);
> +
> +     return txq % numqueues;
> +}
> +
>  static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb,
>                           struct net_device *sb_dev)
>  {
> @@ -603,7 +636,10 @@ static u16 tun_select_queue(struct net_device *dev, 
> struct sk_buff *skb,
>       } else {
>               struct tun_vnet_hash_container *vnet_hash = 
> rcu_dereference(tun->vnet_hash);
>  
> -             ret = tun_automq_select_queue(tun, skb, vnet_hash);
> +             if (vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_RSS))
> +                     ret = tun_vnet_rss_select_queue(tun, skb, vnet_hash);
> +             else
> +                     ret = tun_automq_select_queue(tun, skb, vnet_hash);
>       }
>       rcu_read_unlock();
>  
> @@ -3085,13 +3121,9 @@ static int tun_set_queue(struct file *file, struct 
> ifreq *ifr)
>  }
>  
>  static int tun_set_ebpf(struct tun_struct *tun, struct tun_prog __rcu 
> **prog_p,
> -                     void __user *data)
> +                     int fd)
>  {
>       struct bpf_prog *prog;
> -     int fd;
> -
> -     if (copy_from_user(&fd, data, sizeof(fd)))
> -             return -EFAULT;
>  
>       if (fd == -1) {
>               prog = NULL;
> @@ -3157,6 +3189,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned 
> int cmd,
>       int ifindex;
>       int sndbuf;
>       int vnet_hdr_sz;
> +     int fd;
>       int le;
>       int ret;
>       bool do_notify = false;
> @@ -3460,11 +3493,27 @@ static long __tun_chr_ioctl(struct file *file, 
> unsigned int cmd,
>               break;
>  
>       case TUNSETSTEERINGEBPF:
> -             ret = tun_set_ebpf(tun, &tun->steering_prog, argp);
> +             if (get_user(fd, (int __user *)argp)) {
> +                     ret = -EFAULT;
> +                     break;
> +             }
> +
> +             vnet_hash = rtnl_dereference(tun->vnet_hash);
> +             if (fd != -1 && vnet_hash && (vnet_hash->common.flags & 
> TUN_VNET_HASH_RSS)) {
> +                     ret = -EBUSY;
> +                     break;
> +             }
> +
> +             ret = tun_set_ebpf(tun, &tun->steering_prog, fd);
>               break;
>  
>       case TUNSETFILTEREBPF:
> -             ret = tun_set_ebpf(tun, &tun->filter_prog, argp);
> +             if (get_user(fd, (int __user *)argp)) {
> +                     ret = -EFAULT;
> +                     break;
> +             }
> +
> +             ret = tun_set_ebpf(tun, &tun->filter_prog, fd);
>               break;
>  
>       case TUNSETCARRIER:
> @@ -3496,10 +3545,54 @@ static long __tun_chr_ioctl(struct file *file, 
> unsigned int cmd,
>                       break;
>               }
>  
> -             vnet_hash = kmalloc(sizeof(vnet_hash->common), GFP_KERNEL);
> -             if (!vnet_hash) {
> -                     ret = -ENOMEM;
> -                     break;
> +             if (vnet_hash_common.flags & TUN_VNET_HASH_RSS) {
> +                     struct tun_vnet_hash_rss rss;
> +                     size_t indirection_table_size;
> +                     size_t key_size;
> +                     size_t size;
> +
> +                     if (tun->steering_prog) {
> +                             ret = -EBUSY;
> +                             break;
> +                     }
> +
> +                     if (copy_from_user(&rss, argp, sizeof(rss))) {
> +                             ret = -EFAULT;
> +                             break;
> +                     }
> +                     argp = (struct tun_vnet_hash_rss __user *)argp + 1;
> +
> +                     indirection_table_size = 
> ((size_t)rss.indirection_table_mask + 1) * 2;

Why make uapi a mask rather than a length?

Also is there a upper length bounds sanity check for this input from
userspace?

> +                     key_size = 
> virtio_net_hash_key_length(vnet_hash_common.types);
> +                     size = sizeof(*vnet_hash) + indirection_table_size + 
> key_size;

key_size is included in sizeof(*vnet_hash), always
VIRTIO_NET_RSS_MAX_KEY_SIZE.
> +
> +                     vnet_hash = kmalloc(size, GFP_KERNEL);
> +                     if (!vnet_hash) {
> +                             ret = -ENOMEM;
> +                             break;
> +                     }
> +
> +                     if (copy_from_user(vnet_hash->rss_indirection_table,
> +                                        argp, indirection_table_size)) {
> +                             kfree(vnet_hash);
> +                             ret = -EFAULT;
> +                             break;
> +                     }
> +                     argp = (u16 __user *)argp + rss.indirection_table_mask 
> + 1;
> +
> +                     if (copy_from_user(vnet_hash->rss_key, argp, key_size)) 
> {
> +                             kfree(vnet_hash);
> +                             ret = -EFAULT;
> +                             break;
> +                     }
> +
> +                     vnet_hash->rss = rss;
> +             } else {
> +                     vnet_hash = kmalloc(sizeof(vnet_hash->common), 
> GFP_KERNEL);
> +                     if (!vnet_hash) {
> +                             ret = -ENOMEM;
> +                             break;
> +                     }
>               }
>  
>               vnet_hash->common = vnet_hash_common;
> diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h
> index 1561e8ce0a0a..1c130409db5d 100644
> --- a/include/uapi/linux/if_tun.h
> +++ b/include/uapi/linux/if_tun.h
> @@ -75,6 +75,14 @@
>   *
>   * The argument is a pointer to &struct tun_vnet_hash.
>   *
> + * The argument is a pointer to the compound of the following in order if
> + * %TUN_VNET_HASH_RSS is set:
> + *
> + * 1. &struct tun_vnet_hash
> + * 2. &struct tun_vnet_hash_rss
> + * 3. Indirection table
> + * 4. Key
> + *
>   * %TUNSETVNETHDRSZ ioctl must be called with a number greater than or equal 
> to
>   * the size of &struct virtio_net_hdr_v1_hash before calling this ioctl with
>   * %TUN_VNET_HASH_REPORT.
> @@ -144,6 +152,13 @@ struct tun_filter {
>   */
>  #define TUN_VNET_HASH_REPORT 0x0001
>  
> +/**
> + * define TUN_VNET_HASH_RSS - Request virtio_net RSS
> + *
> + * This is mutually exclusive with eBPF steering program.
> + */
> +#define TUN_VNET_HASH_RSS    0x0002
> +
>  /**
>   * struct tun_vnet_hash - virtio_net hashing configuration
>   * @flags:
> @@ -159,4 +174,16 @@ struct tun_vnet_hash {
>       __u32 types;
>  };
>  
> +/**
> + * struct tun_vnet_hash_rss - virtio_net RSS configuration
> + * @indirection_table_mask:
> + *           Bitmask to be applied to the indirection table index
> + * @unclassified_queue:
> + *           The index of the queue to place unclassified packets in
> + */
> +struct tun_vnet_hash_rss {
> +     __u16 indirection_table_mask;
> +     __u16 unclassified_queue;
> +};
> +
>  #endif /* _UAPI__IF_TUN_H */
> 
> -- 
> 2.46.0
> 



Reply via email to