Re: [PATCH RFC v3 7/9] tun: Introduce virtio-net RSS
Akihiko Odaki wrote: > RSS is a receive steering algorithm that can be negotiated to use with > virtio_net. Conventionally the hash calculation was done by the VMM. > However, computing the hash after the queue was chosen defeats the > purpose of RSS. > > Another approach is to use eBPF steering program. This approach has > another downside: it cannot report the calculated hash due to the > restrictive nature of eBPF steering program. > > Introduce the code to perform RSS to the kernel in order to overcome > thse challenges. An alternative solution is to extend the eBPF steering > program so that it will be able to report to the userspace, but I didn't > opt for it because extending the current mechanism of eBPF steering > program as is because it relies on legacy context rewriting, and > introducing kfunc-based eBPF will result in non-UAPI dependency while > the other relevant virtualization APIs such as KVM and vhost_net are > UAPIs. > > Signed-off-by: Akihiko Odaki > --- > drivers/net/tun.c | 119 > +++- > include/uapi/linux/if_tun.h | 27 ++ > 2 files changed, 133 insertions(+), 13 deletions(-) > > diff --git a/drivers/net/tun.c b/drivers/net/tun.c > index b8fcd71becac..5a429b391144 100644 > --- a/drivers/net/tun.c > +++ b/drivers/net/tun.c > @@ -175,6 +175,9 @@ struct tun_prog { > > struct tun_vnet_hash_container { > struct tun_vnet_hash common; > + struct tun_vnet_hash_rss rss; > + __be32 rss_key[VIRTIO_NET_RSS_MAX_KEY_SIZE]; > + u16 rss_indirection_table[]; > }; > > /* Since the socket were moved to tun_file, to preserve the behavior of > persist > @@ -227,7 +230,7 @@ struct veth { > }; > > static const struct tun_vnet_hash tun_vnet_hash_cap = { > - .flags = TUN_VNET_HASH_REPORT, > + .flags = TUN_VNET_HASH_REPORT | TUN_VNET_HASH_RSS, > .types = VIRTIO_NET_SUPPORTED_HASH_TYPES > }; > > @@ -591,6 +594,36 @@ static u16 tun_ebpf_select_queue(struct tun_struct *tun, > struct sk_buff *skb) > return ret % numqueues; > } > > +static u16 tun_vnet_rss_select_queue(struct tun_struct *tun, > + struct sk_buff *skb, > + const struct tun_vnet_hash_container > *vnet_hash) > +{ > + struct tun_vnet_hash_ext *ext; > + struct virtio_net_hash hash; > + u32 numqueues = READ_ONCE(tun->numqueues); > + u16 txq, index; > + > + if (!numqueues) > + return 0; > + > + if (!virtio_net_hash_rss(skb, vnet_hash->common.types, > vnet_hash->rss_key, > + &hash)) > + return vnet_hash->rss.unclassified_queue % numqueues; > + > + if (vnet_hash->common.flags & TUN_VNET_HASH_REPORT) { > + ext = skb_ext_add(skb, SKB_EXT_TUN_VNET_HASH); > + if (ext) { > + ext->value = hash.value; > + ext->report = hash.report; > + } > + } > + > + index = hash.value & vnet_hash->rss.indirection_table_mask; > + txq = READ_ONCE(vnet_hash->rss_indirection_table[index]); > + > + return txq % numqueues; > +} > + > static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb, > struct net_device *sb_dev) > { > @@ -603,7 +636,10 @@ static u16 tun_select_queue(struct net_device *dev, > struct sk_buff *skb, > } else { > struct tun_vnet_hash_container *vnet_hash = > rcu_dereference(tun->vnet_hash); > > - ret = tun_automq_select_queue(tun, skb, vnet_hash); > + if (vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_RSS)) > + ret = tun_vnet_rss_select_queue(tun, skb, vnet_hash); > + else > + ret = tun_automq_select_queue(tun, skb, vnet_hash); > } > rcu_read_unlock(); > > @@ -3085,13 +3121,9 @@ static int tun_set_queue(struct file *file, struct > ifreq *ifr) > } > > static int tun_set_ebpf(struct tun_struct *tun, struct tun_prog __rcu > **prog_p, > - void __user *data) > + int fd) > { > struct bpf_prog *prog; > - int fd; > - > - if (copy_from_user(&fd, data, sizeof(fd))) > - return -EFAULT; > > if (fd == -1) { > prog = NULL; > @@ -3157,6 +3189,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned > int cmd, > int ifindex; > int sndbuf; > int vnet_hdr_sz; > + int fd; > int le; > int ret; > bool do_notify = false; > @@ -3460,11 +3493,27 @@ static long __tun_chr_ioctl(struct file *file, > unsigned int cmd, > break; > > case TUNSETSTEERINGEBPF: > - ret = tun_set_ebpf(tun, &tun->steering_prog, argp); > + if (get_user(fd, (int __user *)argp)) { > + ret = -EFAULT; > + break; > + } > + > + vnet_hash = rtnl_dereference(tun-
Re: [PATCH v2 07/25] KVM: VMX: Set intercept for FRED MSRs
MSR_IA32_FRED_SSP0 is an alias of the CET MSR_IA32_PL0_SSP and likely to be used in the same way as FRED RSP0, i.e., host FRED SSP0 _should_ be restored in arch_exit_to_user_mode_prepare(). However as of today Linux has no plan to utilize kernel shadow stack thus no one cares host FRED SSP0 (no?). But lets say anyway it is host's responsibility to manage host FRED SSP0, then KVM only needs to take care of guest FRED SSP0 (just like how KVM should handle guest FRED RSP0) even before the supervisor shadow stack feature is advertised to guest. Heh, I'm not sure what your question is, or if there even is a question. KVM needs to context switch FRED SSP0 if FRED is exposed to the guest, but presumably that will be done through XSAVE state? If that's the long term plan, I would prefer to focus on merging CET virtualization first, and then land FRED virtualization on top so that KVM doesn't have to carry intermediate code to deal with the aliased MSR. You mean the following patch set, right? https://lore.kernel.org/kvm/20240531090331.13713-1-weijiang.y...@intel.com/ I think it's probably better to do so. Otherwise a patch to explicitly save/restore FRED SSP0 would be needed before the CET patch set lands and then reverted immediately after: diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index f828f03d48ab..c790cb7a7d6b 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1343,8 +1343,10 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); - if (cpu_feature_enabled(X86_FEATURE_FRED) && guest_can_use(vcpu, X86_FEATURE_FRED)) + if (cpu_feature_enabled(X86_FEATURE_FRED) && guest_can_use(vcpu, X86_FEATURE_FRED)) { wrmsrns(MSR_IA32_FRED_RSP0, vmx->msr_guest_fred_rsp0); + wrmsrns(MSR_IA32_FRED_SSP0, vmx->msr_guest_fred_ssp0); + } #else savesegment(fs, fs_sel); savesegment(gs, gs_sel); @@ -1393,6 +1395,8 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) if (cpu_feature_enabled(X86_FEATURE_FRED) && guest_can_use(&vmx->vcpu, X86_FEATURE_FRED)) { vmx->msr_guest_fred_rsp0 = read_msr(MSR_IA32_FRED_RSP0); fred_sync_rsp0(vmx->msr_guest_fred_rsp0); + + vmx->msr_guest_fred_ssp0 = read_msr(MSR_IA32_FRED_SSP0); } #endif load_fixmap_gdt(raw_smp_processor_id()); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 9012428984de..d1b9352f56c7 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -282,6 +282,7 @@ struct vcpu_vmx { u64 msr_host_kernel_gs_base; u64 msr_guest_kernel_gs_base; u64 msr_guest_fred_rsp0; + u64 msr_guest_fred_ssp0; #endif u64 spec_ctrl; Ugh, but what happens if a CPU (or the host kernel) supports FRED but not CET SS? Or is that effectively an illegal combination? The FRED Spec says: IA32_FRED_SSP1, IA32_FRED_SSP2, and IA32_FRED_SSP3 (MSR indices 1D1H– 1D3H). Together with the existing MSR IA32_PL0_SSP (MSR index 6A4H), these are the FRED SSP MSRs. The FRED SSP MSRs are supported by any processor that enumerates CPUID.(EAX=7,ECX=1):EAX.FRED[bit 17] as 1. If such a processor does not support CET, FRED transitions will not use the MSRs (because shadow stacks are not enabled), but the MSRs would still be accessible using RDMSR and WRMSR. So they are independent, just that FRED SSP MSRs are NOT used if supervisor shadow stacks are not enabled (obviously Qemu can be configured to not advertise CET but FRED). When FRED is advertised to a guest, KVM should allow FRED SSP MSRs accesses through disabling FRED SSP MSRs interception no matter whether supervisor shadow stacks are enabled or not. Thanks! Xin
Re: [PATCH RFC v3 2/9] virtio_net: Add functions for hashing
Akihiko Odaki wrote: > They are useful to implement VIRTIO_NET_F_RSS and > VIRTIO_NET_F_HASH_REPORT. > > Signed-off-by: Akihiko Odaki > --- > include/linux/virtio_net.h | 198 > + > 1 file changed, 198 insertions(+) > > diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h > index 6c395a2600e8..7ee2e2f2625a 100644 > --- a/include/linux/virtio_net.h > +++ b/include/linux/virtio_net.h > @@ -9,6 +9,183 @@ > #include > #include > > +struct virtio_net_hash { > + u32 value; > + u16 report; > +}; > + > +struct virtio_net_toeplitz_state { > + u32 hash; > + u32 key_buffer; > + const __be32 *key; > +}; > + > +#define VIRTIO_NET_SUPPORTED_HASH_TYPES (VIRTIO_NET_RSS_HASH_TYPE_IPv4 | \ > + VIRTIO_NET_RSS_HASH_TYPE_TCPv4 | \ > + VIRTIO_NET_RSS_HASH_TYPE_UDPv4 | \ > + VIRTIO_NET_RSS_HASH_TYPE_IPv6 | \ > + VIRTIO_NET_RSS_HASH_TYPE_TCPv6 | \ > + VIRTIO_NET_RSS_HASH_TYPE_UDPv6) > + > +#define VIRTIO_NET_RSS_MAX_KEY_SIZE 40 > + > +static inline void virtio_net_toeplitz(struct virtio_net_toeplitz_state > *state, > +const __be32 *input, size_t len) > +{ > + u32 key; > + > + while (len) { > + state->key++; > + key = be32_to_cpu(*state->key); > + > + for (u32 bit = BIT(31); bit; bit >>= 1) { > + if (be32_to_cpu(*input) & bit) > + state->hash ^= state->key_buffer; > + > + state->key_buffer = > + (state->key_buffer << 1) | !!(key & bit); > + } > + > + input++; > + len--; > + } > +} > + > +static inline u8 virtio_net_hash_key_length(u32 types) > +{ > + size_t len = 0; > + > + if (types & VIRTIO_NET_HASH_REPORT_IPv4) > + len = max(len, > + sizeof(struct flow_dissector_key_ipv4_addrs)); > + > + if (types & > + (VIRTIO_NET_HASH_REPORT_TCPv4 | VIRTIO_NET_HASH_REPORT_UDPv4)) > + len = max(len, > + sizeof(struct flow_dissector_key_ipv4_addrs) + > + sizeof(struct flow_dissector_key_ports)); > + > + if (types & VIRTIO_NET_HASH_REPORT_IPv6) > + len = max(len, > + sizeof(struct flow_dissector_key_ipv6_addrs)); > + > + if (types & > + (VIRTIO_NET_HASH_REPORT_TCPv6 | VIRTIO_NET_HASH_REPORT_UDPv6)) > + len = max(len, > + sizeof(struct flow_dissector_key_ipv6_addrs) + > + sizeof(struct flow_dissector_key_ports)); > + > + return 4 + len; Avoid raw constants like this 4. What field does it capture? Instead of working from shortest to longest and using max, how about the inverse and return as soon as a match is found. > +} > + > +static inline u32 virtio_net_hash_report(u32 types, > + struct flow_dissector_key_basic key) > +{ > + switch (key.n_proto) { > + case htons(ETH_P_IP): > + if (key.ip_proto == IPPROTO_TCP && > + (types & VIRTIO_NET_RSS_HASH_TYPE_TCPv4)) > + return VIRTIO_NET_HASH_REPORT_TCPv4; > + > + if (key.ip_proto == IPPROTO_UDP && > + (types & VIRTIO_NET_RSS_HASH_TYPE_UDPv4)) > + return VIRTIO_NET_HASH_REPORT_UDPv4; > + > + if (types & VIRTIO_NET_RSS_HASH_TYPE_IPv4) > + return VIRTIO_NET_HASH_REPORT_IPv4; > + > + return VIRTIO_NET_HASH_REPORT_NONE; > + > + case htons(ETH_P_IPV6): > + if (key.ip_proto == IPPROTO_TCP && > + (types & VIRTIO_NET_RSS_HASH_TYPE_TCPv6)) > + return VIRTIO_NET_HASH_REPORT_TCPv6; > + > + if (key.ip_proto == IPPROTO_UDP && > + (types & VIRTIO_NET_RSS_HASH_TYPE_UDPv6)) > + return VIRTIO_NET_HASH_REPORT_UDPv6; > + > + if (types & VIRTIO_NET_RSS_HASH_TYPE_IPv6) > + return VIRTIO_NET_HASH_REPORT_IPv6; > + > + return VIRTIO_NET_HASH_REPORT_NONE; > + > + default: > + return VIRTIO_NET_HASH_REPORT_NONE; > + } > +} > + > +static inline bool virtio_net_hash_rss(const struct sk_buff *skb, > +u32 types, const __be32 *key, > +struct virtio_net_hash *hash) > +{ > + u16 report; nit: move below the struct declarations. > + struct virtio_net_toeplitz_state toeplitz_state = { > + .key_buffer = be32_to_cpu(*key), > + .key = key > + }; > + struct flow_keys flow; > + > + if (!skb_flow_dissect_flow_keys(skb, &flow, 0)) > + return false; > + > + report = virti
Re: [PATCH RFC v3 4/9] tap: Pad virtio header with zero
Akihiko Odaki wrote: > tap used to simply advance iov_iter when it needs to pad virtio header. > This leaves the garbage in the buffer as is and prevents telling if the > header is padded or contains some real data. > > In theory, a user of tap can fill the buffer with zero before calling > read() to avoid such a problem, but leaving the garbage in the buffer is > awkward anyway so fill the buffer in tap. This description does not describe the need for this operation. The new extension seemingly requires these bytes to be cleared? Please make that explicit. > Signed-off-by: Akihiko Odaki > --- > drivers/net/tap.c | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/drivers/net/tap.c b/drivers/net/tap.c > index 77574f7a3bd4..ba044302ccc6 100644 > --- a/drivers/net/tap.c > +++ b/drivers/net/tap.c > @@ -813,7 +813,7 @@ static ssize_t tap_put_user(struct tap_queue *q, > sizeof(vnet_hdr)) > return -EFAULT; > > - iov_iter_advance(iter, vnet_hdr_len - sizeof(vnet_hdr)); > + iov_iter_zero(vnet_hdr_len - sizeof(vnet_hdr), iter); > } > total = vnet_hdr_len; > total += skb->len; > > -- > 2.46.0 >
Re: [PATCH RFC v3 1/9] skbuff: Introduce SKB_EXT_TUN_VNET_HASH
Akihiko Odaki wrote: > This new extension will be used by tun to carry the hash values and > types to report with virtio-net headers. > > Signed-off-by: Akihiko Odaki > --- > include/linux/skbuff.h | 10 ++ > net/core/skbuff.c | 3 +++ > 2 files changed, 13 insertions(+) > > diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h > index 29c3ea5b6e93..17cee21c 100644 > --- a/include/linux/skbuff.h > +++ b/include/linux/skbuff.h > @@ -334,6 +334,13 @@ struct tc_skb_ext { > }; > #endif > > +#if IS_ENABLED(CONFIG_TUN) > +struct tun_vnet_hash_ext { > + u32 value; > + u16 report; > +}; > +#endif This is unlikely to belong in skbuff.h > + > struct sk_buff_head { > /* These two members must be first to match sk_buff. */ > struct_group_tagged(sk_buff_list, list, > @@ -4718,6 +4725,9 @@ enum skb_ext_id { > #endif > #if IS_ENABLED(CONFIG_MCTP_FLOWS) > SKB_EXT_MCTP, > +#endif > +#if IS_ENABLED(CONFIG_TUN) > + SKB_EXT_TUN_VNET_HASH, > #endif > SKB_EXT_NUM, /* must be last */ > }; > diff --git a/net/core/skbuff.c b/net/core/skbuff.c > index 83f8cd8aa2d1..ce34523fd8de 100644 > --- a/net/core/skbuff.c > +++ b/net/core/skbuff.c > @@ -4979,6 +4979,9 @@ static const u8 skb_ext_type_len[] = { > #if IS_ENABLED(CONFIG_MCTP_FLOWS) > [SKB_EXT_MCTP] = SKB_EXT_CHUNKSIZEOF(struct mctp_flow), > #endif > +#if IS_ENABLED(CONFIG_TUN) > + [SKB_EXT_TUN_VNET_HASH] = SKB_EXT_CHUNKSIZEOF(struct tun_vnet_hash_ext), > +#endif > }; > > static __always_inline unsigned int skb_ext_total_length(void) > > -- > 2.46.0 >
Re: [PATCH RFC v3 6/9] tun: Introduce virtio-net hash reporting feature
Akihiko Odaki wrote: > Allow the guest to reuse the hash value to make receive steering > consistent between the host and guest, and to save hash computation. > > Signed-off-by: Akihiko Odaki > --- > Documentation/networking/tuntap.rst | 7 ++ > drivers/net/Kconfig | 1 + > drivers/net/tun.c | 146 > +++- > include/uapi/linux/if_tun.h | 44 +++ > 4 files changed, 180 insertions(+), 18 deletions(-) > > diff --git a/Documentation/networking/tuntap.rst > b/Documentation/networking/tuntap.rst > index 4d7087f727be..86b4ae8caa8a 100644 > --- a/Documentation/networking/tuntap.rst > +++ b/Documentation/networking/tuntap.rst > @@ -206,6 +206,13 @@ enable is true we enable it, otherwise we disable it:: >return ioctl(fd, TUNSETQUEUE, (void *)&ifr); >} > > +3.4 Reference > +- > + > +``linux/if_tun.h`` defines the interface described below: > + > +.. kernel-doc:: include/uapi/linux/if_tun.h > + > Universal TUN/TAP device driver Frequently Asked Question > = > > diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig > index 9920b3a68ed1..e2a7bd703550 100644 > --- a/drivers/net/Kconfig > +++ b/drivers/net/Kconfig > @@ -395,6 +395,7 @@ config TUN > tristate "Universal TUN/TAP device driver support" > depends on INET > select CRC32 > + select SKB_EXTENSIONS > help > TUN/TAP provides packet reception and transmission for user space > programs. It can be viewed as a simple Point-to-Point or Ethernet > diff --git a/drivers/net/tun.c b/drivers/net/tun.c > index 9d93ab9ee58f..b8fcd71becac 100644 > --- a/drivers/net/tun.c > +++ b/drivers/net/tun.c > @@ -173,6 +173,10 @@ struct tun_prog { > struct bpf_prog *prog; > }; > > +struct tun_vnet_hash_container { > + struct tun_vnet_hash common; > +}; > + > /* Since the socket were moved to tun_file, to preserve the behavior of > persist > * device, socket filter, sndbuf and vnet header size were restore when the > * file were attached to a persist device. > @@ -210,6 +214,7 @@ struct tun_struct { > struct bpf_prog __rcu *xdp_prog; > struct tun_prog __rcu *steering_prog; > struct tun_prog __rcu *filter_prog; > + struct tun_vnet_hash_container __rcu *vnet_hash; This is just +struct tun_vnet_hash { + u32 value; + u16 report; +}; Can just be fields in the struct directly. Also, only one bit really used for report, so probably can be condensed further. > struct ethtool_link_ksettings link_ksettings; > /* init args */ > struct file *file; > @@ -221,6 +226,11 @@ struct veth { > __be16 h_vlan_TCI; > }; > > +static const struct tun_vnet_hash tun_vnet_hash_cap = { > + .flags = TUN_VNET_HASH_REPORT, > + .types = VIRTIO_NET_SUPPORTED_HASH_TYPES > +}; > + > static void tun_flow_init(struct tun_struct *tun); > static void tun_flow_uninit(struct tun_struct *tun); > > @@ -322,10 +332,17 @@ static long tun_set_vnet_be(struct tun_struct *tun, int > __user *argp) > if (get_user(be, argp)) > return -EFAULT; > > - if (be) > + if (be) { > + struct tun_vnet_hash_container *vnet_hash = > rtnl_dereference(tun->vnet_hash); > + > + if (!(tun->flags & TUN_VNET_LE) && > + vnet_hash && (vnet_hash->flags & TUN_VNET_HASH_REPORT)) > + return -EBUSY; > + Doesn't be here imply !tun->flags & TUN_VNET_LE? Same again below. > tun->flags |= TUN_VNET_BE; > - else > + } else { > tun->flags &= ~TUN_VNET_BE; > + } > > return 0; > } > @@ -522,14 +539,20 @@ static inline void tun_flow_save_rps_rxhash(struct > tun_flow_entry *e, u32 hash) > * the userspace application move between processors, we may get a > * different rxq no. here. > */ > -static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff > *skb) > +static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff > *skb, > +const struct tun_vnet_hash_container > *vnet_hash) > { > + struct tun_vnet_hash_ext *ext; > + struct flow_keys keys; > struct tun_flow_entry *e; > u32 txq, numqueues; > > numqueues = READ_ONCE(tun->numqueues); > > - txq = __skb_get_hash_symmetric(skb); > + memset(&keys, 0, sizeof(keys)); > + skb_flow_dissect(skb, &flow_keys_dissector_symmetric, &keys, 0); > + > + txq = flow_hash_from_keys(&keys); > e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq); > if (e) { > tun_flow_save_rps_rxhash(e, txq); > @@ -538,6 +561,16 @@ static u16 tun_automq_select_queue(struct tun_struct > *tun, struct sk_buff *skb) > txq = reciprocal_scale(txq, numqueues); > } > > + if (vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_REPORT)) { > +