On Tue, Dec 5, 2017 at 2:29 AM, Jason Wang <jasow...@redhat.com> wrote:
>
> On 2017-12-05 08:16, Willem de Bruijn wrote:
>>
>> On Mon, Dec 4, 2017 at 4:31 AM, Jason Wang <jasow...@redhat.com> wrote:
>>>
>>> This patch introduces an eBPF based queue selection method. With this,
>>> the policy could be offloaded to userspace completely through a new
>>> ioctl TUNSETSTEERINGEBPF.
>>>
>>> Signed-off-by: Jason Wang <jasow...@redhat.com>
>>> ---
>>> +static u16 tun_ebpf_select_queue(struct tun_struct *tun, struct sk_buff *skb)
>>> +{
>>> +        struct tun_steering_prog *prog;
>>> +        u16 ret = 0;
>>> +
>>> +        prog = rcu_dereference(tun->steering_prog);
>>> +        if (prog)
>>> +                ret = bpf_prog_run_clear_cb(prog->prog, skb);
>>
>> This dereferences tun->steering_prog a second time. It is safe in this
>> load-balancing case to assign a few extra packets to queue 0. But the
>> issue can also be avoided by replacing the function with a direct call
>> in tun_net_xmit:
>>
>>         struct tun_steering_prog *s = rcu_dereference(tun->steering_prog);
>>
>>         if (s)
>>                 ret = bpf_prog_run_clear_cb(s->prog, skb) % tun->numqueues;
>
> Right.
>
>>>  /* Net device start xmit */
>>> -static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
>>> +static void tun_automq_xmit(struct tun_struct *tun, struct sk_buff *skb)
>>>  {
>>> -        struct tun_struct *tun = netdev_priv(dev);
>>> -        int txq = skb->queue_mapping;
>>> -        struct tun_file *tfile;
>>> -        u32 numqueues = 0;
>>> -
>>> -        rcu_read_lock();
>>> -        tfile = rcu_dereference(tun->tfiles[txq]);
>>> -        numqueues = READ_ONCE(tun->numqueues);
>>> -
>>> -        /* Drop packet if interface is not attached */
>>> -        if (txq >= numqueues)
>>> -                goto drop;
>>> -
>>>  #ifdef CONFIG_RPS
>>> -        if (numqueues == 1 && static_key_false(&rps_needed)) {
>>> +        if (tun->numqueues == 1 && static_key_false(&rps_needed)) {
>>>                  /* Select queue was not called for the skbuff, so we
>>>                   * extract the RPS hash and save it into the flow_table here.
>>>                   */
>>> @@ -969,6 +986,26 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
>>>                  }
>>>          }
>>>  #endif
>>> +}
>>> +
>>> +/* Net device start xmit */
>>> +static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
>>> +{
>>> +        struct tun_struct *tun = netdev_priv(dev);
>>> +        int txq = skb->queue_mapping;
>>> +        struct tun_file *tfile;
>>> +        u32 numqueues = 0;
>>> +
>>> +        rcu_read_lock();
>>> +        tfile = rcu_dereference(tun->tfiles[txq]);
>>> +        numqueues = READ_ONCE(tun->numqueues);
>>
>> Now tun->numqueues is read twice, reversing commit fa35864e0bb7
>> ("tuntap: Fix for a race in accessing numqueues"). I don't see anything
>> left that would cause a divide by zero after the relevant code was
>> converted from divide to multiply and subsequently even removed.
>>
>> But if it's safe to read it multiple times, we might as well remove the
>> READ_ONCE.
>
> Good point, but READ_ONCE() is not something new; we'd better change this
> in a separate patch.

Sounds good. It's a simple follow-up. I can also send that.
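For what it's worth, folding the two points together, the selection path
could look roughly like the untested sketch below. It assumes the patch's
tun_steering_prog and a tun_automq_select_queue helper split out of
tun_select_queue (the same way the patch splits out tun_automq_xmit); the
point is just that numqueues is read once and steering_prog dereferenced
once, so neither check can disagree with the later use:

/* Untested sketch: read tun->numqueues once and dereference
 * tun->steering_prog once per packet, so the attached-or-not test and
 * the program that actually runs cannot be torn apart by a concurrent
 * TUNSETSTEERINGEBPF or queue detach.
 */
static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb,
                            void *accel_priv,
                            select_queue_fallback_t fallback)
{
        struct tun_struct *tun = netdev_priv(dev);
        struct tun_steering_prog *s;
        u32 numqueues;
        u16 ret = 0;

        rcu_read_lock();
        numqueues = tun->numqueues;
        s = rcu_dereference(tun->steering_prog);
        if (!numqueues)
                ret = 0;        /* tun_net_xmit drops when txq >= numqueues */
        else if (s)
                ret = bpf_prog_run_clear_cb(s->prog, skb) % numqueues;
        else
                ret = tun_automq_select_queue(tun, skb);
        rcu_read_unlock();

        return ret;
}

Whether this lives in ndo_select_queue or directly in tun_net_xmit is
secondary; either way a single dereference decides both questions.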
>>> @@ -1551,7 +1588,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
>>>          int copylen;
>>>          bool zerocopy = false;
>>>          int err;
>>> -        u32 rxhash;
>>> +        u32 rxhash = 0;
>>>          int skb_xdp = 1;
>>>          bool frags = tun_napi_frags_enabled(tun);
>>>
>>> @@ -1739,7 +1776,10 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
>>>                  rcu_read_unlock();
>>>          }
>>>
>>> -        rxhash = __skb_get_hash_symmetric(skb);
>>> +        rcu_read_lock();
>>> +        if (!rcu_dereference(tun->steering_prog))
>>> +                rxhash = __skb_get_hash_symmetric(skb);
>>> +        rcu_read_unlock();
>>>
>>>          if (frags) {
>>>                  /* Exercise flow dissector code path. */
>>> @@ -1783,7 +1823,9 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
>>>          u64_stats_update_end(&stats->syncp);
>>>          put_cpu_ptr(stats);
>>>
>>> -        tun_flow_update(tun, rxhash, tfile);
>>> +        if (rxhash)
>>> +                tun_flow_update(tun, rxhash, tfile);
>>> +
>>
>> Nit: zero is a valid hash? If so, an int64_t initialized to -1 is the
>> safer check.
>
> Looks like it can't be. E.g. __flow_hash_from_keys() does:
>
> static inline u32 __flow_hash_from_keys(struct flow_keys *keys, u32 keyval)
> {
>         u32 hash;
>
>         __flow_hash_consistentify(keys);
>
>         hash = __flow_hash_words(flow_keys_hash_start(keys),
>                                  flow_keys_hash_length(keys), keyval);
>         if (!hash)
>                 hash = 1;
>
>         return hash;
> }
>
> Thanks

Interesting, thanks. In that case,

Acked-by: Willem de Bruijn <will...@google.com>
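As an aside, for anyone who wants to exercise TUNSETSTEERINGEBPF from
userspace: attaching a program would look roughly like the sketch below.
This is untested against this series; the guarded ioctl definition and the
BPF_PROG_TYPE_SOCKET_FILTER program type should be verified against the
series' if_tun.h and tun.c changes, and the "tap0" name is arbitrary.

/* Untested userspace sketch: create a multi-queue tap device, load a
 * minimal eBPF program that returns 0 (steer every packet to queue 0),
 * and attach it with TUNSETSTEERINGEBPF. The queue used is the program's
 * return value, taken modulo the number of queues on the kernel side.
 */
#include <fcntl.h>
#include <linux/bpf.h>
#include <linux/if.h>
#include <linux/if_tun.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef TUNSETSTEERINGEBPF
#define TUNSETSTEERINGEBPF _IOR('T', 224, int)  /* verify against the patched if_tun.h */
#endif

static int load_steering_prog(void)
{
        struct bpf_insn insns[] = {
                /* r0 = 0; the return value selects the tx queue */
                { .code = BPF_ALU64 | BPF_MOV | BPF_K, .dst_reg = 0, .imm = 0 },
                { .code = BPF_JMP | BPF_EXIT },
        };
        union bpf_attr attr;

        memset(&attr, 0, sizeof(attr));
        attr.prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
        attr.insn_cnt = sizeof(insns) / sizeof(insns[0]);
        attr.insns = (uintptr_t)insns;
        attr.license = (uintptr_t)"GPL";

        return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
}

int main(void)
{
        struct ifreq ifr;
        int tap_fd, prog_fd;

        tap_fd = open("/dev/net/tun", O_RDWR);
        if (tap_fd < 0) {
                perror("open(/dev/net/tun)");
                return 1;
        }

        memset(&ifr, 0, sizeof(ifr));
        strncpy(ifr.ifr_name, "tap0", IFNAMSIZ - 1);    /* arbitrary name */
        ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_MULTI_QUEUE;
        if (ioctl(tap_fd, TUNSETIFF, &ifr) < 0) {
                perror("TUNSETIFF");
                return 1;
        }

        prog_fd = load_steering_prog();
        if (prog_fd < 0) {
                perror("BPF_PROG_LOAD");
                return 1;
        }

        /* The ioctl takes a pointer to the program fd; an fd of -1
         * should detach the program again.
         */
        if (ioctl(tap_fd, TUNSETSTEERINGEBPF, &prog_fd) < 0) {
                perror("TUNSETSTEERINGEBPF");
                return 1;
        }

        pause();        /* keep the queue (and device) alive */
        return 0;
}

A real steering program would of course classify the packet (e.g. hash
over the flow tuple) rather than return a constant.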