2019-08-29, 09:04:31 +0200, Steffen Klassert wrote: > On Wed, Aug 21, 2019 at 11:46:25PM +0200, Sabrina Dubroca wrote: > > +static struct sock *esp_find_tcp_sk(struct xfrm_state *x) > > +{ > > + struct xfrm_encap_tmpl *encap = x->encap; > > + struct esp_tcp_sk *esk; > > + __be16 sport, dport; > > + struct sock *nsk; > > + struct sock *sk; > > + > > + sk = rcu_dereference(x->encap_sk); > > + if (sk && sk->sk_state == TCP_ESTABLISHED) > > + return sk; > > + > > + spin_lock_bh(&x->lock); > > + sport = encap->encap_sport; > > + dport = encap->encap_dport; > > + nsk = rcu_dereference_protected(x->encap_sk, > > + lockdep_is_held(&x->lock)); > > + if (sk && sk == nsk) { > > + esk = kmalloc(sizeof(*esk), GFP_ATOMIC); > > + if (!esk) { > > + spin_unlock_bh(&x->lock); > > + return ERR_PTR(-ENOMEM); > > + } > > + RCU_INIT_POINTER(x->encap_sk, NULL); > > + esk->sk = sk; > > + call_rcu(&esk->rcu, esp_free_tcp_sk); > > I don't understand this, can you please explain what you are doing here?
If we get to this block, the current encap_sk is not in ESTABLISHED state and cannot be used to send data. We want to get rid of it (reset x->encap_sk) and find a new usable socket. The weird kmalloc dance is just here so that we can do the sock_put after an RCU grace period, which I think is needed so that we don't destruct a socket that's still used by packets in flight. That's code I copied from Herbert's first submission, I may be missing some details here. > > + } > > + spin_unlock_bh(&x->lock); > > + > > + sk = inet_lookup_established(xs_net(x), &tcp_hashinfo, x->id.daddr.a4, > > + dport, x->props.saddr.a4, sport, 0); > > + if (!sk) > > + return ERR_PTR(-ENOENT); > > + > > + if (!tcp_is_ulp_esp(sk)) { > > + sock_put(sk); > > + return ERR_PTR(-EINVAL); > > + } > > + > > + spin_lock_bh(&x->lock); > > + nsk = rcu_dereference_protected(x->encap_sk, > > + lockdep_is_held(&x->lock)); > > + if (encap->encap_sport != sport || > > + encap->encap_dport != dport) { > > + sock_put(sk); > > + sk = nsk ?: ERR_PTR(-EREMCHG); > > + } else if (sk == nsk) { > > + sock_put(sk); > > + } else { > > + rcu_assign_pointer(x->encap_sk, sk); > > + } > > + spin_unlock_bh(&x->lock); > > + > > + return sk; > > +} > > + > > +static int esp_output_tcp_finish(struct xfrm_state *x, struct sk_buff *skb) > > +{ > > + struct sock *sk; > > + int err; > > + > > + rcu_read_lock(); > > + > > + sk = esp_find_tcp_sk(x); > > + err = PTR_ERR(sk); > > + if (IS_ERR(sk)) > > Maybe better 'if (err)'? That will work if I replace PTR_ERR with PTR_ERR_OR_ZERO. > > + goto out; > > + > > + bh_lock_sock(sk); > > + if (sock_owned_by_user(sk)) { > > + err = espintcp_queue_out(sk, skb); > > + if (err < 0) > > + goto unlock_sock; > > This goto is not needed. Will fix. > > + } else { > > + err = espintcp_push_skb(sk, skb); > > + } > > + > > +unlock_sock: > > + bh_unlock_sock(sk); > > +out: > > + rcu_read_unlock(); > > + return err; > > +} > > + > > +static int esp_output_tcp_encap_cb(struct net *net, struct sock *sk, > > + struct sk_buff *skb) > > +{ > > + struct dst_entry *dst = skb_dst(skb); > > + struct xfrm_state *x = dst->xfrm; > > + > > + return esp_output_tcp_finish(x, skb); > > +} > > + > > +static int esp_output_tail_tcp(struct xfrm_state *x, struct sk_buff *skb) > > +{ > > + int err; > > + > > + local_bh_disable(); > > + err = xfrm_trans_queue_net(xs_net(x), skb, esp_output_tcp_encap_cb); > > + local_bh_enable(); > > + > > + /* EINPROGRESS just happens to do the right thing. It > > + * actually means that the skb has been consumed and > > + * isn't coming back. > > + */ > > + return err ?: -EINPROGRESS; > > +} > > +#endif > > + > > static void esp_output_done(struct crypto_async_request *base, int err) > > { > > struct sk_buff *skb = base->data; > > @@ -147,7 +272,13 @@ static void esp_output_done(struct > > crypto_async_request *base, int err) > > secpath_reset(skb); > > xfrm_dev_resume(skb); > > } else { > > - xfrm_output_resume(skb, err); > > +#ifdef CONFIG_XFRM_ESPINTCP > > Do we really need all these ifdef? I guess most of them could > be avoided with some code refactorization. If we compile espintcp out, esp_output_tail_tcp will be dead code so it might as well not be compiled in either. The more trivial operations (like in esp_input_done2) could be compiled in anyway. That might be considered a bit inconsistent, because I need to keep the ifdef in esp_init_state (so that we land in the default case and error out when espintcp is disabled). Then I can provide variants of some functions (esp_output_tcp_encap, esp_output_tail_tcp) that always fail for !XFRM_ESPINTCP. > > + if (!err && > > + x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP) > > + esp_output_tail_tcp(x, skb); > > + else > > +#endif > > + xfrm_output_resume(skb, err); > > } > > } > > ... > > > @@ -296,7 +460,7 @@ int esp_output_head(struct xfrm_state *x, struct > > sk_buff *skb, struct esp_info * > > struct sk_buff *trailer; > > int tailen = esp->tailen; > > > > - /* this is non-NULL only with UDP Encapsulation */ > > + /* this is non-NULL only with TCP/UDP Encapsulation */ > > if (x->encap) { > > int err = esp_output_encap(x, skb, esp); > > > > @@ -491,6 +655,11 @@ int esp_output_tail(struct xfrm_state *x, struct > > sk_buff *skb, struct esp_info * > > if (sg != dsg) > > esp_ssg_unref(x, tmp); > > > > +#ifdef CONFIG_XFRM_ESPINTCP > > + if (!err && x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP) > > + err = esp_output_tail_tcp(x, skb); > > +#endif > > + > > error_free: > > kfree(tmp); > > error: > > @@ -617,10 +786,16 @@ int esp_input_done2(struct sk_buff *skb, int err) > > > > if (x->encap) { > > struct xfrm_encap_tmpl *encap = x->encap; > > + struct tcphdr *th = (void *)(skb_network_header(skb) + ihl); > > This gives a unused variable warning if CONFIG_XFRM_ESPINTCP > is not set. Oops, will fix. Or that will take care of itself if I just remove the ifdef below. > > struct udphdr *uh = (void *)(skb_network_header(skb) + ihl); > > __be16 source; > > > > switch (x->encap->encap_type) { > > +#ifdef CONFIG_XFRM_ESPINTCP (this one) > > + case TCP_ENCAP_ESPINTCP: > > + source = th->source; > > + break; > > +#endif > > case UDP_ENCAP_ESPINUDP: > > case UDP_ENCAP_ESPINUDP_NON_IKE: > > source = uh->source; > > @@ -1017,6 +1192,14 @@ static int esp_init_state(struct xfrm_state *x) > > case UDP_ENCAP_ESPINUDP_NON_IKE: > > x->props.header_len += sizeof(struct udphdr) + 2 * > > sizeof(u32); > > break; > > +#ifdef CONFIG_XFRM_ESPINTCP > > + case TCP_ENCAP_ESPINTCP: > > + /* only the length field, TCP encap is done by > > + * the socket > > + */ > > + x->props.header_len += 2; > > + break; > > +#endif > > } > > } > > > > diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig > > index c967fc3c38c8..ccc012b3ea10 100644 > > --- a/net/xfrm/Kconfig > > +++ b/net/xfrm/Kconfig > > @@ -71,6 +71,15 @@ config XFRM_IPCOMP > > select CRYPTO > > select CRYPTO_DEFLATE > > > > +config XFRM_ESPINTCP > > + bool "ESP in TCP encapsulation (RFC 8229)" > > + depends on XFRM && INET_ESP > > + select STREAM_PARSER > > I'm getting these compile errors: > > espintcp.o: In function `espintcp_close': > /home/klassert/git/linux-stk/net/xfrm/espintcp.c:469: undefined reference to > `sk_msg_free' > net/xfrm/espintcp.o: In function `espintcp_sendmsg': > /home/klassert/git/linux-stk/net/xfrm/espintcp.c:302: undefined reference to > `sk_msg_alloc' > /home/klassert/git/linux-stk/net/xfrm/espintcp.c:316: undefined reference to > `sk_msg_memcopy_from_iter' > /home/klassert/git/linux-stk/net/xfrm/espintcp.c:341: undefined reference to > `sk_msg_free' > /home/klassert/git/linux-stk/net/xfrm/espintcp.c:321: undefined reference to > `sk_msg_memcopy_from_iter' > /home/klassert/git/linux-stk/Makefile:1067: recipe for target 'vmlinux' failed > make[1]: *** [vmlinux] Error 1 > > I guess you need to select NET_SOCK_MSG. Right, I'll go fix the Kconfig entry. > Btw. I've updated the ipsec-next tree, so patch 1 is not needed anymore. Cool, I'll drop it. > Everything else looks good! Thanks for the review. -- Sabrina