John Heffner <[EMAIL PROTECTED]> writes:
> Add IP(V6)_PMTUDISC_PROBE value for IP(V6)_MTU_DISCOVER. This option forces
> us not to fragment, but does not make use of the kernel path MTU discovery.
> That is, it allows for user-mode MTU probing (or, packetization-layer path
> MTU discovery). This is particularly useful for diagnostic utilities, like
> traceroute/tracepath.
You should probably send a manpages update to the manpages maintainer too (cc'ed with fullquote) -Andi > > Signed-off-by: John Heffner <[EMAIL PROTECTED]> > --- > include/linux/in.h | 1 + > include/linux/in6.h | 1 + > include/linux/skbuff.h | 3 ++- > include/net/ip.h | 2 +- > net/core/skbuff.c | 2 ++ > net/ipv4/ip_output.c | 14 ++++++++++---- > net/ipv4/ip_sockglue.c | 2 +- > net/ipv4/raw.c | 3 +++ > net/ipv6/ip6_output.c | 12 ++++++++---- > net/ipv6/ipv6_sockglue.c | 2 +- > net/ipv6/raw.c | 3 +++ > 11 files changed, 33 insertions(+), 12 deletions(-) > > diff --git a/include/linux/in.h b/include/linux/in.h > index 1912e7c..2dc1f8a 100644 > --- a/include/linux/in.h > +++ b/include/linux/in.h > @@ -83,6 +83,7 @@ struct in_addr { > #define IP_PMTUDISC_DONT 0 /* Never send DF frames */ > #define IP_PMTUDISC_WANT 1 /* Use per route hints */ > #define IP_PMTUDISC_DO 2 /* Always DF > */ > +#define IP_PMTUDISC_PROBE 3 /* Ignore dst pmtu */ > > #define IP_MULTICAST_IF 32 > #define IP_MULTICAST_TTL 33 > diff --git a/include/linux/in6.h b/include/linux/in6.h > index 4e8350a..d559fac 100644 > --- a/include/linux/in6.h > +++ b/include/linux/in6.h > @@ -179,6 +179,7 @@ struct in6_flowlabel_req > #define IPV6_PMTUDISC_DONT 0 > #define IPV6_PMTUDISC_WANT 1 > #define IPV6_PMTUDISC_DO 2 > +#define IPV6_PMTUDISC_PROBE 3 > > /* Flowlabel */ > #define IPV6_FLOWLABEL_MGR 32 > diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h > index 4ff3940..64038b4 100644 > --- a/include/linux/skbuff.h > +++ b/include/linux/skbuff.h > @@ -284,7 +284,8 @@ struct sk_buff { > nfctinfo:3; > __u8 pkt_type:3, > fclone:2, > - ipvs_property:1; > + ipvs_property:1, > + ign_dst_mtu; > __be16 protocol; > > void (*destructor)(struct sk_buff *skb); > diff --git a/include/net/ip.h b/include/net/ip.h > index e79c3e3..f5874a3 100644 > --- a/include/net/ip.h > +++ b/include/net/ip.h > @@ -201,7 +201,7 @@ int ip_decrease_ttl(struct iphdr *iph) > static inline > int ip_dont_fragment(struct sock *sk, struct 
dst_entry *dst) > { > - return (inet_sk(sk)->pmtudisc == IP_PMTUDISC_DO || > + return (inet_sk(sk)->pmtudisc >= IP_PMTUDISC_DO || > (inet_sk(sk)->pmtudisc == IP_PMTUDISC_WANT && > !(dst_metric(dst, RTAX_LOCK)&(1<<RTAX_MTU)))); > } > diff --git a/net/core/skbuff.c b/net/core/skbuff.c > index 702fa8f..5c8515c 100644 > --- a/net/core/skbuff.c > +++ b/net/core/skbuff.c > @@ -474,6 +474,7 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t > gfp_mask) > #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE) > C(ipvs_property); > #endif > + C(ign_dst_mtu); > C(protocol); > n->destructor = NULL; > C(mark); > @@ -549,6 +550,7 @@ static void copy_skb_header(struct sk_buff *new, const > struct sk_buff *old) > #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE) > new->ipvs_property = old->ipvs_property; > #endif > + new->ign_dst_mtu = old->ign_dst_mtu; > #ifdef CONFIG_BRIDGE_NETFILTER > new->nf_bridge = old->nf_bridge; > nf_bridge_get(old->nf_bridge); > diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c > index 90bdd53..a7e8944 100644 > --- a/net/ipv4/ip_output.c > +++ b/net/ipv4/ip_output.c > @@ -201,7 +201,8 @@ static inline int ip_finish_output(struct sk_buff *skb) > return dst_output(skb); > } > #endif > - if (skb->len > dst_mtu(skb->dst) && !skb_is_gso(skb)) > + if (skb->len > dst_mtu(skb->dst) && > + !skb->ign_dst_mtu && !skb_is_gso(skb)) > return ip_fragment(skb, ip_finish_output2); > else > return ip_finish_output2(skb); > @@ -801,7 +802,9 @@ int ip_append_data(struct sock *sk, > inet->cork.addr = ipc->addr; > } > dst_hold(&rt->u.dst); > - inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path); > + inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE > ? > + rt->u.dst.dev->mtu : > + dst_mtu(rt->u.dst.path); > inet->cork.rt = rt; > inet->cork.length = 0; > sk->sk_sndmsg_page = NULL; > @@ -1220,13 +1223,16 @@ int ip_push_pending_frames(struct sock *sk) > * to fragment the frame generated here. 
No matter, what transforms > * how transforms change size of the packet, it will come out. > */ > - if (inet->pmtudisc != IP_PMTUDISC_DO) > + if (inet->pmtudisc < IP_PMTUDISC_DO) > skb->local_df = 1; > > + if (inet->pmtudisc == IP_PMTUDISC_PROBE) > + skb->ign_dst_mtu = 1; > + > /* DF bit is set when we want to see DF on outgoing frames. > * If local_df is set too, we still allow to fragment this frame > * locally. */ > - if (inet->pmtudisc == IP_PMTUDISC_DO || > + if (inet->pmtudisc >= IP_PMTUDISC_DO || > (skb->len <= dst_mtu(&rt->u.dst) && > ip_dont_fragment(sk, &rt->u.dst))) > df = htons(IP_DF); > diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c > index 23048d9..98fa088 100644 > --- a/net/ipv4/ip_sockglue.c > +++ b/net/ipv4/ip_sockglue.c > @@ -536,7 +536,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, > inet->hdrincl = val ? 1 : 0; > break; > case IP_MTU_DISCOVER: > - if (val<0 || val>2) > + if (val<0 || val>3) > goto e_inval; > inet->pmtudisc = val; > break; > diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c > index f252f4e..f562262 100644 > --- a/net/ipv4/raw.c > +++ b/net/ipv4/raw.c > @@ -302,6 +302,9 @@ static int raw_send_hdrinc(struct sock *sk, void *from, > size_t length, > if (err) > goto error_fault; > > + if (inet->pmtudisc == IP_PMTUDISC_PROBE) > + skb->ign_dst_mtu = 1; > + > /* We don't modify invalid header */ > if (length >= sizeof(*iph) && iph->ihl * 4U <= length) { > if (!iph->saddr) > diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c > index 711dfc3..8b8c04b 100644 > --- a/net/ipv6/ip6_output.c > +++ b/net/ipv6/ip6_output.c > @@ -139,8 +139,8 @@ static int ip6_output2(struct sk_buff *skb) > > int ip6_output(struct sk_buff *skb) > { > - if ((skb->len > dst_mtu(skb->dst) && !skb_is_gso(skb)) || > - dst_allfrag(skb->dst)) > + if ((skb->len > dst_mtu(skb->dst) && !skb->ign_dst_mtu && > + !skb_is_gso(skb)) || dst_allfrag(skb->dst)) > return ip6_fragment(skb, ip6_output2); > else > return ip6_output2(skb); > @@ -574,7 
+574,7 @@ static int ip6_fragment(struct sk_buff *skb, int > (*output)(struct sk_buff *)) > hlen = ip6_find_1stfragopt(skb, &prevhdr); > nexthdr = *prevhdr; > > - mtu = dst_mtu(&rt->u.dst); > + mtu = skb->ign_dst_mtu ? skb->len : dst_mtu(&rt->u.dst); > if (np && np->frag_size < mtu) { > if (np->frag_size) > mtu = np->frag_size; > @@ -1015,7 +1015,8 @@ int ip6_append_data(struct sock *sk, int getfrag(void > *from, char *to, > inet->cork.fl = *fl; > np->cork.hop_limit = hlimit; > np->cork.tclass = tclass; > - mtu = dst_mtu(rt->u.dst.path); > + mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ? > + rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path); > if (np->frag_size < mtu) { > if (np->frag_size) > mtu = np->frag_size; > @@ -1303,6 +1304,9 @@ int ip6_push_pending_frames(struct sock *sk) > tmp_skb->sk = NULL; > } > > + if (np->pmtudisc == IPV6_PMTUDISC_PROBE) > + skb->ign_dst_mtu = 1; > + > ipv6_addr_copy(final_dst, &fl->fl6_dst); > __skb_pull(skb, skb->h.raw - skb->nh.raw); > if (opt && opt->opt_flen) > diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c > index f5f9582..6e88597 100644 > --- a/net/ipv6/ipv6_sockglue.c > +++ b/net/ipv6/ipv6_sockglue.c > @@ -694,7 +694,7 @@ done: > retv = ip6_ra_control(sk, val, NULL); > break; > case IPV6_MTU_DISCOVER: > - if (val<0 || val>2) > + if (val<0 || val>3) > goto e_inval; > np->pmtudisc = val; > retv = 0; > diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c > index 75db277..9ef0946 100644 > --- a/net/ipv6/raw.c > +++ b/net/ipv6/raw.c > @@ -587,6 +587,9 @@ static int rawv6_send_hdrinc(struct sock *sk, void *from, > int length, > if (err) > goto error_fault; > > + if (np->pmtudisc == IPV6_PMTUDISC_PROBE) > + skb->ign_dst_mtu = 1; > + > IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS); > err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, rt->u.dst.dev, > dst_output); > -- > 1.5.0.2.gc260-dirty > > - > To unsubscribe from this list: send the line "unsubscribe netdev" in > the body of a message to [EMAIL PROTECTED] > More 
majordomo info at http://vger.kernel.org/majordomo-info.html
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at http://vger.kernel.org/majordomo-info.html