On 8/3/20 2:52 PM, Stefano Brivio wrote:
> @@ -461,6 +464,91 @@ static inline void iptunnel_xmit_stats(struct net_device 
> *dev, int pkt_len)
>       }
>  }
>  
> +/**
> + * skb_tunnel_check_pmtu() - Check, update PMTU and trigger ICMP reply as 
> needed
> + * @skb:     Buffer being sent by encapsulation, L2 headers expected
> + * @encap_dst:       Destination for tunnel encapsulation (outer IP)
> + * @headroom:        Encapsulation header size, bytes
> + * @reply:   Build matching ICMP or ICMPv6 message as a result
> + *
> + * L2 tunnel implementations that can carry IP and can be directly bridged
> + * (currently UDP tunnels) can't always rely on IP forwarding paths to handle
> + * PMTU discovery. In the bridged case, ICMP or ICMPv6 messages need to be 
> built
> + * based on payload and sent back by the encapsulation itself.
> + *
> + * For routable interfaces, we just need to update the PMTU for the 
> destination.
> + *
> + * Return: 0 if ICMP error not needed, length if built, negative value on 
> error
> + */
> +static inline int skb_tunnel_check_pmtu(struct sk_buff *skb,
> +                                     struct dst_entry *encap_dst,
> +                                     int headroom, bool reply)

Given its size, this is probably better as a function. I believe it can
go into net/ipv4/ip_tunnel_core.c like you have iptunnel_pmtud_build_icmp.


> +{
> +     u32 mtu = dst_mtu(encap_dst) - headroom;
> +
> +     if ((skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu)) ||
> +         (!skb_is_gso(skb) && (skb->len - skb_mac_header_len(skb)) <= mtu))
> +             return 0;
> +
> +     skb_dst_update_pmtu_no_confirm(skb, mtu);
> +
> +     if (!reply || skb->pkt_type == PACKET_HOST)
> +             return 0;
> +
> +     if (skb->protocol == htons(ETH_P_IP) && mtu > 576) {

I am surprised the 576 does not have an existing macro.

> +             const struct icmphdr *icmph = icmp_hdr(skb);
> +             const struct iphdr *iph = ip_hdr(skb);
> +
> +             if (iph->frag_off != htons(IP_DF) ||
> +                 ipv4_is_lbcast(iph->daddr) ||
> +                 ipv4_is_multicast(iph->daddr) ||
> +                 ipv4_is_zeronet(iph->saddr) ||
> +                 ipv4_is_loopback(iph->saddr) ||
> +                 ipv4_is_lbcast(iph->saddr) ||
> +                 ipv4_is_multicast(iph->saddr) ||
> +                 (iph->protocol == IPPROTO_ICMP && icmp_is_err(icmph->type)))
> +                     return 0;
> +
> +             return iptunnel_pmtud_build_icmp(skb, mtu);
> +     }
> +
> +#if IS_ENABLED(CONFIG_IPV6)
> +     if (skb->protocol == htons(ETH_P_IPV6) && mtu > IPV6_MIN_MTU) {
> +             const struct ipv6hdr *ip6h = ipv6_hdr(skb);
> +             int stype = ipv6_addr_type(&ip6h->saddr);
> +             u8 proto = ip6h->nexthdr;
> +             __be16 frag_off;
> +             int offset;
> +
> +             if (stype == IPV6_ADDR_ANY || stype == IPV6_ADDR_MULTICAST ||
> +                 stype == IPV6_ADDR_LOOPBACK)
> +                     return 0;
> +
> +             offset = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &proto,
> +                                       &frag_off);
> +             if (offset < 0 || (frag_off & htons(~0x7)))
> +                     return 0;
> +
> +             if (proto == IPPROTO_ICMPV6) {
> +                     struct icmp6hdr *icmp6h;
> +
> +                     if (!pskb_may_pull(skb, (skb_network_header(skb) +
> +                                              offset + 1 - skb->data)))
> +                             return 0;
> +
> +                     icmp6h = (struct icmp6hdr *)(skb_network_header(skb) +
> +                                                  offset);
> +                     if (icmpv6_is_err(icmp6h->icmp6_type) ||
> +                         icmp6h->icmp6_type == NDISC_REDIRECT)
> +                             return 0;
> +             }
> +
> +             return iptunnel_pmtud_build_icmp(skb, mtu);
> +     }
> +#endif


separate v4 and v6 code into helpers based on skb->protocol; the mtu
check then becomes part of the version specific helpers.

> +     return 0;
> +}
> +
>  static inline void *ip_tunnel_info_opts(struct ip_tunnel_info *info)
>  {
>       return info + 1;
> diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
> index f8b419e2475c..54a3dbf7f512 100644
> --- a/net/ipv4/ip_tunnel_core.c
> +++ b/net/ipv4/ip_tunnel_core.c
> @@ -184,6 +184,128 @@ int iptunnel_handle_offloads(struct sk_buff *skb,
>  }
>  EXPORT_SYMBOL_GPL(iptunnel_handle_offloads);
>  
> +/**
> + * iptunnel_pmtud_build_icmp() - Build ICMP or ICMPv6 error message for PMTUD
> + * @skb:     Original packet with L2 header
> + * @mtu:     MTU value for ICMP or ICMPv6 error
> + *
> + * Return: length on success, negative error code if message couldn't be 
> built.
> + */
> +int iptunnel_pmtud_build_icmp(struct sk_buff *skb, int mtu)
> +{
> +     struct ethhdr eh;
> +     int len, err;
> +
> +     if (skb->protocol == htons(ETH_P_IP))
> +             len = ETH_HLEN + sizeof(struct iphdr);
> +     else if (IS_ENABLED(CONFIG_IPV6) && skb->protocol == htons(ETH_P_IPV6))
> +             len = ETH_HLEN + sizeof(struct ipv6hdr);
> +     else
> +             return 0;
> +
> +     if (!pskb_may_pull(skb, len))
> +             return -EINVAL;
> +
> +     skb_copy_bits(skb, skb_mac_offset(skb), &eh, ETH_HLEN);
> +     pskb_pull(skb, ETH_HLEN);
> +     skb_reset_network_header(skb);
> +
> +     if (skb->protocol == htons(ETH_P_IP)) {
> +             const struct iphdr *iph = ip_hdr(skb);
> +             struct icmphdr *icmph;
> +             struct iphdr *niph;
> +
> +             err = pskb_trim(skb, 576 - sizeof(*niph) - sizeof(*icmph));
> +             if (err)
> +                     return err;
> +
> +             len = skb->len + sizeof(*icmph);
> +             err = skb_cow(skb, sizeof(*niph) + sizeof(*icmph) + ETH_HLEN);
> +             if (err)
> +                     return err;
> +
> +             icmph = skb_push(skb, sizeof(*icmph));
> +             *icmph = (struct icmphdr) {
> +                     .type                   = ICMP_DEST_UNREACH,
> +                     .code                   = ICMP_FRAG_NEEDED,
> +                     .checksum               = 0,
> +                     .un.frag.__unused       = 0,
> +                     .un.frag.mtu            = ntohs(mtu),
> +             };
> +             icmph->checksum = ip_compute_csum(icmph, len);
> +             skb_reset_transport_header(skb);
> +
> +             niph = skb_push(skb, sizeof(*niph));
> +             *niph = (struct iphdr) {
> +                     .ihl                    = sizeof(*niph) / 4u,
> +                     .version                = 4,
> +                     .tos                    = 0,
> +                     .tot_len                = htons(len + sizeof(*niph)),
> +                     .id                     = 0,
> +                     .frag_off               = htons(IP_DF),
> +                     .ttl                    = iph->ttl,
> +                     .protocol               = IPPROTO_ICMP,
> +                     .saddr                  = iph->daddr,
> +                     .daddr                  = iph->saddr,
> +             };
> +             ip_send_check(niph);
> +     }
> +
> +#if IS_ENABLED(CONFIG_IPV6)
> +     else if (skb->protocol == htons(ETH_P_IPV6)) {
> +             const struct ipv6hdr *ip6h = ipv6_hdr(skb);
> +             struct icmp6hdr *icmp6h;
> +             struct ipv6hdr *nip6h;
> +             __wsum csum;
> +
> +             err = pskb_trim(skb, IPV6_MIN_MTU -
> +                                  sizeof(*nip6h) - sizeof(*icmp6h));
> +             if (err)
> +                     return err;
> +
> +             len = skb->len + sizeof(*icmp6h);
> +             err = skb_cow(skb, sizeof(*nip6h) + sizeof(*icmp6h) + ETH_HLEN);
> +             if (err)
> +                     return err;
> +
> +             icmp6h = skb_push(skb, sizeof(*icmp6h));
> +             *icmp6h = (struct icmp6hdr) {
> +                     .icmp6_type             = ICMPV6_PKT_TOOBIG,
> +                     .icmp6_code             = 0,
> +                     .icmp6_cksum            = 0,
> +                     .icmp6_mtu              = htonl(mtu),
> +             };
> +             skb_reset_transport_header(skb);
> +
> +             nip6h = skb_push(skb, sizeof(*nip6h));
> +             *nip6h = (struct ipv6hdr) {
> +                     .priority               = 0,
> +                     .version                = 6,
> +                     .flow_lbl               = { 0 },
> +                     .payload_len            = htons(len),
> +                     .nexthdr                = IPPROTO_ICMPV6,
> +                     .hop_limit              = ip6h->hop_limit,
> +                     .saddr                  = ip6h->daddr,
> +                     .daddr                  = ip6h->saddr,
> +             };
> +
> +             csum = csum_partial(icmp6h, len, 0);
> +             icmp6h->icmp6_cksum = csum_ipv6_magic(&nip6h->saddr,
> +                                                   &nip6h->daddr, len,
> +                                                   IPPROTO_ICMPV6, csum);
> +     }
> +#endif
> +
> +     skb_reset_network_header(skb);
> +     skb->ip_summed = CHECKSUM_NONE;
> +
> +     eth_header(skb, skb->dev, htons(eh.h_proto), eh.h_source, eh.h_dest, 0);
> +     skb_reset_mac_header(skb);
> +
> +     return skb->len;
> +}
> +EXPORT_SYMBOL(iptunnel_pmtud_build_icmp);

I think separate v4 and v6 versions would be more readable; the
duplication is mostly skb manipulation.

Reply via email to