On Tue, Jul 12, 2016 at 3:26 PM, Joe Stringer <j...@ovn.org> wrote:
> The core fragmentation handling logic is exported on all supported
> kernels, so it's not necessary to backport the latest version of this.
> This greatly simplifies the code due to inconsistencies between the old
> per-lookup garbage collection and the newer workqueue based garbage
> collection.
>
> As a result of simplifying and removing unnecessary backport code, a few
> bugs are fixed for corner cases such as when some fragments remain in
> the fragment cache when openvswitch is unloaded.
>
> Some backported ip functions need a little extra logic than what is seen
> on the latest code due to this, for instance on kernels <3.17:
> * Call inet_frag_evictor() before defrag
> * Limit hashsize in ip{,6}_fragment logic
>
> The pernet init/exit logic also differs a little from upstream. Upstream
> ipv[46]_defrag logic initializes the various pernet fragment parameters
> and its own global fragments cache. In the OVS backport, the pernet
> parameters are shared while the fragments cache is separate. The
> backport relies upon upstream pernet initialization to perform the
> shared setup, and performs no pernet initialization of its own. When it
> comes to pernet exit however, the backport must ensure that all
> OVS-specific fragment state is cleared, while the shared state remains
> untouched so that the regular ipv[46] logic may do its own cleanup. In
> practice this means that OVS must have its own divergent implementation
> of inet_frags_exit_net().
>
> Fixes the following crash:
>
> Call Trace:
>  <IRQ>
>  [<ffffffff810744f6>] ? call_timer_fn+0x36/0x100
>  [<ffffffff8107548f>] run_timer_softirq+0x1ef/0x2f0
>  [<ffffffff8106cccc>] __do_softirq+0xec/0x2c0
>  [<ffffffff8106d215>] irq_exit+0x105/0x110
>  [<ffffffff81737095>] smp_apic_timer_interrupt+0x45/0x60
>  [<ffffffff81735a1d>] apic_timer_interrupt+0x6d/0x80
>  <EOI>
>  [<ffffffff8104f596>] ? native_safe_halt+0x6/0x10
>  [<ffffffff8101cb2f>] default_idle+0x1f/0xc0
>  [<ffffffff8101d406>] arch_cpu_idle+0x26/0x30
>  [<ffffffff810bf3a5>] cpu_startup_entry+0xc5/0x290
>  [<ffffffff810415ed>] start_secondary+0x21d/0x2d0
> Code:  Bad RIP value.
> RIP  [<ffffffffa0177480>] 0xffffffffa0177480
>  RSP <ffff88003f703e78>
> CR2: ffffffffa0177480
> ---[ end trace eb98ca80ba07bd9c ]---
> Kernel panic - not syncing: Fatal exception in interrupt
>
> Signed-off-by: Joe Stringer <j...@ovn.org>
> ---
> I've tested this on CentOS kernel 3.10.0-327 and Ubuntu kernels
> 3.13.0-68, 3.16.0-70, 3.19.0-58, and 4.2.0-35. Some earlier kernel
> versions may still trigger fragmentation-related crashes due to upstream
> bugs, for instance on Ubuntu's 3.13.0-24.
> ---
>  acinclude.m4                                  |   1 +
>  datapath/linux/compat/include/net/inet_frag.h |  58 ++-
>  datapath/linux/compat/inet_fragment.c         | 486 
> ++------------------------
>  datapath/linux/compat/ip_fragment.c           |  36 +-
>  datapath/linux/compat/nf_conntrack_reasm.c    |  17 +
>  5 files changed, 91 insertions(+), 507 deletions(-)
>
...

> -void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
> -                                  const char *prefix)
> -{
> -       static const char msg[] = "inet_frag_find: Fragment hash bucket"
> -               " list length grew over limit " 
> __stringify(INETFRAGS_MAXDEPTH)
> -               ". Dropping fragment.\n";
> +       local_bh_disable();
> +       inet_frag_evictor(nf, f, true);
> +       local_bh_enable();
>
> -       if (PTR_ERR(q) == -ENOBUFS)
> -               net_dbg_ratelimited("%s%s", prefix, msg);
> +       nf->low_thresh = thresh;
>  }

Upstream inet_frags_exit_net() and back ported looks the same, we can
just use the exported symbol.


> +#endif /* HAVE_INET_FRAGS_WITH_FRAGS_WORK */
>
>  #endif /* !HAVE_CORRECT_MRU_HANDLING */
> diff --git a/datapath/linux/compat/ip_fragment.c 
> b/datapath/linux/compat/ip_fragment.c
> index 8d01088abc0a..64e2cf23c327 100644
> --- a/datapath/linux/compat/ip_fragment.c
> +++ b/datapath/linux/compat/ip_fragment.c
> @@ -76,12 +76,7 @@ struct ipfrag_skb_cb
>
>  /* Describe an entry in the "incomplete datagrams" queue. */
>  struct ipq {
> -       union {
> -               struct inet_frag_queue q;
> -#ifndef HAVE_INET_FRAG_QUEUE_WITH_LIST_EVICTOR
> -               struct ovs_inet_frag_queue oq;
> -#endif
> -       };
> +       struct inet_frag_queue q;
>
>         u32             user;
>         __be32          saddr;
> @@ -119,6 +114,12 @@ static unsigned int ipqhashfn(__be16 id, __be32 saddr, 
> __be32 daddr, u8 prot)
>                             (__force u32)saddr, (__force u32)daddr,
>                             ip4_frags.rnd);
>  }
> +/* fb3cfe6e75b9 ("inet: frag: remove hash size assumptions from callers")
> + * shifted this logic into inet_fragment, but prior kernels still need this.
> + */
> +#if LINUX_VERSION_CODE < KERNEL_VERSION(3,17,0)
> +#define ipqhashfn(a, b, c, d) (ipqhashfn(a, b, c, d) & (INETFRAGS_HASHSZ - 
> 1))
> +#endif
>
>  #ifdef HAVE_INET_FRAGS_CONST
>  static unsigned int ip4_hashfn(const struct inet_frag_queue *q)
> @@ -267,6 +268,23 @@ out:
>         ipq_put(qp);
>  }
>
> +/* Memory limiting on fragments.  Evictor trashes the oldest
> + * fragment queue until we are back under the threshold.
> + *
> + * Necessary for kernels earlier than v3.17. Replaced in commit
> + * b13d3cbfb8e8 ("inet: frag: move eviction of queues to work queue").
> + */
> +static void ip_evictor(struct net *net)
> +{
> +#ifdef HAVE_INET_FRAG_EVICTOR
> +       int evicted;
> +
> +       evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags, false);
> +       if (evicted)
> +               IP_ADD_STATS_BH(net, IPSTATS_MIB_REASMFAILS, evicted);
> +#endif
> +}
> +
>  /* Find the correct entry in the "incomplete datagrams" queue for
>   * this IP datagram, and create new one, if nothing is found.
>   */
> @@ -281,6 +299,11 @@ static struct ipq *ip_find(struct net *net, struct iphdr 
> *iph,
>         arg.user = user;
>         arg.vif = vif;
>
> +       ip_evictor(net);
> +
IP fragment eviction can be done after this lookup. So that there is
better chance of finding the fragment.

> +#ifdef HAVE_INET_FRAGS_WITH_RWLOCK
> +       read_lock(&ip4_frags.lock);
> +#endif
>         hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol);
>
>         q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash);
> @@ -701,7 +724,6 @@ int rpl_ip_defrag(struct net *net, struct sk_buff *skb, 
> u32 user)
>         kfree_skb(skb);
>         return -ENOMEM;
>  }
> -EXPORT_SYMBOL_GPL(rpl_ip_defrag);
>
_______________________________________________
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev

Reply via email to