Hello, On Thu, 25 May 2017, Eric Dumazet wrote:
> From: Eric Dumazet <eduma...@google.com> > > Andrey Konovalov reported crashes in ipv4_mtu() > > I could reproduce the issue with KASAN kernels, between > 10.246.7.151 and 10.246.7.152 : > > 1) 20 concurrent netperf -t TCP_RR -H 10.246.7.152 -l 1000 & > > 2) At the same time run following loop : > while : > do > ip ro add 10.246.7.152 dev eth0 src 10.246.7.151 mtu 1500 > ip ro del 10.246.7.152 dev eth0 src 10.246.7.151 mtu 1500 > done > > > Cong Wang attempted to add back rt->fi in commit > 82486aa6f1b9 ("ipv4: restore rt->fi for reference counting") > but this proved to add some issues that were complex to solve. > > Instead, I suggested to add a refcount to the metrics themselves, > being a standalone object (in particular, no reference to other objects) > > I tried to make this patch as small as possible to ease its backport, > instead of being super clean. Note that we believe that only ipv4 dst > need to take care of the metric refcount. But if this is wrong, > this patch adds the basic infrastructure to extend this to other > families. > > Many thanks to Julian Anastasov for reviewing this patch, and Cong Wang > for his efforts on this problem. > > Fixes: 2860583fe840 ("ipv4: Kill rt->fi") > Signed-off-by: Eric Dumazet <eduma...@google.com> > Reported-by: Andrey Konovalov <andreyk...@google.com> Nice work, thanks! Reviewed-by: Julian Anastasov <j...@ssi.bg> > --- > include/net/dst.h | 8 +++++++- > include/net/ip_fib.h | 10 +++++----- > net/core/dst.c | 23 ++++++++++++++--------- > net/ipv4/fib_semantics.c | 17 ++++++++++------- > net/ipv4/route.c | 10 +++++++++- > 5 files changed, 45 insertions(+), 23 deletions(-) > > diff --git a/include/net/dst.h b/include/net/dst.h > index > 049af33da3b6c95897d544670cea65c542317673..cfc0437841665d7ed46a714915c50d723c24901c > 100644 > --- a/include/net/dst.h > +++ b/include/net/dst.h > @@ -107,10 +107,16 @@ struct dst_entry { > }; > }; > > +struct dst_metrics { > + u32 metrics[RTAX_MAX]; > + atomic_t refcnt; > +}; > +extern const struct dst_metrics dst_default_metrics; > + > u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old); > -extern const u32 dst_default_metrics[]; > > #define DST_METRICS_READ_ONLY 0x1UL > +#define DST_METRICS_REFCOUNTED 0x2UL > #define DST_METRICS_FLAGS 0x3UL > #define __DST_METRICS_PTR(Y) \ > ((u32 *)((Y) & ~DST_METRICS_FLAGS)) > diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h > index > 6692c5758b332d468f1e0611ecc4f3e03ae03b2b..f7f6aa789c6174c41ca9739206d586c559c1f3a1 > 100644 > --- a/include/net/ip_fib.h > +++ b/include/net/ip_fib.h > @@ -114,11 +114,11 @@ struct fib_info { > __be32 fib_prefsrc; > u32 fib_tb_id; > u32 fib_priority; > - u32 *fib_metrics; > -#define fib_mtu fib_metrics[RTAX_MTU-1] > -#define fib_window fib_metrics[RTAX_WINDOW-1] > -#define fib_rtt fib_metrics[RTAX_RTT-1] > -#define fib_advmss fib_metrics[RTAX_ADVMSS-1] > + struct dst_metrics *fib_metrics; > +#define fib_mtu fib_metrics->metrics[RTAX_MTU-1] > +#define fib_window fib_metrics->metrics[RTAX_WINDOW-1] > +#define fib_rtt fib_metrics->metrics[RTAX_RTT-1] > +#define fib_advmss fib_metrics->metrics[RTAX_ADVMSS-1] > int fib_nhs; > #ifdef CONFIG_IP_ROUTE_MULTIPATH > int fib_weight; > diff --git a/net/core/dst.c b/net/core/dst.c > index > 960e503b5a529a2c4f1866f49c150493ee98d7da..6192f11beec9077de964e2aeff4f78547f08b8da > 100644 > --- a/net/core/dst.c > +++ b/net/core/dst.c > @@ -151,13 +151,13 @@ int dst_discard_out(struct net *net, struct sock *sk, > struct sk_buff *skb) > } > EXPORT_SYMBOL(dst_discard_out); > > -const u32 dst_default_metrics[RTAX_MAX + 1] = { > +const struct dst_metrics dst_default_metrics = { > /* This initializer is needed to force linker to place this variable > * into const section. Otherwise it might end into bss section. > * We really want to avoid false sharing on this variable, and catch > * any writes on it. > */ > - [RTAX_MAX] = 0xdeadbeef, > + .refcnt = ATOMIC_INIT(1), > }; > > void dst_init(struct dst_entry *dst, struct dst_ops *ops, > @@ -169,7 +169,7 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops, > if (dev) > dev_hold(dev); > dst->ops = ops; > - dst_init_metrics(dst, dst_default_metrics, true); > + dst_init_metrics(dst, dst_default_metrics.metrics, true); > dst->expires = 0UL; > dst->path = dst; > dst->from = NULL; > @@ -314,25 +314,30 @@ EXPORT_SYMBOL(dst_release); > > u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old) > { > - u32 *p = kmalloc(sizeof(u32) * RTAX_MAX, GFP_ATOMIC); > + struct dst_metrics *p = kmalloc(sizeof(*p), GFP_ATOMIC); > > if (p) { > - u32 *old_p = __DST_METRICS_PTR(old); > + struct dst_metrics *old_p = (struct dst_metrics > *)__DST_METRICS_PTR(old); > unsigned long prev, new; > > - memcpy(p, old_p, sizeof(u32) * RTAX_MAX); > + atomic_set(&p->refcnt, 1); > + memcpy(p->metrics, old_p->metrics, sizeof(p->metrics)); > > new = (unsigned long) p; > prev = cmpxchg(&dst->_metrics, old, new); > > if (prev != old) { > kfree(p); > - p = __DST_METRICS_PTR(prev); > + p = (struct dst_metrics *)__DST_METRICS_PTR(prev); > if (prev & DST_METRICS_READ_ONLY) > p = NULL; > + } else if (prev & DST_METRICS_REFCOUNTED) { > + if (atomic_dec_and_test(&old_p->refcnt)) > + kfree(old_p); > } > } > - return p; > + BUILD_BUG_ON(offsetof(struct dst_metrics, metrics) != 0); > + return (u32 *)p; > } > EXPORT_SYMBOL(dst_cow_metrics_generic); > > @@ -341,7 +346,7 @@ void __dst_destroy_metrics_generic(struct dst_entry *dst, > unsigned long old) > { > unsigned long prev, new; > > - new = ((unsigned long) dst_default_metrics) | DST_METRICS_READ_ONLY; > + new = ((unsigned long) &dst_default_metrics) | DST_METRICS_READ_ONLY; > prev = cmpxchg(&dst->_metrics, old, new); > if (prev == old) > kfree(__DST_METRICS_PTR(old)); > diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c > index > da449ddb8cc172bd9091c00057a69a095f98b56d..ad9ad4aab5da7c7d11c3b80edbdfcbdd3d7153fe > 100644 > --- a/net/ipv4/fib_semantics.c > +++ b/net/ipv4/fib_semantics.c > @@ -203,6 +203,7 @@ static void rt_fibinfo_free_cpus(struct rtable __rcu * > __percpu *rtp) > static void free_fib_info_rcu(struct rcu_head *head) > { > struct fib_info *fi = container_of(head, struct fib_info, rcu); > + struct dst_metrics *m; > > change_nexthops(fi) { > if (nexthop_nh->nh_dev) > @@ -213,8 +214,9 @@ static void free_fib_info_rcu(struct rcu_head *head) > rt_fibinfo_free(&nexthop_nh->nh_rth_input); > } endfor_nexthops(fi); > > - if (fi->fib_metrics != (u32 *) dst_default_metrics) > - kfree(fi->fib_metrics); > + m = fi->fib_metrics; > + if (m != &dst_default_metrics && atomic_dec_and_test(&m->refcnt)) > + kfree(m); > kfree(fi); > } > > @@ -971,11 +973,11 @@ fib_convert_metrics(struct fib_info *fi, const struct > fib_config *cfg) > val = 255; > if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) > return -EINVAL; > - fi->fib_metrics[type - 1] = val; > + fi->fib_metrics->metrics[type - 1] = val; > } > > if (ecn_ca) > - fi->fib_metrics[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA; > + fi->fib_metrics->metrics[RTAX_FEATURES - 1] |= > DST_FEATURE_ECN_CA; > > return 0; > } > @@ -1033,11 +1035,12 @@ struct fib_info *fib_create_info(struct fib_config > *cfg) > goto failure; > fib_info_cnt++; > if (cfg->fc_mx) { > - fi->fib_metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL); > + fi->fib_metrics = kzalloc(sizeof(*fi->fib_metrics), GFP_KERNEL); > if (!fi->fib_metrics) > goto failure; > + atomic_set(&fi->fib_metrics->refcnt, 1); > } else > - fi->fib_metrics = (u32 *) dst_default_metrics; > + fi->fib_metrics = (struct dst_metrics *)&dst_default_metrics; > > fi->fib_net = net; > fi->fib_protocol = cfg->fc_protocol; > @@ -1238,7 +1241,7 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 > seq, int event, > if (fi->fib_priority && > nla_put_u32(skb, RTA_PRIORITY, fi->fib_priority)) > goto nla_put_failure; > - if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0) > + if (rtnetlink_put_metrics(skb, fi->fib_metrics->metrics) < 0) > goto nla_put_failure; > > if (fi->fib_prefsrc && > diff --git a/net/ipv4/route.c b/net/ipv4/route.c > index > 655d9eebe43e16a59102edcd3ea4bc177c6b341d..6883b3d4ba8f69de2cb924612d60f5671a219a84 > 100644 > --- a/net/ipv4/route.c > +++ b/net/ipv4/route.c > @@ -1385,8 +1385,12 @@ static void rt_add_uncached_list(struct rtable *rt) > > static void ipv4_dst_destroy(struct dst_entry *dst) > { > + struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst); > struct rtable *rt = (struct rtable *) dst; > > + if (p != &dst_default_metrics && atomic_dec_and_test(&p->refcnt)) > + kfree(p); > + > if (!list_empty(&rt->rt_uncached)) { > struct uncached_list *ul = rt->rt_uncached_list; > > @@ -1438,7 +1442,11 @@ static void rt_set_nexthop(struct rtable *rt, __be32 > daddr, > rt->rt_gateway = nh->nh_gw; > rt->rt_uses_gateway = 1; > } > - dst_init_metrics(&rt->dst, fi->fib_metrics, true); > + dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true); > + if (fi->fib_metrics != &dst_default_metrics) { > + rt->dst._metrics |= DST_METRICS_REFCOUNTED; > + atomic_inc(&fi->fib_metrics->refcnt); > + } > #ifdef CONFIG_IP_ROUTE_CLASSID > rt->dst.tclassid = nh->nh_tclassid; > #endif Regards -- Julian Anastasov <j...@ssi.bg>