On Mon, May 21, 2018 at 5:16 PM, Yi-Hung Wei <yihung....@gmail.com> wrote: > Currently, nf_conntrack_max is used to limit the maximum number of > conntrack entries in the conntrack table for every network namespace. > For the VMs and containers that reside in the same namespace, > they share the same conntrack table, and the total # of conntrack entries > for all the VMs and containers are limited by nf_conntrack_max. In this > case, if one of the VM/container abuses the usage the conntrack entries, > it blocks the others from committing valid conntrack entries into the > conntrack table. Even if we can possibly put the VM in different network > namespace, the current nf_conntrack_max configuration is kind of rigid > that we cannot limit different VM/container to have different # conntrack > entries. > > To address the aforementioned issue, this patch proposes to have a > fine-grained mechanism that could further limit the # of conntrack entries > per-zone. For example, we can designate different zone to different VM, > and set conntrack limit to each zone. By providing this isolation, a > mis-behaved VM only consumes the conntrack entries in its own zone, and > it will not influence other well-behaved VMs. Moreover, the users can > set various conntrack limit to different zone based on their preference. > > The proposed implementation utilizes Netfilter's nf_conncount backend > to count the number of connections in a particular zone. If the number of > connection is above a configured limitation, ovs will return ENOMEM to the > userspace. If userspace does not configure the zone limit, the limit > defaults to zero that is no limitation, which is backward compatible to > the behavior without this patch. > > The following high leve APIs are provided to the userspace: > - OVS_CT_LIMIT_CMD_SET: > * set default connection limit for all zones > * set the connection limit for a particular zone > - OVS_CT_LIMIT_CMD_DEL: > * remove the connection limit for a particular zone > - OVS_CT_LIMIT_CMD_GET: > * get the default connection limit for all zones > * get the connection limit for a particular zone > > Signed-off-by: Yi-Hung Wei <yihung....@gmail.com>
I have few comments, but otherwise patch looks good. > --- > net/openvswitch/Kconfig | 3 +- > net/openvswitch/conntrack.c | 541 > +++++++++++++++++++++++++++++++++++++++++++- > net/openvswitch/conntrack.h | 9 +- > net/openvswitch/datapath.c | 7 +- > net/openvswitch/datapath.h | 3 + > 5 files changed, 557 insertions(+), 6 deletions(-) > > diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig > index 2650205cdaf9..89da9512ec1e 100644 > --- a/net/openvswitch/Kconfig > +++ b/net/openvswitch/Kconfig > @@ -9,7 +9,8 @@ config OPENVSWITCH > (NF_CONNTRACK && ((!NF_DEFRAG_IPV6 || NF_DEFRAG_IPV6) && \ > (!NF_NAT || NF_NAT) && \ > (!NF_NAT_IPV4 || NF_NAT_IPV4) && \ > - (!NF_NAT_IPV6 || NF_NAT_IPV6))) > + (!NF_NAT_IPV6 || NF_NAT_IPV6) && \ > + (!NETFILTER_CONNCOUNT || > NETFILTER_CONNCOUNT))) > select LIBCRC32C > select MPLS > select NET_MPLS_GSO > diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c > index 02fc343feb66..e8bb91420ca9 100644 > --- a/net/openvswitch/conntrack.c > +++ b/net/openvswitch/conntrack.c > @@ -16,8 +16,11 @@ > #include <linux/tcp.h> > #include <linux/udp.h> > #include <linux/sctp.h> > +#include <linux/static_key.h> > #include <net/ip.h> > +#include <net/genetlink.h> > #include <net/netfilter/nf_conntrack_core.h> > +#include <net/netfilter/nf_conntrack_count.h> > #include <net/netfilter/nf_conntrack_helper.h> > #include <net/netfilter/nf_conntrack_labels.h> > #include <net/netfilter/nf_conntrack_seqadj.h> > @@ -76,6 +79,31 @@ struct ovs_conntrack_info { > #endif > }; > > +#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT) > +#define OVS_CT_LIMIT_UNLIMITED 0 > +#define OVS_CT_LIMIT_DEFAULT OVS_CT_LIMIT_UNLIMITED > +#define CT_LIMIT_HASH_BUCKETS 512 > +static DEFINE_STATIC_KEY_FALSE(ovs_ct_limit_enabled); > + > +struct ovs_ct_limit { > + /* Elements in ovs_ct_limit_info->limits hash table */ > + struct hlist_node hlist_node; > + struct rcu_head rcu; > + u16 zone; > + u32 limit; > +}; > + > +struct ovs_ct_limit_info { > + u32 default_limit; > + struct hlist_head *limits; > + struct nf_conncount_data *data __aligned(8); Why does it need explicit alignment attribute? > +}; > + > +static const struct nla_policy ct_limit_policy[OVS_CT_LIMIT_ATTR_MAX + 1] = { > + [OVS_CT_LIMIT_ATTR_ZONE_LIMIT] = { .type = NLA_NESTED, }, > +}; > +#endif > + > static bool labels_nonzero(const struct ovs_key_ct_labels *labels); ... > +static int ovs_ct_check_limit(struct net *net, > + const struct ovs_conntrack_info *info, > + const struct nf_conntrack_tuple *tuple) > +{ > + struct ovs_net *ovs_net = net_generic(net, ovs_net_id); > + const struct ovs_ct_limit_info *ct_limit_info = > ovs_net->ct_limit_info; > + u32 per_zone_limit, connections; > + u32 conncount_key[5]; If the key size of single u32, why the array of 5 is defined for the key? > + > + conncount_key[0] = info->zone.id; > + > + per_zone_limit = ct_limit_get(ct_limit_info, info->zone.id); > + if (per_zone_limit == OVS_CT_LIMIT_UNLIMITED) > + return 0; > + > + connections = nf_conncount_count(net, ct_limit_info->data, > + conncount_key, tuple, &info->zone); > + if (connections > per_zone_limit) > + return -ENOMEM; > + > + return 0; ... > -void ovs_ct_init(struct net *net) > +#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT) > +static int ovs_ct_limit_init(struct net *net, struct ovs_net *ovs_net) > +{ > + int i, err; > + > + ovs_net->ct_limit_info = kmalloc(sizeof(*ovs_net->ct_limit_info), > + GFP_KERNEL); > + if (!ovs_net->ct_limit_info) > + return -ENOMEM; > + > + ovs_net->ct_limit_info->default_limit = OVS_CT_LIMIT_DEFAULT; > + ovs_net->ct_limit_info->limits = > + kmalloc_array(CT_LIMIT_HASH_BUCKETS, sizeof(struct > hlist_head), > + GFP_KERNEL); > + if (!ovs_net->ct_limit_info->limits) { > + kfree(ovs_net->ct_limit_info); > + return -ENOMEM; > + } > + > + for (i = 0; i < CT_LIMIT_HASH_BUCKETS; i++) > + INIT_HLIST_HEAD(&ovs_net->ct_limit_info->limits[i]); > + > + ovs_net->ct_limit_info->data = > + nf_conncount_init(net, NFPROTO_INET, sizeof(u32)); > + > + if (IS_ERR(ovs_net->ct_limit_info->data)) { Can you print error msg, other wise it would be really hard to debug a namespace launch failure due to this issue. > + err = PTR_ERR(ovs_net->ct_limit_info->data); > + kfree(ovs_net->ct_limit_info->limits); > + kfree(ovs_net->ct_limit_info); > + return err; > + } > + return 0; > +} > + .... > +static int ovs_ct_limit_del_zone_limit(struct nlattr *nla_zone_limit, > + struct ovs_ct_limit_info *info) > +{ > + struct ovs_zone_limit *zone_limit; > + int rem; > + u16 zone; > + > + rem = NLA_ALIGN(nla_len(nla_zone_limit)); > + zone_limit = (struct ovs_zone_limit *)nla_data(nla_zone_limit); > + > + while (rem >= sizeof(*zone_limit)) { > + if (unlikely(!check_zone_id(zone_limit->zone_id, &zone))) { > + OVS_NLERR(true, "zone id is out of range"); There is no need to check if the port is out of range when we are deleting it. since hash table lookup would fail anyways. > + } else { > + ovs_lock(); > + ct_limit_del(info, zone); > + ovs_unlock(); > + } > + rem -= NLA_ALIGN(sizeof(*zone_limit)); > + zone_limit = (struct ovs_zone_limit *)((u8 *)zone_limit + > + NLA_ALIGN(sizeof(*zone_limit))); > + } > + This API does not handle delete of default limit. > + if (rem) > + OVS_NLERR(true, "del zone limit has %d unknown bytes", rem); > + > + return 0; > +} > + > +static int ovs_ct_limit_get_default_limit(struct ovs_ct_limit_info *info, > + struct sk_buff *reply) > +{ > + struct ovs_zone_limit zone_limit; > + int err; > + > + zone_limit.zone_id = -1; This is part of UAPI, Can you define constant in openvswitch.h for default zone id to be -1. > + zone_limit.limit = info->default_limit; > + err = nla_put_nohdr(reply, sizeof(zone_limit), &zone_limit); > + if (err) > + return err; > + > + return 0; > +} > + > +static int ovs_ct_limit_get_zone_limit(struct net *net, > + struct nlattr *nla_zone_limit, > + struct ovs_ct_limit_info *info, > + struct sk_buff *reply) > +{ > + struct nf_conntrack_zone ct_zone; > + struct ovs_zone_limit *zone_limit; > + int rem, err; > + u32 conncount_key[5]; > + u16 zone; > + > + rem = NLA_ALIGN(nla_len(nla_zone_limit)); > + zone_limit = (struct ovs_zone_limit *)nla_data(nla_zone_limit); > + > + while (rem >= sizeof(*zone_limit)) { > + if (unlikely(zone_limit->zone_id == -1)) { > + err = ovs_ct_limit_get_default_limit(info, reply); > + if (err) > + return err; > + } else if (unlikely(!check_zone_id(zone_limit->zone_id, > + &zone))) { > + OVS_NLERR(true, "zone id is out of range"); > + } else { > + rcu_read_lock(); > + zone_limit->limit = ct_limit_get(info, zone); > + rcu_read_unlock(); > + > + nf_ct_zone_init(&ct_zone, zone, > NF_CT_DEFAULT_ZONE_DIR, > + 0); > + conncount_key[0] = zone; > + zone_limit->count = nf_conncount_count( > + net, info->data, conncount_key, NULL, > &ct_zone); > + err = nla_put_nohdr(reply, sizeof(*zone_limit), > + zone_limit); > + if (err) > + return err; > + } > + rem -= NLA_ALIGN(sizeof(*zone_limit)); > + zone_limit = (struct ovs_zone_limit *)((u8 *)zone_limit + > + NLA_ALIGN(sizeof(*zone_limit))); > + } > + > + if (rem) > + OVS_NLERR(true, "get zone limit has %d unknown bytes", rem); > + > + return 0; > +} > + > +static int ovs_ct_limit_get_all_zone_limit(struct net *net, > + struct ovs_ct_limit_info *info, > + struct sk_buff *reply) > +{ > + struct nf_conntrack_zone ct_zone; > + struct ovs_zone_limit zone_limit; > + struct ovs_ct_limit *ct_limit; > + struct hlist_head *head; > + u32 conncount_key[5]; > + int i, err = 0; > + > + err = ovs_ct_limit_get_default_limit(info, reply); > + if (err) > + return err; > + > + rcu_read_lock(); > + for (i = 0; i < CT_LIMIT_HASH_BUCKETS; ++i) { > + head = &info->limits[i]; > + hlist_for_each_entry_rcu(ct_limit, head, hlist_node) { > + zone_limit.zone_id = ct_limit->zone; > + zone_limit.limit = ct_limit->limit; > + nf_ct_zone_init(&ct_zone, ct_limit->zone, > + NF_CT_DEFAULT_ZONE_DIR, 0); > + > + conncount_key[0] = ct_limit->zone; > + zone_limit.count = nf_conncount_count(net, info->data, > + conncount_key, NULL, &ct_zone); > + err = nla_put_nohdr(reply, sizeof(zone_limit), > + &zone_limit); > + if (err) > + goto exit_err; Can you write a single helper function to build reply zone_limit object that can be used in ovs_ct_limit_get_zone_limit() and ovs_ct_limit_get_all_zone_limit()? > + } > + } > + > +exit_err: > + rcu_read_unlock(); > + return err; > +} > + ...