This is a first stab at kernel datapath meter support. Only drop band type is implemented.
Signed-off-by: Jarno Rajahalme <jrajaha...@nicira.com> --- datapath/actions.c | 4 + datapath/datapath.c | 558 +++++++++++++++++++++++++++++++++++++++++++++++ datapath/datapath.h | 32 +++ datapath/flow_netlink.c | 5 + 4 files changed, 599 insertions(+) diff --git a/datapath/actions.c b/datapath/actions.c index 30ea1d2..7795a62 100644 --- a/datapath/actions.c +++ b/datapath/actions.c @@ -528,6 +528,10 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, } switch (nla_type(a)) { + case OVS_ACTION_ATTR_METER: + ovs_execute_meter_action(dp, skb, nla_get_u32(a)); + break; + case OVS_ACTION_ATTR_OUTPUT: prev_port = nla_get_u32(a); break; diff --git a/datapath/datapath.c b/datapath/datapath.c index e4aa672..e86c044 100644 --- a/datapath/datapath.c +++ b/datapath/datapath.c @@ -159,10 +159,13 @@ static int get_dpifindex(struct datapath *dp) return ifindex; } +static void destroy_meters(struct datapath *dp); + static void destroy_dp_rcu(struct rcu_head *rcu) { struct datapath *dp = container_of(rcu, struct datapath, rcu); + destroy_meters(dp); ovs_flow_tbl_destroy(&dp->table); free_percpu(dp->stats_percpu); release_net(ovs_dp_get_net(dp)); @@ -1199,6 +1202,9 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) INIT_HLIST_HEAD(&dp->ports[i]); + for (i = 0; i < DP_N_METER_LOCKS; i++) + spin_lock_init(&dp->meter_locks[i]); + /* Set up our datapath device. */ parms.name = nla_data(a[OVS_DP_ATTR_NAME]); parms.type = OVS_VPORT_TYPE_INTERNAL; @@ -1773,6 +1779,555 @@ static struct genl_ops dp_vport_genl_ops[] = { }, }; +bool +ovs_execute_meter_action(struct datapath *dp, struct sk_buff *skb, + u32 meter_id) +{ + struct dp_meter *meter; + struct dp_meter_band *band; + long long int now = jiffies_to_msecs(jiffies); + long long int long_delta_t; /* msec */ + u32 delta_t; /* msec */ + u32 volume; + int i, band_exceeded_max = -1; + u32 band_exceeded_rate = 0; + spinlock_t *lock = METER_LOCK(dp, meter_id); + + if (meter_id >= DP_MAX_METERS) + return false; + + /* Lock the meter so that it will not be swapped, deleted, or modified + * while we work with it. */ + spin_lock(lock); + meter = dp->meters[meter_id]; + if (!meter) + goto out; + + long_delta_t = (now - meter->used); /* msec */ + + /* Make sure delta_t will not be too large, so that bucket will not + * wrap around below. */ + delta_t = (long_delta_t > (long long int)meter->max_delta_t) + ? meter->max_delta_t : (u32)long_delta_t; + + /* Update meter statistics. */ + meter->used = now; + meter->stats.n_packets += 1; + meter->stats.n_bytes += skb->len; + + /* Bucket rate is either in kilobits per second, or in packets per + * second. We maintain the bucket in the units of either bits or + * 1/1000th of a packet, correspondingly. + * Then, when rate is multiplied with milliseconds, we get the + * bucket units: + * msec * kbps = bits, and + * msec * packets/sec = 1/1000 packets. + * + * 'volume' is the number of bucket units in this packet. */ + volume = (meter->kbps) ? skb->len * 8 : 1000; + + /* Update all bands and find the one hit with the highest rate. */ + for (i = 0; i < meter->n_bands; ++i) { + band = &meter->bands[i]; + + band->bucket += delta_t * band->rate; + if (band->bucket > band->burst_size) + band->bucket = band->burst_size; + + if (band->bucket >= volume) + band->bucket -= volume; + else if (band->rate > band_exceeded_rate) { + band_exceeded_rate = band->rate; + band_exceeded_max = i; + } + } + + if (band_exceeded_max < 0) { + goto out; /* No band exceeded, do nothing. */ + } + + /* Update band statistics. */ + band = &meter->bands[band_exceeded_max]; + band->stats.n_packets += 1; + band->stats.n_bytes += skb->len; + + if (band->type == OVS_METER_BAND_TYPE_DROP) { + spin_unlock(lock); + return true; /* drop */ + } + out: + spin_unlock(lock); + return false; +} + +static const struct nla_policy meter_policy[OVS_METER_ATTR_MAX + 1] = { + [OVS_METER_ATTR_ID] = { .type = NLA_U32, }, + [OVS_METER_ATTR_KBPS] = { .type = NLA_FLAG }, + [OVS_METER_ATTR_STATS] = { .len = sizeof(struct ovs_flow_stats) }, + [OVS_METER_ATTR_BANDS] = { .type = NLA_NESTED }, + [OVS_METER_ATTR_USED] = { .type = NLA_U64 }, + [OVS_METER_ATTR_CLEAR] = { .type = NLA_FLAG }, + [OVS_METER_ATTR_MAX_METERS] = { .type = NLA_U32 }, + [OVS_METER_ATTR_MAX_BANDS] = { .type = NLA_U32 }, +}; + +static const struct nla_policy band_policy[OVS_BAND_ATTR_MAX + 1] = { + [OVS_BAND_ATTR_TYPE] = { .type = NLA_U32, }, + [OVS_BAND_ATTR_RATE] = { .type = NLA_U32, }, + [OVS_BAND_ATTR_BURST] = { .type = NLA_U32, }, + [OVS_BAND_ATTR_STATS] = { .len = sizeof(struct ovs_flow_stats) }, +}; + +static struct genl_family dp_meter_genl_family = { + .id = GENL_ID_GENERATE, + .hdrsize = sizeof(struct ovs_header), + .name = OVS_METER_FAMILY, + .version = OVS_METER_VERSION, + .maxattr = OVS_METER_ATTR_MAX, + .netnsok = true, + SET_PARALLEL_OPS +}; + +struct sk_buff *ovs_meter_cmd_reply_start(struct genl_info *info, u8 cmd, + struct ovs_header **ovs_reply_header) +{ + struct sk_buff *skb; + struct ovs_header *ovs_header = info->userhdr; + + skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); + if (!skb) + return ERR_PTR(-ENOMEM); + + *ovs_reply_header = genlmsg_put(skb, info->snd_portid, info->snd_seq, + &dp_meter_genl_family, 0, cmd); + if (!ovs_reply_header) { + nlmsg_free(skb); + return ERR_PTR(-EMSGSIZE); + } + (*ovs_reply_header)->dp_ifindex = ovs_header->dp_ifindex; + + return skb; +} + +static int ovs_meter_cmd_reply_stats(struct sk_buff *reply, + u32 meter_id, + struct dp_meter *meter) +{ + struct nlattr *nla; + struct dp_meter_band *band; + u16 i; + + if (nla_put_u32(reply, OVS_METER_ATTR_ID, meter_id)) + goto error; + + if (!meter) + return 0; + + if (nla_put(reply, OVS_METER_ATTR_STATS, sizeof(struct ovs_flow_stats), + &meter->stats) || + nla_put_u64(reply, OVS_METER_ATTR_USED, meter->used)) + goto error; + + nla = nla_nest_start(reply, OVS_METER_ATTR_BANDS); + if (!nla) + goto error; + + band = meter->bands; + + for (i = 0; i < meter->n_bands; ++i, ++band) { + struct nlattr *band_nla; + + band_nla = nla_nest_start(reply, OVS_BAND_ATTR_UNSPEC); + if (!band_nla || nla_put(reply, OVS_BAND_ATTR_STATS, + sizeof(struct ovs_flow_stats), + &band->stats)) + goto error; + nla_nest_end(reply, band_nla); + } + nla_nest_end(reply, nla); + + return 0; +error: + return -EMSGSIZE; +} + +static int ovs_meter_cmd_features(struct sk_buff *skb, struct genl_info *info) +{ + struct datapath *dp; + struct ovs_header *ovs_header = info->userhdr; + struct sk_buff *reply; + struct ovs_header *ovs_reply_header; + struct nlattr *nla, *band_nla; + int err; + + /* Check that the datapath exists */ + ovs_lock(); + dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); + ovs_unlock(); + if (!dp) + return -ENODEV; + + reply = ovs_meter_cmd_reply_start(info, OVS_METER_CMD_FEATURES, + &ovs_reply_header); + if (!reply) + return PTR_ERR(reply); + + if (nla_put_u32(reply, OVS_METER_ATTR_MAX_METERS, DP_MAX_METERS) || + nla_put_u32(reply, OVS_METER_ATTR_MAX_BANDS, DP_MAX_BANDS)) + goto nla_put_failure; + + nla = nla_nest_start(reply, OVS_METER_ATTR_BANDS); + if (!nla) + goto nla_put_failure; + + band_nla = nla_nest_start(reply, OVS_BAND_ATTR_UNSPEC); + if (!band_nla) + goto nla_put_failure; + /* Currently only DROP band type is supported. */ + if (nla_put_u32(reply, OVS_BAND_ATTR_TYPE, OVS_METER_BAND_TYPE_DROP)) + goto nla_put_failure; + nla_nest_end(reply, band_nla); + nla_nest_end(reply, nla); + + genlmsg_end(reply, ovs_reply_header); + return genlmsg_reply(reply, info); + +nla_put_failure: + nlmsg_free(reply); + err = -EMSGSIZE; + return err; +} + +static void destroy_meters(struct datapath *dp) +{ + int i; + + for (i = 0; i < DP_MAX_METERS; i++) + if (dp->meters[i]) { + kfree(dp->meters[i]); + dp->meters[i] = NULL; + } +} + +/* Must be called with ovs_lock() held. */ +static struct dp_meter * dp_meter_swap(struct datapath *dp, u32 meter_id, + struct dp_meter *meter) +{ + struct dp_meter *old_meter; + spinlock_t *lock = METER_LOCK(dp, meter_id); + + /* Spin lock keeps us from swapping a meter while it is being + * used. */ + spin_lock_bh(lock); + old_meter = dp->meters[meter_id]; + + /* Keep meter counts if requested. */ + if (meter && old_meter && meter->keep_stats) { + meter->used = old_meter->used; + meter->stats.n_packets += old_meter->stats.n_packets; + meter->stats.n_bytes += old_meter->stats.n_bytes; + } + + dp->meters[meter_id] = meter; + spin_unlock_bh(lock); + + return old_meter; +} + +/* Must be called with ovs_lock() held. + * Returns the old_meter, or NULL. */ +static struct dp_meter * dp_meter_del(struct datapath *dp, u32 meter_id) +{ + /* Keep free meter index as low as possible */ + if (meter_id < dp->meter_free) + dp->meter_free = meter_id; + + return dp_meter_swap(dp, meter_id, NULL); +} + +/* Must be called with ovs_lock() held. + * Returns the old_meter, or NULL. */ +static struct dp_meter * dp_meter_set(struct datapath *dp, u32 meter_id, + struct dp_meter *meter) +{ + if (meter_id == dp->meter_free) { /* dp->meter_free now taken. */ + u32 next_free = meter_id; + do { + if (++next_free >= DP_MAX_METERS) { /* Wrap around. */ + next_free = 0; + } + if (next_free == dp->meter_free) { /* Full circle. */ + next_free = DP_MAX_METERS; + break; + } + } while (dp->meters[next_free]); + dp->meter_free = next_free; /* May be DP_MAX_METERS. */ + } + + return dp_meter_swap(dp, meter_id, meter); +} + +static struct dp_meter * dp_meter_create(struct nlattr **a) +{ + struct nlattr *nla; + int rem; + u16 n_bands = 0; + struct dp_meter *meter; + struct dp_meter_band *band; + int err; + + /* Validate attributes, count the bands. */ + if (!a[OVS_METER_ATTR_BANDS]) + return ERR_PTR(-EINVAL); + + nla_for_each_nested(nla, a[OVS_METER_ATTR_BANDS], rem) + if (++n_bands > DP_MAX_BANDS) + return ERR_PTR(-EINVAL); + + /* Allocate and set up the meter before locking anything. */ + meter = kzalloc(sizeof *meter + + n_bands * sizeof(struct dp_meter_band), GFP_KERNEL); + if (!meter) + return ERR_PTR(-ENOMEM); + + meter->used = jiffies_to_msecs(jiffies); + meter->kbps = a[OVS_METER_ATTR_KBPS] != NULL; + meter->keep_stats = a[OVS_METER_ATTR_CLEAR] == NULL; + if (meter->keep_stats && a[OVS_METER_ATTR_STATS]) { + meter->stats = *(struct ovs_flow_stats *) + nla_data(a[OVS_METER_ATTR_STATS]); + } + meter->n_bands = n_bands; + + /* Set up meter bands. */ + band = meter->bands; + nla_for_each_nested(nla, a[OVS_METER_ATTR_BANDS], rem) { + struct nlattr *attr[OVS_BAND_ATTR_MAX + 1]; + u32 band_max_delta_t; + + err = nla_parse((struct nlattr **)&attr, OVS_BAND_ATTR_MAX, + nla_data(nla), nla_len(nla), band_policy); + if (err) + goto exit_free_meter; + + if (!attr[OVS_BAND_ATTR_TYPE] || + !attr[OVS_BAND_ATTR_RATE] || + !attr[OVS_BAND_ATTR_BURST] || + nla_get_u32(attr[OVS_BAND_ATTR_TYPE]) + != OVS_METER_BAND_TYPE_DROP) { + err = -EINVAL; + goto exit_free_meter; + } + + band->type = nla_get_u32(attr[OVS_BAND_ATTR_TYPE]); + band->rate = nla_get_u32(attr[OVS_BAND_ATTR_RATE]); + /* Convert burst size to the bucket units: + * pkts => 1/1000 packets, kilobits => bits. */ + band->burst_size + = nla_get_u32(attr[OVS_BAND_ATTR_BURST]) * 1000; + /* Start with an empty bucket. */ + band->bucket = 0; + /* Figure out max delta_t that is enough to fill any bucket. */ + band_max_delta_t = band->burst_size / band->rate; + if (band_max_delta_t > meter->max_delta_t) + meter->max_delta_t = band_max_delta_t; + band++; + } + + return meter; + +exit_free_meter: + kfree(meter); + return ERR_PTR(err); +} + +static int ovs_meter_cmd_set(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr **a = info->attrs; + struct dp_meter *meter, *old_meter; + struct sk_buff *reply; + struct ovs_header *ovs_reply_header; + struct ovs_header *ovs_header = info->userhdr; + struct datapath *dp; + int err; + u32 meter_id; + bool failed; + + meter = dp_meter_create(a); + if (IS_ERR_OR_NULL(meter)) + return PTR_ERR(meter); + + reply = ovs_meter_cmd_reply_start(info, OVS_METER_CMD_SET, + &ovs_reply_header); + if (IS_ERR(reply)) { + err = PTR_ERR(reply); + goto exit_free_meter; + } + + ovs_lock(); + dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); + if (!dp) { + err = -ENODEV; + goto exit_unlock; + } + + meter_id = a[OVS_METER_ATTR_ID] ? nla_get_u32(a[OVS_METER_ATTR_ID]) : + dp->meter_free; + + if (meter_id >= DP_MAX_METERS) { + err = -EFBIG; + goto exit_unlock; + } + + /* Cannot fail after this. */ + old_meter = dp_meter_set(dp, meter_id, meter); + ovs_unlock(); + + /* Build response with the meter_id and stats from the old meter, + if any. */ + failed = nla_put_u32(reply, OVS_METER_ATTR_ID, meter_id); + BUG_ON(failed); + err = ovs_meter_cmd_reply_stats(reply, meter_id, old_meter); + if (old_meter) + kfree(old_meter); + BUG_ON(err); /* XXX: Make sure we have reserved enough space! */ + + genlmsg_end(reply, ovs_reply_header); + return genlmsg_reply(reply, info); + +exit_unlock: + ovs_unlock(); + nlmsg_free(reply); +exit_free_meter: + kfree(meter); + return err; +} + +static int ovs_meter_cmd_get(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr **a = info->attrs; + u32 meter_id; + spinlock_t *lock; + struct ovs_header *ovs_header = info->userhdr; + struct ovs_header *ovs_reply_header; + struct datapath *dp; + int err; + struct sk_buff *reply; + struct dp_meter *meter; + + if (!a[OVS_METER_ATTR_ID]) + return -EINVAL; + + meter_id = nla_get_u32(a[OVS_METER_ATTR_ID]); + if (meter_id >= DP_MAX_METERS) + return -EFBIG; + + reply = ovs_meter_cmd_reply_start(info, OVS_METER_CMD_GET, + &ovs_reply_header); + if (IS_ERR(reply)) + return PTR_ERR(reply); + + ovs_lock(); + dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); + if (!dp) { + err = -ENODEV; + goto exit_unlock; + } + + /* Locate meter, copy stats. */ + meter = dp->meters[meter_id]; + if (!meter) { + err = -ENOENT; + goto exit_unlock; + } + + lock = METER_LOCK(dp, meter_id); + spin_lock_bh(lock); + err = ovs_meter_cmd_reply_stats(reply, meter_id, meter); + spin_unlock_bh(lock); + if (err) + goto exit_unlock; + + ovs_unlock(); + + genlmsg_end(reply, ovs_reply_header); + return genlmsg_reply(reply, info); + +exit_unlock: + ovs_unlock(); + nlmsg_free(reply); + return err; +} + +static int ovs_meter_cmd_del(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr **a = info->attrs; + u32 meter_id; + struct ovs_header *ovs_header = info->userhdr; + struct ovs_header *ovs_reply_header; + struct datapath *dp; + int err; + struct sk_buff *reply; + struct dp_meter *meter; + + if (!a[OVS_METER_ATTR_ID]) + return -EINVAL; + meter_id = nla_get_u32(a[OVS_METER_ATTR_ID]); + if (meter_id >= DP_MAX_METERS) + return -EFBIG; + + reply = ovs_meter_cmd_reply_start(info, OVS_METER_CMD_DEL, + &ovs_reply_header); + if (IS_ERR(reply)) + return PTR_ERR(reply); + + ovs_lock(); + + dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); + if (!dp) { + err = -ENODEV; + goto exit_unlock; + } + meter = dp_meter_del(dp, meter_id); + err = ovs_meter_cmd_reply_stats(reply, meter_id, meter); + BUG_ON(err); + + if (meter) + kfree(meter); + + ovs_unlock(); + genlmsg_end(reply, ovs_reply_header); + return genlmsg_reply(reply, info); + +exit_unlock: + ovs_unlock(); + nlmsg_free(reply); + return err; +} + +static struct genl_ops dp_meter_genl_ops[] = { + { .cmd = OVS_METER_CMD_FEATURES, + .flags = 0, /* OK for unprivileged users. */ + .policy = meter_policy, + .doit = ovs_meter_cmd_features + }, + { .cmd = OVS_METER_CMD_SET, + .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .policy = meter_policy, + .doit = ovs_meter_cmd_set, + }, + { .cmd = OVS_METER_CMD_GET, + .flags = 0, /* OK for unprivileged users. */ + .policy = meter_policy, + .doit = ovs_meter_cmd_get, + }, + { .cmd = OVS_METER_CMD_DEL, + .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .policy = meter_policy, + .doit = ovs_meter_cmd_del + }, +}; + struct genl_family_and_ops { struct genl_family *family; struct genl_ops *ops; @@ -1793,6 +2348,9 @@ static const struct genl_family_and_ops dp_genl_families[] = { { &dp_packet_genl_family, dp_packet_genl_ops, ARRAY_SIZE(dp_packet_genl_ops), NULL }, + { &dp_meter_genl_family, + dp_meter_genl_ops, ARRAY_SIZE(dp_meter_genl_ops), + NULL }, }; static void dp_unregister_genl(int n_families) diff --git a/datapath/datapath.h b/datapath/datapath.h index 879a830..8055cc8 100644 --- a/datapath/datapath.h +++ b/datapath/datapath.h @@ -37,6 +37,30 @@ #define SAMPLE_ACTION_DEPTH 3 +#define DP_MAX_METERS 65536 +#define DP_MAX_BANDS 1 +#define DP_N_METER_LOCKS 17 + +#define METER_LOCK(DP, METER_ID) \ + &(DP)->meter_locks[(METER_ID) % DP_N_METER_LOCKS] + +struct dp_meter_band { + u32 type; + u32 rate; + u32 burst_size; + u32 bucket; /* 1/1000 packets, or in bits */ + struct ovs_flow_stats stats; +}; + +struct dp_meter { + u16 kbps:1, keep_stats:1; + u16 n_bands; + u32 max_delta_t; + u64 used; + struct ovs_flow_stats stats; + struct dp_meter_band bands[]; +}; + /** * struct dp_stats_percpu - per-cpu packet processing statistics for a given * datapath. @@ -90,6 +114,11 @@ struct datapath { /* Network namespace ref. */ struct net *net; #endif + + /* Meters. */ + spinlock_t meter_locks[DP_N_METER_LOCKS]; + struct dp_meter *meters[DP_MAX_METERS]; /* Meter bands. */ + uint32_t meter_free; /* Next free meter. */ }; /** @@ -197,4 +226,7 @@ void ovs_dp_notify_wq(struct work_struct *work); #define OVS_NLERR(fmt, ...) \ pr_info_once("netlink: " fmt, ##__VA_ARGS__) +bool ovs_execute_meter_action(struct datapath *dp, struct sk_buff *skb, + uint32_t meter_id); + #endif /* datapath.h */ diff --git a/datapath/flow_netlink.c b/datapath/flow_netlink.c index 75c72b3..5276b7e 100644 --- a/datapath/flow_netlink.c +++ b/datapath/flow_netlink.c @@ -1462,6 +1462,7 @@ int ovs_nla_copy_actions(const struct nlattr *attr, nla_for_each_nested(a, attr, rem) { /* Expected argument lengths, (u32)-1 for variable length. */ static const u32 action_lens[OVS_ACTION_ATTR_MAX + 1] = { + [OVS_ACTION_ATTR_METER] = sizeof(u32), [OVS_ACTION_ATTR_OUTPUT] = sizeof(u32), [OVS_ACTION_ATTR_USERSPACE] = (u32)-1, [OVS_ACTION_ATTR_PUSH_VLAN] = sizeof(struct ovs_action_push_vlan), @@ -1483,6 +1484,10 @@ int ovs_nla_copy_actions(const struct nlattr *attr, case OVS_ACTION_ATTR_UNSPEC: return -EINVAL; + case OVS_ACTION_ATTR_METER: + /* Non-existent meters are simply ignored. */ + break; + case OVS_ACTION_ATTR_USERSPACE: err = validate_userspace(a); if (err) -- 1.7.10.4 _______________________________________________ dev mailing list dev@openvswitch.org http://openvswitch.org/mailman/listinfo/dev