This patch adds the 'back_to_kernel' datapath action, which has one of two effects:
- For packets originating from the kernel, sets a flag allowing the kernel hook to return it to the kernel. - For packets that came from userspace, re-inserts the packet into the kernel input queue. Signed-off-by: Chris Luke <[email protected]> --- datapath/actions.c | 40 ++++++++++++++++++ datapath/datapath.c | 23 +++++++++-- datapath/datapath.h | 9 +++- datapath/flow_netlink.c | 6 ++- datapath/vport-netdev.c | 96 +++++++++++++++++++++++++++++++++++++------ datapath/vport.c | 26 +++++++++++- datapath/vport.h | 4 +- include/linux/openvswitch.h | 1 + lib/dpif-netdev.c | 1 + lib/dpif.c | 1 + lib/odp-execute.c | 1 + lib/odp-util.c | 12 ++++++ tests/odp.at | 1 + 13 files changed, 201 insertions(+), 20 deletions(-) diff --git a/datapath/actions.c b/datapath/actions.c index 30ea1d2..2476af9 100644 --- a/datapath/actions.c +++ b/datapath/actions.c @@ -405,6 +405,34 @@ static int do_output(struct datapath *dp, struct sk_buff *skb, int out_port) return 0; } +static int do_insert(struct datapath *dp, struct sk_buff *skb) +{ + struct vport *vport; + int error = 0; + + if (unlikely(!skb)) + return -ENOMEM; + + if (likely(OVS_CB(skb)->pkt_from_kernel)) { + /* Since we got this packet from the kernel + * we can simply set a flag to return it to + * the kernel. */ + OVS_CB(skb)->return_pkt_to_kernel = true; + } else { + /* We got this packet from userspace so we + * need to insert this into the network input + * queue. 
*/ + vport = ovs_vport_rcu(dp, OVS_CB(skb)->flow->key.phy.in_port); + if (unlikely(!vport)) { + kfree_skb(skb); + return -ENODEV; + } + + error = ovs_vport_insert(vport, skb_clone(skb, GFP_ATOMIC)); + } + return error; +} + static int output_userspace(struct datapath *dp, struct sk_buff *skb, const struct nlattr *attr) { @@ -552,6 +580,18 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, case OVS_ACTION_ATTR_SAMPLE: err = sample(dp, skb, a); + /* If a sampled output action has us send packets + * back to the kernel, we need to keep the skb around. */ + if (OVS_CB(skb)->return_pkt_to_kernel) + keep_skb = true; + break; + + case OVS_ACTION_ATTR_BACK_TO_KERNEL: + do_insert(dp, skb); + /* If we need to return the packet to + * the kernel, keep the skb for it. */ + if (OVS_CB(skb)->return_pkt_to_kernel) + keep_skb = true; break; } diff --git a/datapath/datapath.c b/datapath/datapath.c index b42fd8b..6649045 100644 --- a/datapath/datapath.c +++ b/datapath/datapath.c @@ -215,7 +215,7 @@ void ovs_dp_detach_port(struct vport *p) } /* Must be called with rcu_read_lock. */ -void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb) +int ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb) { struct datapath *dp = p->dp; struct sw_flow *flow; @@ -231,7 +231,7 @@ void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb) error = ovs_flow_extract(skb, p->port_no, &key); if (unlikely(error)) { kfree_skb(skb); - return; + return error; } /* Look up flow. 
*/ @@ -253,7 +253,7 @@ void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb) OVS_CB(skb)->pkt_key = &key; ovs_flow_stats_update(OVS_CB(skb)->flow, skb); - ovs_execute_actions(dp, skb); + error = ovs_execute_actions(dp, skb); stats_counter = &stats->n_hit; out: @@ -262,6 +262,8 @@ out: (*stats_counter)++; stats->n_mask_hit += n_mask_hit; u64_stats_update_end(&stats->sync); + + return error; } static struct genl_family dp_packet_genl_family = { @@ -544,6 +546,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) OVS_CB(packet)->pkt_key = &flow->key; packet->priority = flow->key.phy.priority; packet->mark = flow->key.phy.skb_mark; + OVS_CB(packet)->pkt_from_kernel = false; rcu_read_lock(); dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); @@ -551,6 +554,20 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) if (!dp) goto err_unlock; + /* Work out the dev of the original input port */ + if (flow->key.phy.in_port != DP_MAX_PORTS && + flow->key.phy.in_port != OVSP_LOCAL) { + struct vport *vport; + vport = ovs_vport_rcu(dp, flow->key.phy.in_port); + if (vport != NULL) + packet->dev = dev_get_by_name(sock_net(skb->sk), + vport->ops->get_name(vport)); + if (packet->dev != NULL) { + packet->skb_iif = packet->dev->ifindex; + packet->protocol = eth_type_trans(packet, packet->dev); + } + } + local_bh_disable(); err = ovs_execute_actions(dp, packet); local_bh_enable(); diff --git a/datapath/datapath.h b/datapath/datapath.h index b3ae7cd..c012d2a 100644 --- a/datapath/datapath.h +++ b/datapath/datapath.h @@ -100,11 +100,18 @@ struct datapath { * @pkt_key: The flow information extracted from the packet. Must be nonnull. * @tun_key: Key for the tunnel that encapsulated this packet. NULL if the * packet is not being tunneled. + * @pkt_from_kernel: True if this packet was handed to us by the kernel, false + * if it came from userspace. 
+ * @return_pkt_to_kernel: True if the action of a matching flow wants us to + * give this packet back to the kernel. Only relevant if it came from the + * kernel. */ struct ovs_skb_cb { struct sw_flow *flow; struct sw_flow_key *pkt_key; struct ovs_key_ipv4_tunnel *tun_key; + bool pkt_from_kernel; + bool return_pkt_to_kernel; }; #define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb) @@ -186,7 +193,7 @@ static inline struct vport *ovs_vport_ovsl(const struct datapath *dp, int port_n extern struct notifier_block ovs_dp_device_notifier; extern struct genl_multicast_group ovs_dp_vport_multicast_group; -void ovs_dp_process_received_packet(struct vport *, struct sk_buff *); +int ovs_dp_process_received_packet(struct vport *, struct sk_buff *); void ovs_dp_detach_port(struct vport *); int ovs_dp_upcall(struct datapath *, struct sk_buff *, const struct dp_upcall_info *); diff --git a/datapath/flow_netlink.c b/datapath/flow_netlink.c index 9b26528..7d3519a 100644 --- a/datapath/flow_netlink.c +++ b/datapath/flow_netlink.c @@ -1515,7 +1515,8 @@ int ovs_nla_copy_actions(const struct nlattr *attr, [OVS_ACTION_ATTR_PUSH_VLAN] = sizeof(struct ovs_action_push_vlan), [OVS_ACTION_ATTR_POP_VLAN] = 0, [OVS_ACTION_ATTR_SET] = (u32)-1, - [OVS_ACTION_ATTR_SAMPLE] = (u32)-1 + [OVS_ACTION_ATTR_SAMPLE] = (u32)-1, + [OVS_ACTION_ATTR_BACK_TO_KERNEL] = 0, }; const struct ovs_action_push_vlan *vlan; int type = nla_type(a); @@ -1567,6 +1568,9 @@ int ovs_nla_copy_actions(const struct nlattr *attr, skip_copy = true; break; + case OVS_ACTION_ATTR_BACK_TO_KERNEL: + break; + default: return -EINVAL; } diff --git a/datapath/vport-netdev.c b/datapath/vport-netdev.c index c15923b..196c851 100644 --- a/datapath/vport-netdev.c +++ b/datapath/vport-netdev.c @@ -27,6 +27,10 @@ #include <linux/skbuff.h> #include <linux/openvswitch.h> +#ifdef CONFIG_NET_CLS_ACT +#include <uapi/linux/pkt_cls.h> +#endif + #include <net/llc.h> #include "datapath.h" @@ -34,7 +38,7 @@ #include "vport-internal_dev.h" #include 
"vport-netdev.h" -static void netdev_port_receive(struct vport *vport, struct sk_buff *skb); +static bool netdev_port_receive(struct vport *vport, struct sk_buff *skb); #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39) /* Called with rcu_read_lock and bottom-halves disabled. */ @@ -48,7 +52,10 @@ static rx_handler_result_t netdev_frame_hook(struct sk_buff **pskb) vport = ovs_netdev_get_vport(skb->dev); - netdev_port_receive(vport, skb); + if (netdev_port_receive(vport, skb)) { + /* Tell the kernel we didn't want it. */ + return RX_HANDLER_PASS; + } return RX_HANDLER_CONSUMED; } @@ -64,7 +71,10 @@ static struct sk_buff *netdev_frame_hook(struct sk_buff *skb) vport = ovs_netdev_get_vport(skb->dev); - netdev_port_receive(vport, skb); + if (netdev_port_receive(vport, skb)) { + /* Tell the kernel we didn't want it. */ + return skb; + } return NULL; } @@ -189,31 +199,51 @@ const char *ovs_netdev_get_name(const struct vport *vport) return netdev_vport->dev->name; } -/* Must be called with rcu_read_lock. */ -static void netdev_port_receive(struct vport *vport, struct sk_buff *skb) +/* Must be called with rcu_read_lock. + * Returns true if we want the hook to return the packet to the kernel. + */ +static bool netdev_port_receive(struct vport *vport, struct sk_buff *skb) { + int error; + if (unlikely(!vport)) goto error; if (unlikely(skb_warn_if_lro(skb))) goto error; - /* Make our own copy of the packet. Otherwise we will mangle the - * packet for anyone who came before us (e.g. tcpdump via AF_PACKET). - * (No one comes after us, since we tell handle_bridge() that we took - * the packet.) */ + /* Make a clone of the skb if someone else has a reference to it so + * nothing we do with it interferes with anyone higher up the chain. 
+ */ skb = skb_share_check(skb, GFP_ATOMIC); if (unlikely(!skb)) - return; + return false; skb_push(skb, ETH_HLEN); ovs_skb_postpush_rcsum(skb, skb->data, ETH_HLEN); - ovs_vport_receive(vport, skb, NULL); - return; + OVS_CB(skb)->pkt_from_kernel = true; + OVS_CB(skb)->return_pkt_to_kernel = false; + + error = ovs_vport_receive(vport, skb, NULL); + if (unlikely(error)) { + /* If we encountered an error, then the skb + * has been freed and we cannot look at it + * safely anymore. */ + return false; + } + + if (OVS_CB(skb)->return_pkt_to_kernel) { + /* Clean up the skb */ + skb->protocol = eth_type_trans(skb, skb->dev); + return true; + } + + return false; error: kfree_skb(skb); + return false; } static unsigned int packet_length(const struct sk_buff *skb) @@ -250,6 +280,47 @@ drop: return 0; } +/* Must be called with rcu_read_lock. */ +static int netdev_insert(struct sk_buff *skb) +{ + int len, ret; + + if (unlikely(skb == NULL || skb->dev == NULL)) + return -EINVAL; + + /* The skb will have gone when we want this later. */ + len = skb->len; + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,36) +#ifdef CONFIG_NET_CLS_ACT + /* This flag will skip the top half of the + * code in __netif_receive_skb(). */ + skb->tc_verd = TC_NCLS; +#else +#warning Without kernel option CONFIG_NET_CLS_ACT some 'back_to_kernel' \ + packets may deliver twice in AF_PACKET 'ALL' listeners, \ + such as 'tcpdump' or 'lldpd'. +#endif /* CONFIG_NET_CLS_ACT */ + + /* Send it! */ + ret = netif_rx_ni(skb); + +#else + /* Unsupported kernel version. */ + kfree_skb(skb); + return -EINVAL; + +#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,36) */ + + if (likely(ret == NET_RX_SUCCESS)) { + return len; + } else if(ret == NET_RX_DROP) { + return 0; + } + + return -EINVAL; +} + /* Returns null if this device is not attached to a datapath. 
*/ struct vport *ovs_netdev_get_vport(struct net_device *dev) { @@ -278,6 +349,7 @@ const struct vport_ops ovs_netdev_vport_ops = { .destroy = netdev_destroy, .get_name = ovs_netdev_get_name, .send = netdev_send, + .insert = netdev_insert, }; #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,36) && \ diff --git a/datapath/vport.c b/datapath/vport.c index 7f12acc..f5dfcd6 100644 --- a/datapath/vport.c +++ b/datapath/vport.c @@ -358,8 +358,9 @@ int ovs_vport_get_options(const struct vport *vport, struct sk_buff *skb) * Must be called with rcu_read_lock. The packet cannot be shared and * skb->data should point to the Ethernet header. The caller must have already * called compute_ip_summed() to initialize the checksumming fields. + * Returns 0 on success, or -errno otherwise. */ -void ovs_vport_receive(struct vport *vport, struct sk_buff *skb, +int ovs_vport_receive(struct vport *vport, struct sk_buff *skb, struct ovs_key_ipv4_tunnel *tun_key) { struct pcpu_tstats *stats; @@ -371,7 +372,7 @@ void ovs_vport_receive(struct vport *vport, struct sk_buff *skb, u64_stats_update_end(&stats->syncp); OVS_CB(skb)->tun_key = tun_key; - ovs_dp_process_received_packet(vport, skb); + return ovs_dp_process_received_packet(vport, skb); } /** @@ -406,6 +407,27 @@ int ovs_vport_send(struct vport *vport, struct sk_buff *skb) } /** + * ovs_vport_insert - send a packet to the input queue + * + * @vport: vport from which to insert the packet + * @skb: skb to send + * + * Sends the given packet and returns the number of bytes data sent. + * Returns 0 if the packet was dropped and -errno for any errors. + * rcu_read_lock must be held. 
+ */ +int ovs_vport_insert(struct vport *vport, struct sk_buff *skb) +{ + if (unlikely(vport->ops->insert == NULL)) + return -EINVAL; + + if (unlikely(skb->dev == NULL)) + return -ENODEV; + + return vport->ops->insert(skb); +} + +/** * ovs_vport_record_error - indicate device error to generic stats layer * * @vport: vport that encountered the error diff --git a/datapath/vport.h b/datapath/vport.h index 2cf2b18..34303dd 100644 --- a/datapath/vport.h +++ b/datapath/vport.h @@ -51,6 +51,7 @@ int ovs_vport_set_options(struct vport *, struct nlattr *options); int ovs_vport_get_options(const struct vport *, struct sk_buff *); int ovs_vport_send(struct vport *, struct sk_buff *); +int ovs_vport_insert(struct vport *, struct sk_buff *); /* The following definitions are for implementers of vport devices: */ @@ -146,6 +147,7 @@ struct vport_ops { const char *(*get_name)(const struct vport *); int (*send)(struct vport *, struct sk_buff *); + int (*insert)(struct sk_buff *); }; enum vport_err_type { @@ -191,7 +193,7 @@ static inline struct vport *vport_from_priv(const void *priv) return (struct vport *)(priv - ALIGN(sizeof(struct vport), VPORT_ALIGN)); } -void ovs_vport_receive(struct vport *, struct sk_buff *, +int ovs_vport_receive(struct vport *, struct sk_buff *, struct ovs_key_ipv4_tunnel *); /* List of statically compiled vport implementations. Don't forget to also diff --git a/include/linux/openvswitch.h b/include/linux/openvswitch.h index 5137c2f..5530f91 100644 --- a/include/linux/openvswitch.h +++ b/include/linux/openvswitch.h @@ -569,6 +569,7 @@ enum ovs_action_attr { OVS_ACTION_ATTR_SAMPLE, /* Nested OVS_SAMPLE_ATTR_*. */ OVS_ACTION_ATTR_PUSH_MPLS, /* struct ovs_action_push_mpls. */ OVS_ACTION_ATTR_POP_MPLS, /* __be16 ethertype. */ + OVS_ACTION_ATTR_BACK_TO_KERNEL, /* No argument. 
*/ __OVS_ACTION_ATTR_MAX }; diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index cb64bdc..d9ad2b6 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -1804,6 +1804,7 @@ dp_execute_cb(void *aux_, struct ofpbuf *packet, case OVS_ACTION_ATTR_POP_MPLS: case OVS_ACTION_ATTR_SET: case OVS_ACTION_ATTR_SAMPLE: + case OVS_ACTION_ATTR_BACK_TO_KERNEL: case OVS_ACTION_ATTR_UNSPEC: case __OVS_ACTION_ATTR_MAX: OVS_NOT_REACHED(); diff --git a/lib/dpif.c b/lib/dpif.c index 2b79a6e..7b3b97c 100644 --- a/lib/dpif.c +++ b/lib/dpif.c @@ -1078,6 +1078,7 @@ dpif_execute_helper_cb(void *aux_, struct ofpbuf *packet, switch ((enum ovs_action_attr)type) { case OVS_ACTION_ATTR_OUTPUT: case OVS_ACTION_ATTR_USERSPACE: + case OVS_ACTION_ATTR_BACK_TO_KERNEL: execute.actions = action; execute.actions_len = NLA_ALIGN(action->nla_len); execute.packet = packet; diff --git a/lib/odp-execute.c b/lib/odp-execute.c index 5b77fa9..c98cfb4 100644 --- a/lib/odp-execute.c +++ b/lib/odp-execute.c @@ -195,6 +195,7 @@ odp_execute_actions__(void *dp, struct ofpbuf *packet, struct pkt_metadata *md, /* These only make sense in the context of a datapath. */ case OVS_ACTION_ATTR_OUTPUT: case OVS_ACTION_ATTR_USERSPACE: + case OVS_ACTION_ATTR_BACK_TO_KERNEL: if (dp_execute_action) { /* Allow 'dp_execute_action' to steal the packet data if we do * not need it any more. 
*/ diff --git a/lib/odp-util.c b/lib/odp-util.c index 873e05a..be25613 100644 --- a/lib/odp-util.c +++ b/lib/odp-util.c @@ -81,6 +81,7 @@ odp_action_len(uint16_t type) case OVS_ACTION_ATTR_POP_MPLS: return sizeof(ovs_be16); case OVS_ACTION_ATTR_SET: return -2; case OVS_ACTION_ATTR_SAMPLE: return -2; + case OVS_ACTION_ATTR_BACK_TO_KERNEL: return 0; case OVS_ACTION_ATTR_UNSPEC: case __OVS_ACTION_ATTR_MAX: @@ -424,6 +425,9 @@ format_odp_action(struct ds *ds, const struct nlattr *a) case OVS_ACTION_ATTR_SAMPLE: format_odp_sample_action(ds, a); break; + case OVS_ACTION_ATTR_BACK_TO_KERNEL: + ds_put_format(ds, "back_to_kernel"); + break; case OVS_ACTION_ATTR_UNSPEC: case __OVS_ACTION_ATTR_MAX: default: @@ -665,6 +669,14 @@ parse_odp_action(const char *s, const struct simap *port_names, } } + { + int len = strcspn(s, delimiters); + if (strncmp(s, "back_to_kernel", len) == 0) { + nl_msg_put_flag(actions, OVS_ACTION_ATTR_BACK_TO_KERNEL); + return len; + } + } + return -EINVAL; } diff --git a/tests/odp.at b/tests/odp.at index b505345..3c4d9fe 100644 --- a/tests/odp.at +++ b/tests/odp.at @@ -243,6 +243,7 @@ pop_vlan sample(sample=9.7%,actions(1,2,3,push_vlan(vid=1,pcp=2))) set(tunnel(tun_id=0xabcdef1234567890,src=1.1.1.1,dst=2.2.2.2,tos=0x0,ttl=64,flags(df,csum,key))) set(tunnel(tun_id=0xabcdef1234567890,src=1.1.1.1,dst=2.2.2.2,tos=0x0,ttl=64,flags(key))) +back_to_kernel ]) AT_CHECK_UNQUOTED([test-odp parse-actions < actions.txt], [0], [`cat actions.txt` -- 1.7.9.5 _______________________________________________ dev mailing list [email protected] http://openvswitch.org/mailman/listinfo/dev
