This is excellent. Thanks, Ben.

> -----Original Message-----
> From: dev [mailto:dev-boun...@openvswitch.org] On Behalf Of Ben Pfaff
> Sent: Monday, July 28, 2014 11:35 PM
> To: dev@openvswitch.org
> Cc: Ben Pfaff
> Subject: [ovs-dev] [PATCH] netlink-socket: Add conceptual documentation.
> 
> Based on a conversation with the VMware Hyper-V team earlier today.
> 
> This commit also changes a couple of functions that were only used with
> netlink-socket.c into static functions.  I couldn't think of a reason for code
> outside that file to use them.
> 
> Signed-off-by: Ben Pfaff <b...@nicira.com>
> ---
>  lib/netlink-socket.c | 124 +++++++++++++++++++++----------------------
>  lib/netlink-socket.h | 145 +++++++++++++++++++++++++++++++++++++++++++++++----
>  2 files changed, 197 insertions(+), 72 deletions(-)
> 
> diff --git a/lib/netlink-socket.c b/lib/netlink-socket.c
> index 09d3a61..0ff85d6 100644
> --- a/lib/netlink-socket.c
> +++ b/lib/netlink-socket.c
> @@ -537,26 +537,7 @@ nl_sock_transact_multiple__(struct nl_sock *sock,
>      return error;
>  }
> 
> -/* Sends the 'request' member of the 'n' transactions in 'transactions' on
> - * 'sock', in order, and receives responses to all of them.  Fills in the
> - * 'error' member of each transaction with 0 if it was successful, otherwise
> - * with a positive errno value.  If 'reply' is nonnull, then it will be 
> filled
> - * with the reply if the message receives a detailed reply.  In other cases,
> - * i.e. where the request failed or had no reply beyond an indication of
> - * success, 'reply' will be cleared if it is nonnull.
> - *
> - * The caller is responsible for destroying each request and reply, and the
> - * transactions array itself.
> - *
> - * Before sending each message, this function will finalize nlmsg_len in each
> - * 'request' to match the ofpbuf's size,  set nlmsg_pid to 'sock''s pid, and
> - * initialize nlmsg_seq.
> - *
> - * Bare Netlink is an unreliable transport protocol.  This function layers
> - * reliable delivery and reply semantics on top of bare Netlink.  See
> - * nl_sock_transact() for some caveats.
> - */
> -void
> +static void
>  nl_sock_transact_multiple(struct nl_sock *sock,
>                            struct nl_transaction **transactions, size_t n)
>  {
> @@ -611,47 +592,7 @@ nl_sock_transact_multiple(struct nl_sock *sock,
>      }
>  }
> 
> -/* Sends 'request' to the kernel via 'sock' and waits for a response.  If
> - * successful, returns 0.  On failure, returns a positive errno value.
> - *
> - * If 'replyp' is nonnull, then on success '*replyp' is set to the kernel's
> - * reply, which the caller is responsible for freeing with ofpbuf_delete(), 
> and
> - * on failure '*replyp' is set to NULL.  If 'replyp' is null, then the 
> kernel's
> - * reply, if any, is discarded.
> - *
> - * Before the message is sent, nlmsg_len in 'request' will be finalized to
> - * match ofpbuf_size(msg), nlmsg_pid will be set to 'sock''s pid, and
> nlmsg_seq will
> - * be initialized, NLM_F_ACK will be set in nlmsg_flags.
> - *
> - * The caller is responsible for destroying 'request'.
> - *
> - * Bare Netlink is an unreliable transport protocol.  This function layers
> - * reliable delivery and reply semantics on top of bare Netlink.
> - *
> - * In Netlink, sending a request to the kernel is reliable enough, because 
> the
> - * kernel will tell us if the message cannot be queued (and we will in that
> - * case put it on the transmit queue and wait until it can be delivered).
> - *
> - * Receiving the reply is the real problem: if the socket buffer is full when
> - * the kernel tries to send the reply, the reply will be dropped.  However,
> the
> - * kernel sets a flag that a reply has been dropped.  The next call to recv
> - * then returns ENOBUFS.  We can then re-send the request.
> - *
> - * Caveats:
> - *
> - *      1. Netlink depends on sequence numbers to match up requests and
> - *         replies.  The sender of a request supplies a sequence number, and
> - *         the reply echos back that sequence number.
> - *
> - *         This is fine, but (1) some kernel netlink implementations are
> - *         broken, in that they fail to echo sequence numbers and (2) this
> - *         function will drop packets with non-matching sequence numbers, so
> - *         that only a single request can be usefully transacted at a time.
> - *
> - *      2. Resending the request causes it to be re-executed, so the request
> - *         needs to be idempotent.
> - */
> -int
> +static int
>  nl_sock_transact(struct nl_sock *sock, const struct ofpbuf *request,
>                   struct ofpbuf **replyp)
>  {
> @@ -1124,6 +1065,47 @@ nl_pool_release(struct nl_sock *sock)
>      }
>  }
> 
> +/* Sends 'request' to the kernel on a Netlink socket for the given 'protocol'
> + * (e.g. NETLINK_ROUTE or NETLINK_GENERIC) and waits for a response.
> +If
> + * successful, returns 0.  On failure, returns a positive errno value.
> + *
> + * If 'replyp' is nonnull, then on success '*replyp' is set to the
> +kernel's
> + * reply, which the caller is responsible for freeing with
> +ofpbuf_delete(), and
> + * on failure '*replyp' is set to NULL.  If 'replyp' is null, then the
> +kernel's
> + * reply, if any, is discarded.
> + *
> + * Before the message is sent, nlmsg_len in 'request' will be finalized
> +to
> + * match ofpbuf_size(msg), nlmsg_pid will be set to the pid of the
> +socket used
> + * for sending the request, and nlmsg_seq will be initialized.
> + *
> + * The caller is responsible for destroying 'request'.
> + *
> + * Bare Netlink is an unreliable transport protocol.  This function
> +layers
> + * reliable delivery and reply semantics on top of bare Netlink.
> + *
> + * In Netlink, sending a request to the kernel is reliable enough,
> +because the
> + * kernel will tell us if the message cannot be queued (and we will in
> +that
> + * case put it on the transmit queue and wait until it can be delivered).
> + *
> + * Receiving the reply is the real problem: if the socket buffer is
> +full when
> + * the kernel tries to send the reply, the reply will be dropped.
> +However, the
> + * kernel sets a flag that a reply has been dropped.  The next call to
> +recv
> + * then returns ENOBUFS.  We can then re-send the request.
> + *
> + * Caveats:
> + *
> + *      1. Netlink depends on sequence numbers to match up requests and
> + *         replies.  The sender of a request supplies a sequence number, and
> + *         the reply echos back that sequence number.
> + *
> + *         This is fine, but (1) some kernel netlink implementations are
> + *         broken, in that they fail to echo sequence numbers and (2) this
> + *         function will drop packets with non-matching sequence numbers, so
> + *         that only a single request can be usefully transacted at a time.
> + *
> + *      2. Resending the request causes it to be re-executed, so the request
> + *         needs to be idempotent.
> + */
>  int
>  nl_transact(int protocol, const struct ofpbuf *request,
>              struct ofpbuf **replyp)
> @@ -1143,6 +1125,26 @@ nl_transact(int protocol, const struct ofpbuf
> *request,
>      return error;
>  }
> 
> +/* Sends the 'request' member of the 'n' transactions in 'transactions'
> +on a
> + * Netlink socket for the given 'protocol' (e.g. NETLINK_ROUTE or
> + * NETLINK_GENERIC), in order, and receives responses to all of them.
> +Fills in
> + * the 'error' member of each transaction with 0 if it was successful,
> + * otherwise with a positive errno value.  If 'reply' is nonnull, then
> +it will
> + * be filled with the reply if the message receives a detailed reply.
> +In other
> + * cases, i.e. where the request failed or had no reply beyond an
> +indication of
> + * success, 'reply' will be cleared if it is nonnull.
> + *
> + * The caller is responsible for destroying each request and reply, and
> +the
> + * transactions array itself.
> + *
> + * Before sending each message, this function will finalize nlmsg_len
> +in each
> + * 'request' to match the ofpbuf's size, set nlmsg_pid to the pid of
> +the socket
> + * used for the transaction, and initialize nlmsg_seq.
> + *
> + * Bare Netlink is an unreliable transport protocol.  This function
> +layers
> + * reliable delivery and reply semantics on top of bare Netlink.  See
> + * nl_transact() for some caveats.
> + */
>  void
>  nl_transact_multiple(int protocol,
>                       struct nl_transaction **transactions, size_t n)
> diff --git a/lib/netlink-socket.h b/lib/netlink-socket.h
> index d53db4e..1450862 100644
> --- a/lib/netlink-socket.h
> +++ b/lib/netlink-socket.h
> @@ -19,17 +19,145 @@
> 
>  /* Netlink socket definitions.
>   *
> + * This header file defines functions for working with Netlink sockets.
> + Only
> + * Linux natively supports Netlink sockets, but Netlink is well suited
> + as a
> + * basis for extensible low-level protocols, so it can make sense to
> + implement
> + * a Netlink layer on other systems.  This doesn't have to be done in
> + exactly
> + * the same way as on Linux, as long as the implementation can support
> + the
> + * semantics that are important to Open vSwitch.  See "Usage concepts"
> + below
> + * for more information.
> + *
> + * For Netlink protocol definitions, see netlink-protocol.h.  For
> + helper
> + * functions for working with Netlink messages, see netlink.h.
> + *
> + *
> + * Usage concepts
> + * ==============
> + *
>   * Netlink is a datagram-based network protocol primarily for communication
> - * between user processes and the kernel, and mainly on Linux.  Netlink is
> - * specified in RFC 3549, "Linux Netlink as an IP Services Protocol".
> + * between user processes and the kernel.  Netlink is specified in RFC
> + 3549,
> + * "Linux Netlink as an IP Services Protocol".
>   *
>   * Netlink is not suitable for use in physical networks of heterogeneous
>   * machines because host byte order is used throughout.
>   *
> - * This header file defines functions for working with Netlink sockets, which
> - * are Linux-specific.  For Netlink protocol definitions, see
> - * netlink-protocol.h.  For helper functions for working with Netlink
> messages,
> - * see netlink.h.
> + * The AF_NETLINK socket namespace is subdivided into statically
> + numbered
> + * protocols, e.g. NETLINK_ROUTE, NETLINK_NETFILTER, provided as the
> + third
> + * argument to the socket() function.  Maintaining the assigned numbers
> + became
> + * a bit of a problem, so the "Generic Netlink" NETLINK_GENERIC
> + protocol was
> + * introduced to map between human-readable names and dynamically
> + assigned
> + * numbers.  All recently introduced Netlink protocol messages in Linux
> + * (including all of the Open vSwitch specific messages) fall under
> + * NETLINK_GENERIC.  The Netlink library provides the
> + nl_lookup_genl_family()
> + * function for translating a Generic Netlink name to a number.  On
> + Linux, this
> + * queries the kernel Generic Netlink implementation, but on other
> + systems it
> + * might be easier to statically assign each of the names used by Open
> + vSwitch
> + * and then implement this function entirely in userspace.
> + *
> + * Each Netlink socket is distinguished by its Netlink PID, a 32-bit
> + integer
> + * that is analogous to a TCP or UDP port number.  The kernel has PID 0.
> + *
> + * Most Netlink messages manage a kernel table of some kind, e.g. the
> + kernel
> + * routing table, ARP table, etc.  Open vSwitch specific messages
> + manage tables
> + * of datapaths, ports within datapaths ("vports"), and flows within
> + * datapaths.  Open vSwitch also has messages related to network
> + packets
> + * received on vports, which aren't really a table.
> + *
> + * Datagram protocols over a physical network are typically unreliable:
> + in UDP,
> + * for example, messages can be dropped, delivered more than once, or
> + delivered
> + * out of order.  In Linux, Netlink does not deliver messages out of
> + order or
> + * multiple times.  In some cases it can drop messages, but the kernel
> + * indicates when a message has been dropped.  The description below of
> + each
> + * way Open vSwitch uses Netlink also explains how to work around
> + dropped
> + * messages.
> + *
> + * Open vSwitch uses Netlink in four characteristic ways:
> + *
> + *    1. Transactions.  A transaction is analogous to a system call, an 
> ioctl,
> + *       or an RPC: userspace sends a request to the kernel, which processes
> + *       the request synchronously and returns a reply to userspace.
> + *       (Sometimes there is no explicit reply, but even in that case 
> userspace
> + *       will receive an immediate reply if there is an error.)
> + *
> + *       nl_transact() is the primary interface for transactions over 
> Netlink.
> + *       This function doesn't take a socket as a parameter because sockets 
> do
> + *       not have any state related to transactions.
> + *
> + *       Netlink uses 16-bit "length" fields extensively, which effectively
> + *       limits requests and replies to 64 kB.  "Dumps" (see below) are one 
> way
> + *       to work around this limit for replies.
> + *
> + *       In the Linux implementation of Netlink transactions, replies can
> + *       sometimes be lost.  When this happens, nl_transact() automatically
> + *       executes the transaction again.  This means that it is important 
> that
> + *       transactions be idempotent, or that the client be prepared to 
> tolerate
> + *       that a transaction might actually execute more than once.
> + *
> + *       The Linux implementation can execute several transactions at the
> same
> + *       time more efficiently than individually.  nl_transact_multiple()
> + *       allows for this.  The semantics are no different from executing each
> + *       of the transactions individually with nl_transact().
> + *
> + *    2. Dumps.  A dump asks the kernel to provide all of the information in 
> a
> + *       table.  It consists of a request and a reply, where the reply 
> consists
> + *       of an arbitrary number of messages.  Each message in the reply is
> + *       limited to 64 kB, as is the request, but the total size of the reply
> + *       can be many times larger.
> + *
> + *       The reply to a dump is usually generated piece by piece, not
> + *       atomically.  The reply can represent an inconsistent snapshot of the
> + *       table.  This is especially likely if entries in the table were being
> + *       added or deleted or changing during the dump.
> + *
> + *       nl_dump_start() begins a dump based on the caller-provided request
> and
> + *       initializes a "struct nl_dump" to identify the dump.  Subsequent 
> calls
> + *       to nl_dump_next() then obtain the reply, one message at a time.
> + *       Usually, each message gives information about some entry in a table,
> + *       e.g. one flow in the Open vSwitch flow table, or one route in a
> + *       routing table.  nl_dump_done() ends the dump.
> + *
> + *       Linux implements dumps so that messages in a reply do not get lost.
> + *
> + *    3. Multicast subscriptions.  Most kernel Netlink implementations allow 
> a
> + *       process to monitor changes to its table, by subscribing to a Netlink
> + *       multicast group dedicated to that table.  Whenever the table's 
> content
> + *       changes (e.g. an entry is added or deleted or modified), the Netlink
> + *       implementation sends a message to all sockets that subscribe to its
> + *       multicast group notifying them of details of the change.  (This 
> doesn't
> + *       require much extra work by the Netlink implementer because the
> message
> + *       is generally identical to the one sent as a reply to the request 
> that
> + *       changed the table.)
> + *
> + *       nl_sock_join_mcgroup() subscribes a socket to a multicast group, and
> + *       nl_sock_recv() reads notifications.
> + *
> + *       If userspace doesn't read messages from a socket subscribed to a
> + *       multicast group quickly enough, then notification messages can pile 
> up
> + *       in the socket's receive buffer.  If this continues long enough, the
> + *       receive buffer will fill up and notifications will be lost.  In that
> + *       case, nl_sock_recv() will return ENOBUFS.  The client can then use a
> + *       dump to resynchronize with the table state.  (A simple
> implementation
> + *       of multicast groups might take advantage of this by simply returning
> + *       ENOBUFS whenever a table changes, without implementing actual
> + *       notifications.  This would cause lots of extra dumps, so it may not 
> be
> + *       suitable as a production implementation.)
> + *
> + *    4. Unicast subscriptions (Open vSwitch specific).  Userspace can assign
> + *       one or more Netlink PIDs to a vport as "upcall PIDs".  When a packet
> + *       received on the vport does not match any flow in its datapath's flow
> + *       table, the kernel hashes some of the packet's headers, uses the hash
> + *       to select one of the PIDs, and sends the packet (encapsulated in an
> + *       Open vSwitch Netlink message) to the socket with the selected PID.
> + *
> + *       The main reason to support multiple PIDs per vport is to increase
> + *       fairness, that is, to make it harder for a single high-flow-rate
> + *       sender to drown out lower rate sources.  Multiple PIDs per vport
> might
> + *       also improve packet handling latency or flow setup rate, but that is
> + *       not the main goal.
> + *
> + *       Old versions of the Linux kernel module supported only one PID per
> + *       vport, and userspace still copes with this, so a simple or early
> + *       implementation might only support one PID per vport too.
>   *
>   *
>   * Thread-safety
> @@ -72,8 +200,6 @@ int nl_sock_send(struct nl_sock *, const struct ofpbuf *, bool wait);
>  int nl_sock_send_seq(struct nl_sock *, const struct ofpbuf *,
>                       uint32_t nlmsg_seq, bool wait);
>  int nl_sock_recv(struct nl_sock *, struct ofpbuf *, bool wait);
> -int nl_sock_transact(struct nl_sock *, const struct ofpbuf *request,
> -                     struct ofpbuf **replyp);
> 
>  int nl_sock_drain(struct nl_sock *);
> 
> @@ -98,9 +224,6 @@ struct nl_transaction {
>      int error;                  /* Positive errno value, 0 if no error. */
>  };
> 
> -void nl_sock_transact_multiple(struct nl_sock *,
> -                               struct nl_transaction **, size_t n);
> -
>  /* Transactions without an allocated socket. */
>  int nl_transact(int protocol, const struct ofpbuf *request,
>                  struct ofpbuf **replyp);
> --
> 1.9.1
> 
> _______________________________________________
> dev mailing list
> dev@openvswitch.org
> https://urldefense.proofpoint.com/v1/url?u=http://openvswitch.org/mailm
> an/listinfo/dev&k=oIvRg1%2BdGAgOoM1BIlLLqw%3D%3D%0A&r=pEkjsHfytv
> HEWufeZPpgqSOJMdMjuZPbesVsNhCUc0E%3D%0A&m=CdaSTTqubKMzJstlo
> 5pTuS80qOEIQkH3bgBl%2BtWwo1o%3D%0A&s=373eb6864d4f4e78519a32cb
> b3013932f1fac24cef0aacbf04f48f746ed65f49
_______________________________________________
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev

Reply via email to