The the logical routers check only the "arp.op == 2" for ARP replies
and then use ARP replies to populate the logical router's ARP table.
If we continue to send ARP replies, which have different "arp.spa" and
"arp.sha", to logical router, the MAC_Binding table will continue to
increase. That may reduce system performance and cause instability
and crashes. This patches may fix this bug. IPv6 defines an NUD mechanism
that can help determine quickly whether neighbors have disconnected or
gone down. The OVN uses the same mechanism for both IPv4 and IPv6.
According to Section 7.3.2 in RFC 2461, A Neighbor Cache entry should
be in one of five states: INCOMPLETE, REACHABLE, STALE, DELAY, PROBE.
The OVN supports only INCOMPLETE, REACHABLE, PROBE in this patch.

You can find more information in RFC 2461 section 7.3.2.

* When logical router receive an ARP reply packet, we should check the
packet's "arp.tpa" and "arp.spa".

* The logical routers use a cache to store the ARP requests and set their
state INCOMPLETE. Only when logical routers get a corresponding ARP reply,
will the "ovn-controller" update the cache and set their state NUD_REACHABLE.

* More than ReachableTime milliseconds have elapsed since the last positive
confirmation was received that the forward path was functioning properly.
If no reachability confirmation is received within NEIGHBOUR_REACHABLE_TIME
seconds, send a Neighbor Solicitation and change the state to PROBE.

* According to Section 4.6 in RFC 5944, we should update the MAC address of 
existing
entries in the case of gratuitous ARP.

Signed-off-by: nickcooper-zhangtonghao <nickcooper-zhangtong...@opencloud.tech>
---
 ovn/controller/ovn-controller.c |   2 +-
 ovn/controller/pinctrl.c        | 271 ++++++++++++++++++++++++++--------------
 ovn/controller/pinctrl.h        |   2 +-
 ovn/lib/automake.mk             |   1 +
 ovn/northd/ovn-northd.c         |  35 ++++--
 5 files changed, 205 insertions(+), 106 deletions(-)

diff --git a/ovn/controller/ovn-controller.c b/ovn/controller/ovn-controller.c
index 66a364f..b59550f 100644
--- a/ovn/controller/ovn-controller.c
+++ b/ovn/controller/ovn-controller.c
@@ -477,7 +477,7 @@ main(int argc, char *argv[])
 
         if (br_int) {
             ofctrl_wait();
-            pinctrl_wait(&ctx);
+            pinctrl_wait();
         }
         ovsdb_idl_loop_commit_and_wait(&ovnsb_idl_loop);
         ovsdb_idl_loop_commit_and_wait(&ovs_idl_loop);
diff --git a/ovn/controller/pinctrl.c b/ovn/controller/pinctrl.c
index 2737467..177d0d4 100644
--- a/ovn/controller/pinctrl.c
+++ b/ovn/controller/pinctrl.c
@@ -38,6 +38,7 @@
 #include "ovn/actions.h"
 #include "ovn/lib/logical-fields.h"
 #include "ovn/lib/ovn-dhcp.h"
+#include "ovn/lib/ovn-neighbour.h"
 #include "ovn/lib/ovn-util.h"
 #include "poll-loop.h"
 #include "rconn.h"
@@ -57,13 +58,17 @@ static unsigned int conn_seq_no;
 static void pinctrl_handle_put_mac_binding(const struct flow *md,
                                            const struct flow *headers,
                                            bool is_arp);
-static void init_put_mac_bindings(void);
-static void destroy_put_mac_bindings(void);
+static void init_neighbour_caches(void);
+static void destroy_neighbour_caches(void);
 static void run_put_mac_bindings(struct controller_ctx *,
                                  const struct lport_index *lports);
-static void wait_put_mac_bindings(struct controller_ctx *);
-static void flush_put_mac_bindings(void);
+static void run_update_state_neighbour_cache(void);
 
+static void flush_neighbour_caches(void);
+static void pinctrl_handle_neighbour_cache(uint32_t dp_key,
+                                           uint32_t port_key,
+                                           const char *ip_s,
+                                           bool is_arp);
 static void init_send_garps(void);
 static void destroy_send_garps(void);
 static void send_garp_wait(void);
@@ -84,7 +89,7 @@ pinctrl_init(void)
 {
     swconn = rconn_create(5, 0, DSCP_DEFAULT, 1 << OFP13_VERSION);
     conn_seq_no = 0;
-    init_put_mac_bindings();
+    init_neighbour_caches();
     init_send_garps();
 }
 
@@ -185,6 +190,17 @@ pinctrl_handle_arp(const struct flow *ip_flow, const 
struct match *md,
     enum ofputil_protocol proto = ofputil_protocol_from_ofp_version(version);
     queue_msg(ofputil_encode_packet_out(&po, proto));
 
+    /* The logical routers use a cache to store the ARP requests and
+     * set their state NUD_INCOMPLETE. Only when logical routers get
+     * a corresponding ARP reply, will the 'ovn-controller' update
+     * the cache and set their state NUD_REACHABLE. */
+    char ip_s[INET_ADDRSTRLEN];
+    uint32_t dp_key = ntohll(md->flow.metadata);
+    uint32_t port_key = md->flow.regs[MFF_LOG_OUTPORT - MFF_REG0];
+
+    inet_ntop(AF_INET, &ip_flow->nw_dst, ip_s, sizeof ip_s);
+    pinctrl_handle_neighbour_cache(dp_key, port_key, ip_s, true);
+
 exit:
     dp_packet_uninit(&packet);
     ofpbuf_uninit(&ofpacts);
@@ -764,7 +780,6 @@ pinctrl_run(struct controller_ctx *ctx, const struct 
lport_index *lports,
         if (conn_seq_no != rconn_get_connection_seqno(swconn)) {
             pinctrl_setup(swconn);
             conn_seq_no = rconn_get_connection_seqno(swconn);
-            flush_put_mac_bindings();
         }
 
         /* Process a limited number of messages per call. */
@@ -783,14 +798,14 @@ pinctrl_run(struct controller_ctx *ctx, const struct 
lport_index *lports,
         }
     }
 
+    run_update_state_neighbour_cache();
     run_put_mac_bindings(ctx, lports);
     send_garp_run(br_int, chassis_id, lports, local_datapaths);
 }
 
 void
-pinctrl_wait(struct controller_ctx *ctx)
+pinctrl_wait(void)
 {
-    wait_put_mac_bindings(ctx);
     rconn_run_wait(swconn);
     rconn_recv_wait(swconn);
     send_garp_wait();
@@ -800,65 +815,120 @@ void
 pinctrl_destroy(void)
 {
     rconn_destroy(swconn);
-    destroy_put_mac_bindings();
+    destroy_neighbour_caches();
     destroy_send_garps();
 }
 
 /* Implementation of the "put_arp" and "put_nd" OVN actions.  These
  * actions send a packet to ovn-controller, using the flow as an API
  * (see actions.h for details).  This code implements the actions by
- * updating the MAC_Binding table in the southbound database.
- *
- * This code could be a lot simpler if the database could always be updated,
- * but in fact we can only update it when ctx->ovnsb_idl_txn is nonnull.  Thus,
- * we buffer up a few put_mac_bindings (but we don't keep them longer
- * than 1 second) and apply them whenever a database transaction is
- * available. */
-
-/* Buffered "put_mac_binding" operation. */
-struct put_mac_binding {
-    struct hmap_node hmap_node; /* In 'put_mac_bindings'. */
+ * updating the MAC_Binding table in the southbound database. */
+static struct hmap neighbour_caches;
 
-    long long int timestamp;    /* In milliseconds. */
-
-    /* Key. */
-    uint32_t dp_key;
-    uint32_t port_key;
-    char ip_s[INET6_ADDRSTRLEN + 1];
-
-    /* Value. */
-    struct eth_addr mac;
-};
+static void
+init_neighbour_caches(void)
+{
+    neighbour_caches_init(&neighbour_caches);
+}
 
-/* Contains "struct put_mac_binding"s. */
-static struct hmap put_mac_bindings;
+static void
+destroy_neighbour_caches(void)
+{
+    flush_neighbour_caches();
+    hmap_destroy(&neighbour_caches);
+}
 
 static void
-init_put_mac_bindings(void)
+pinctrl_handle_neighbour_cache(uint32_t dp_key, uint32_t port_key,
+                            const char *nw_dst, bool is_arp)
 {
-    hmap_init(&put_mac_bindings);
+    struct neighbour_cache *nc = neighbour_cache_find(&neighbour_caches,
+                                        dp_key, port_key, nw_dst);
+    if (!nc) {
+        if (hmap_count(&neighbour_caches) >= NEIGHBOUR_CACHE_MAX) {
+            VLOG_INFO("The neighbour cache table is full.  Solicitation "
+                    "request will be dropped.");
+            return;
+        }
+
+        struct neighbour_cache cache = {
+            .type = is_arp ? OVN_NEIGHBOUR_TYPE_ARP: OVN_NEIGHBOUR_TYPE_ND,
+            .nud_state = NUD_INCOMPLETE,
+            .dp_key = dp_key,
+            .port_key = port_key,
+        };
+
+        ovs_strlcpy(cache.nw_dst, nw_dst, sizeof cache.nw_dst);
+        neighbour_cache_add(&neighbour_caches, &cache);
+        return;
+    }
+    nc->timestamp = time_msec();
 }
 
 static void
-destroy_put_mac_bindings(void)
+send_solicitation_request(const struct neighbour_cache * nc)
 {
-    flush_put_mac_bindings();
-    hmap_destroy(&put_mac_bindings);
+    /* Compose a ARP request packet. */
+    uint64_t packet_stub[128 / 8];
+    struct dp_packet packet;
+    ovs_be32 nw_dst, nw_src;
+    dp_packet_use_stub(&packet, packet_stub, sizeof packet_stub);
+
+    inet_pton(AF_INET, nc->nw_dst, &nw_dst);
+    inet_pton(AF_INET, nc->nw_src, &nw_src);
+
+    compose_arp(&packet, ARP_OP_REQUEST, nc->eth_src, nc->eth_dst,
+             true, nw_src, nw_dst);
+
+    /* Compose actions. */
+    uint64_t ofpacts_stub[4096 / 8];
+    struct ofpbuf ofpacts = OFPBUF_STUB_INITIALIZER(ofpacts_stub);
+    enum ofp_version version = rconn_get_version(swconn);
+    ofpact_put_OUTPUT(&ofpacts)->port = OFPP_NORMAL;
+
+    struct ofputil_packet_out po = {
+        .packet = dp_packet_data(&packet),
+        .packet_len = dp_packet_size(&packet),
+        .buffer_id = UINT32_MAX,
+        .in_port = OFPP_CONTROLLER,
+        .ofpacts = ofpacts.data,
+        .ofpacts_len = ofpacts.size,
+    };
+
+    enum ofputil_protocol proto = ofputil_protocol_from_ofp_version(version);
+    queue_msg(ofputil_encode_packet_out(&po, proto));
+
+    dp_packet_uninit(&packet);
+    ofpbuf_uninit(&ofpacts);
 }
 
-static struct put_mac_binding *
-pinctrl_find_put_mac_binding(uint32_t dp_key, uint32_t port_key,
-                             const char *ip_s, uint32_t hash)
+static void
+run_update_state_neighbour_cache(void)
 {
-    struct put_mac_binding *pa;
-    HMAP_FOR_EACH_WITH_HASH (pa, hmap_node, hash, &put_mac_bindings) {
-        if (pa->dp_key == dp_key
-            && pa->port_key == port_key
-            && !strcmp(pa->ip_s, ip_s)) {
-            return pa;
+    struct neighbour_cache *nc;
+    HMAP_FOR_EACH (nc, hmap_node, &neighbour_caches) {
+        /* If the ARP/NS request has been send NEIGHBOUR_RETRANS_TIME
+         * seconds, and the reply is not received, set the state of the
+         * neighbour cache to NUD_FAILED. The cache whose state is
+         * NUD_FAILED, will be removed soon. */
+        if (nc->nud_state == NUD_INCOMPLETE &&
+                time_msec() > nc->timestamp + NEIGHBOUR_INCOMPLETE_TIME) {
+            nc->nud_state = NUD_FAILED;
+        } else if (nc->nud_state == NUD_REACHABLE &&
+                time_msec() > nc->timestamp + NEIGHBOUR_REACHABLE_TIME) {
+            /* Skip NUD_DELAY state. */
+            nc->nud_state = NUD_PROBE;
+            nc->timestamp = time_msec();
+        } else if (nc->nud_state == NUD_PROBE) {
+            if (time_msec() > nc->timestamp + NEIGHBOUR_RETRANS_TIME) {
+                nc->nud_state = NUD_FAILED;
+            } else {
+                /* A reachability confirmation is actively sought by
+                 * retransmitting Neighbor Solicitations. */
+                send_solicitation_request(nc);
+            }
         }
     }
-    return NULL;
 }
 
 static void
@@ -869,6 +939,12 @@ pinctrl_handle_put_mac_binding(const struct flow *md,
     uint32_t port_key = md->regs[MFF_LOG_INPORT - MFF_REG0];
     char ip_s[INET6_ADDRSTRLEN];
 
+    if (headers->nw_proto == ARP_OP_REQUEST &&
+            headers->nw_src != headers->nw_dst && is_arp) {
+        /* This is ARP request but not a GARP.*/
+        return;
+    }
+
     if (is_arp) {
         ovs_be32 ip = htonl(md->regs[0]);
         inet_ntop(AF_INET, &ip, ip_s, sizeof(ip_s));
@@ -876,59 +952,67 @@ pinctrl_handle_put_mac_binding(const struct flow *md,
         ovs_be128 ip6 = hton128(flow_get_xxreg(md, 0));
         inet_ntop(AF_INET6, &ip6, ip_s, sizeof(ip_s));
     }
-    uint32_t hash = hash_string(ip_s, hash_2words(dp_key, port_key));
-    struct put_mac_binding *pmb
-        = pinctrl_find_put_mac_binding(dp_key, port_key, ip_s, hash);
-    if (!pmb) {
-        if (hmap_count(&put_mac_bindings) >= 1000) {
-            COVERAGE_INC(pinctrl_drop_put_mac_binding);
-            return;
-        }
 
-        pmb = xmalloc(sizeof *pmb);
-        hmap_insert(&put_mac_bindings, &pmb->hmap_node, hash);
-        pmb->dp_key = dp_key;
-        pmb->port_key = port_key;
-        ovs_strlcpy(pmb->ip_s, ip_s, sizeof pmb->ip_s);
+    /* Check to see whether the corresponding ARP request exists in the cache.
+     * If we don't find it, return directly. */
+    struct neighbour_cache *nc = neighbour_cache_find(&neighbour_caches,
+                                        dp_key, port_key, ip_s);
+    if (!nc) {
+        return;
+    }
+
+    /* When your host receives a solicitation reply in answer to a solicitation
+     * request it previously sent out, it means that the neighbour received the
+     * request and was able to send back a reply; this in turn means that 
either
+     * it already had your L2 address or it learned your address from your 
request.
+     * It also means that there is a working path in both directions. */
+    nc->nud_state = NUD_REACHABLE;
+    nc->timestamp = time_msec();
+    nc->eth_dst = headers->dl_src;
+    if (headers->nw_proto != ARP_OP_REQUEST) {
+        nc->eth_src = headers->dl_dst;
+        if (is_arp) {
+            inet_ntop(AF_INET, &headers->nw_dst, ip_s, sizeof ip_s);
+            ovs_strlcpy(nc->nw_src, ip_s, sizeof nc->nw_src);
+        } else {
+            /* ND codes. */
+        }
     }
-    pmb->timestamp = time_msec();
-    pmb->mac = headers->dl_src;
 }
 
 static void
 run_put_mac_binding(struct controller_ctx *ctx,
                     const struct lport_index *lports,
-                    const struct put_mac_binding *pmb)
+                    const struct neighbour_cache *nc)
 {
-    if (time_msec() > pmb->timestamp + 1000) {
-        return;
-    }
-
     /* Convert logical datapath and logical port key into lport. */
     const struct sbrec_port_binding *pb
-        = lport_lookup_by_key(lports, pmb->dp_key, pmb->port_key);
+        = lport_lookup_by_key(lports, nc->dp_key, nc->port_key);
     if (!pb) {
         static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
 
         VLOG_WARN_RL(&rl, "unknown logical port with datapath %"PRIu32" "
-                     "and port %"PRIu32, pmb->dp_key, pmb->port_key);
+                     "and port %"PRIu32, nc->dp_key, nc->port_key);
         return;
     }
 
     /* Convert ethernet argument to string form for database. */
     char mac_string[ETH_ADDR_STRLEN + 1];
     snprintf(mac_string, sizeof mac_string,
-             ETH_ADDR_FMT, ETH_ADDR_ARGS(pmb->mac));
+             ETH_ADDR_FMT, ETH_ADDR_ARGS(nc->eth_dst));
 
     /* Check for an update an existing IP-MAC binding for this logical
      * port.
      *
      * XXX This is not very efficient. */
-    const struct sbrec_mac_binding *b;
-    SBREC_MAC_BINDING_FOR_EACH (b, ctx->ovnsb_idl) {
+    const struct sbrec_mac_binding *b, *n;
+    SBREC_MAC_BINDING_FOR_EACH_SAFE (b, n, ctx->ovnsb_idl) {
         if (!strcmp(b->logical_port, pb->logical_port)
-            && !strcmp(b->ip, pmb->ip_s)) {
-            if (strcmp(b->mac, mac_string)) {
+            && !strcmp(b->ip, nc->nw_dst)) {
+            if (nc->nud_state & NUD_FAILED) {
+                sbrec_mac_binding_delete(b);
+            } else if ((nc->nud_state & NUD_VALID)
+                    && strcmp(b->mac, mac_string)) {
                 sbrec_mac_binding_set_mac(b, mac_string);
             }
             return;
@@ -936,11 +1020,13 @@ run_put_mac_binding(struct controller_ctx *ctx,
     }
 
     /* Add new IP-MAC binding for this logical port. */
-    b = sbrec_mac_binding_insert(ctx->ovnsb_idl_txn);
-    sbrec_mac_binding_set_logical_port(b, pb->logical_port);
-    sbrec_mac_binding_set_ip(b, pmb->ip_s);
-    sbrec_mac_binding_set_mac(b, mac_string);
-    sbrec_mac_binding_set_datapath(b, pb->datapath);
+    if (nc->nud_state & NUD_VALID) {
+        b = sbrec_mac_binding_insert(ctx->ovnsb_idl_txn);
+        sbrec_mac_binding_set_logical_port(b, pb->logical_port);
+        sbrec_mac_binding_set_ip(b, nc->nw_dst);
+        sbrec_mac_binding_set_mac(b, mac_string);
+        sbrec_mac_binding_set_datapath(b, pb->datapath);
+    }
 }
 
 static void
@@ -951,30 +1037,25 @@ run_put_mac_bindings(struct controller_ctx *ctx,
         return;
     }
 
-    const struct put_mac_binding *pmb;
-    HMAP_FOR_EACH (pmb, hmap_node, &put_mac_bindings) {
-        run_put_mac_binding(ctx, lports, pmb);
+    struct  neighbour_cache *nc, *next;
+    HMAP_FOR_EACH_SAFE (nc, next, hmap_node, &neighbour_caches) {
+        run_put_mac_binding(ctx, lports, nc);
+        if (nc->nud_state & NUD_FAILED) {
+            hmap_remove(&neighbour_caches, &nc->hmap_node);
+            free(nc);
+        }
     }
-    flush_put_mac_bindings();
 }
 
 static void
-wait_put_mac_bindings(struct controller_ctx *ctx)
+flush_neighbour_caches(void)
 {
-    if (ctx->ovnsb_idl_txn && !hmap_is_empty(&put_mac_bindings)) {
-        poll_immediate_wake();
+    struct neighbour_cache *nc;
+    HMAP_FOR_EACH_POP (nc, hmap_node, &neighbour_caches) {
+        free(nc);
     }
 }
 
-static void
-flush_put_mac_bindings(void)
-{
-    struct put_mac_binding *pmb;
-    HMAP_FOR_EACH_POP (pmb, hmap_node, &put_mac_bindings) {
-        free(pmb);
-    }
-}
-
 /*
  * Send gratuitous ARP for vif on localnet.
  *
diff --git a/ovn/controller/pinctrl.h b/ovn/controller/pinctrl.h
index 553eace..c8bfe29 100644
--- a/ovn/controller/pinctrl.h
+++ b/ovn/controller/pinctrl.h
@@ -31,7 +31,7 @@ void pinctrl_run(struct controller_ctx *, const struct 
lport_index *,
                  const struct ovsrec_bridge *,
                  const char *chassis_id,
                  struct hmap *local_datapaths);
-void pinctrl_wait(struct controller_ctx *);
+void pinctrl_wait(void);
 void pinctrl_destroy(void);
 
 #endif /* ovn/pinctrl.h */
diff --git a/ovn/lib/automake.mk b/ovn/lib/automake.mk
index 7f26de2..0e6cfcc 100644
--- a/ovn/lib/automake.mk
+++ b/ovn/lib/automake.mk
@@ -8,6 +8,7 @@ ovn_lib_libovn_la_SOURCES = \
        ovn/lib/expr.c \
        ovn/lib/lex.c \
        ovn/lib/ovn-dhcp.h \
+       ovn/lib/ovn-neighbour.h \
        ovn/lib/ovn-util.c \
        ovn/lib/ovn-util.h \
        ovn/lib/logical-fields.c \
diff --git a/ovn/northd/ovn-northd.c b/ovn/northd/ovn-northd.c
index 625937d..d139617 100644
--- a/ovn/northd/ovn-northd.c
+++ b/ovn/northd/ovn-northd.c
@@ -3167,11 +3167,6 @@ build_lrouter_flows(struct hmap *datapaths, struct hmap 
*ports,
                       "ip4.dst == 0.0.0.0/8",
                       "drop;");
 
-        /* ARP reply handling.  Use ARP replies to populate the logical
-         * router's ARP table. */
-        ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_INPUT, 90, "arp.op == 2",
-                      "put_arp(inport, arp.spa, arp.sha);");
-
         /* Drop Ethernet local broadcast.  By definition this traffic should
          * not be forwarded.*/
         ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_INPUT, 50,
@@ -3239,16 +3234,38 @@ build_lrouter_flows(struct hmap *datapaths, struct hmap 
*ports,
                           ds_cstr(&match), ds_cstr(&actions));
         }
 
-        /* ARP reply.  These flows reply to ARP requests for the router's own
-         * IP address. */
         for (int i = 0; i < op->lrp_networks.n_ipv4_addrs; i++) {
+            /* ARP reply handling.  Use ARP replies to populate the logical
+             * router's ARP table. The logical router doesn't support proxy 
ARP */
             ds_clear(&match);
             ds_put_format(&match,
-                          "inport == %s && arp.tpa == %s && arp.op == 1",
-                          op->json_key, op->lrp_networks.ipv4_addrs[i].addr_s);
+                          "inport == %s && arp.op == 2 "
+                          "&& arp.tpa == %s "
+                          "&& arp.spa == %s/%d",
+                          op->json_key, op->lrp_networks.ipv4_addrs[i].addr_s,
+                          op->lrp_networks.ipv4_addrs[i].network_s,
+                          op->lrp_networks.ipv4_addrs[i].plen);
+
+            ovn_lflow_add(lflows, op->od, S_ROUTER_IN_IP_INPUT, 90,
+                        ds_cstr(&match), "put_arp(inport, arp.spa, arp.sha);");
+
+            /* ARP reply and GARP request handling.  These flows reply to
+             * ARP requests for the router's IP address. Use GARP requests
+             * to populate the logical router's ARP table. */
+            ds_clear(&match);
+            ds_put_format(&match, "inport == %s && arp.op == 1 "
+                    "&& arp.spa != %s "
+                    "&& arp.spa == %s/%d "
+                    "&& arp.tpa == %s/%d",
+                    op->json_key, op->lrp_networks.ipv4_addrs[i].addr_s,
+                    op->lrp_networks.ipv4_addrs[i].network_s,
+                    op->lrp_networks.ipv4_addrs[i].plen,
+                    op->lrp_networks.ipv4_addrs[i].network_s,
+                    op->lrp_networks.ipv4_addrs[i].plen);
 
             ds_clear(&actions);
             ds_put_format(&actions,
+                "put_arp(inport, arp.spa, arp.sha); "
                 "eth.dst = eth.src; "
                 "eth.src = %s; "
                 "arp.op = 2; /* ARP reply */ "
-- 
1.8.3.1

_______________________________________________
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev

Reply via email to