Author: trociny
Date: Tue Dec 10 19:56:26 2013
New Revision: 259191
URL: http://svnweb.freebsd.org/changeset/base/259191

Log:
  For memsync replication, hio_countdown is used not only as an
  indication when a request can be moved to done queue, but also for
  detecting the current state of memsync request.
  
  This approach has problems, e.g. leaking a request if memsynk ack from
  the secondary failed, or racy usage of write_complete, which should be
  called only once per write request, but for memsync can be entered by
  local_send_thread and ggate_send_thread simultaneously.
  
  So the following approach is implemented instead:
  
  1) Use hio_countdown only for counting components we waiting to
     complete, i.e. initially it is always 2 for any replication mode.
  
  2) To distinguish between "memsync ack" and "memsync fin" responses
     from the secondary, add and use hio_memsyncacked field.
  
  3) write_complete() in component threads is called only before
     releasing hio_countdown (i.e. before the hio may be returned to the
     done queue).
  
  4) Add and use hio_writecount refcounter to detect when
     write_complete() can be called in memsync case.
  
  Reported by:  Pete French petefrench ingresso.co.uk
  Tested by:    Pete French petefrench ingresso.co.uk
  MFC after:    2 weeks

Modified:
  head/sbin/hastd/primary.c

Modified: head/sbin/hastd/primary.c
==============================================================================
--- head/sbin/hastd/primary.c   Tue Dec 10 19:48:48 2013        (r259190)
+++ head/sbin/hastd/primary.c   Tue Dec 10 19:56:26 2013        (r259191)
@@ -94,6 +94,15 @@ struct hio {
         */
        bool                     hio_done;
        /*
+        * Number of components we are still waiting before sending write
+        * completion ack to GEOM Gate. Used for memsync.
+        */
+       refcnt_t                 hio_writecount;
+       /*
+        * Memsync request was acknowleged by remote.
+        */
+       bool                     hio_memsyncacked;
+       /*
         * Remember replication from the time the request was initiated,
         * so we won't get confused when replication changes on reload.
         */
@@ -231,6 +240,9 @@ static pthread_mutex_t metadata_lock;
 #define        ISSYNCREQ(hio)          ((hio)->hio_ggio.gctl_unit == -1)
 #define        SYNCREQDONE(hio)        do { (hio)->hio_ggio.gctl_unit = -2; } 
while (0)
 #define        ISSYNCREQDONE(hio)      ((hio)->hio_ggio.gctl_unit == -2)
+#define ISMEMSYNCWRITE(hio)    \
+       (((hio)->hio_replication == HAST_REPLICATION_MEMSYNC &&         \
+       (hio)->hio_ggio.gctl_cmd == BIO_WRITE && !ISSYNCREQ(hio)))
 
 static struct hast_resource *gres;
 
@@ -1344,6 +1356,10 @@ ggate_recv_thread(void *arg)
                        } else {
                                mtx_unlock(&res->hr_amp_lock);
                        }
+                       if (hio->hio_replication == HAST_REPLICATION_MEMSYNC) {
+                               hio->hio_memsyncacked = false;
+                               refcnt_init(&hio->hio_writecount, ncomps);
+                       }
                        break;
                case BIO_DELETE:
                        res->hr_stat_delete++;
@@ -1354,13 +1370,7 @@ ggate_recv_thread(void *arg)
                }
                pjdlog_debug(2,
                    "ggate_recv: (%p) Moving request to the send queues.", hio);
-               if (hio->hio_replication == HAST_REPLICATION_MEMSYNC &&
-                   ggio->gctl_cmd == BIO_WRITE) {
-                       /* Each remote request needs two responses in memsync. 
*/
-                       refcnt_init(&hio->hio_countdown, ncomps + 1);
-               } else {
-                       refcnt_init(&hio->hio_countdown, ncomps);
-               }
+               refcnt_init(&hio->hio_countdown, ncomps);
                for (ii = ncomp; ii < ncomps; ii++)
                        QUEUE_INSERT1(hio, send, ii);
        }
@@ -1470,42 +1480,13 @@ local_send_thread(void *arg)
                        }
                        break;
                }
-
-               if (hio->hio_replication != HAST_REPLICATION_MEMSYNC ||
-                   ggio->gctl_cmd != BIO_WRITE || ISSYNCREQ(hio)) {
-                       if (refcnt_release(&hio->hio_countdown) > 0)
-                               continue;
-               } else {
-                       /*
-                        * Depending on hio_countdown value, requests finished
-                        * in the following order:
-                        * 0: remote memsync, remote final, local write
-                        * 1: remote memsync, local write, (remote final)
-                        * 2: local write, (remote memsync), (remote final)
-                        */
-                       switch (refcnt_release(&hio->hio_countdown)) {
-                       case 0:
-                               /*
-                                * Local write finished as last.
-                                */
-                               break;
-                       case 1:
-                               /*
-                                * Local write finished after remote memsync
-                                * reply arrvied. We can complete the write now.
-                                */
-                               if (hio->hio_errors[0] == 0)
-                                       write_complete(res, hio);
-                               continue;
-                       case 2:
-                               /*
-                                * Local write finished as first.
-                                */
-                               continue;
-                       default:
-                               PJDLOG_ABORT("Invalid hio_countdown.");
+               if (ISMEMSYNCWRITE(hio)) {
+                       if (refcnt_release(&hio->hio_writecount) == 0) {
+                               write_complete(res, hio);
                        }
                }
+               if (refcnt_release(&hio->hio_countdown) > 0)
+                       continue;
                if (ISSYNCREQ(hio)) {
                        mtx_lock(&sync_lock);
                        SYNCREQDONE(hio);
@@ -1627,10 +1608,8 @@ remote_send_thread(void *arg)
                nv_add_uint64(nv, (uint64_t)ggio->gctl_seq, "seq");
                nv_add_uint64(nv, offset, "offset");
                nv_add_uint64(nv, length, "length");
-               if (hio->hio_replication == HAST_REPLICATION_MEMSYNC &&
-                   ggio->gctl_cmd == BIO_WRITE && !ISSYNCREQ(hio)) {
+               if (ISMEMSYNCWRITE(hio))
                        nv_add_uint8(nv, 1, "memsync");
-               }
                if (nv_error(nv) != 0) {
                        hio->hio_errors[ncomp] = nv_error(nv);
                        pjdlog_debug(2,
@@ -1709,8 +1688,12 @@ done_queue:
                        } else {
                                mtx_unlock(&res->hr_amp_lock);
                        }
-                       if (hio->hio_replication == HAST_REPLICATION_MEMSYNC)
-                               (void)refcnt_release(&hio->hio_countdown);
+                       if (ISMEMSYNCWRITE(hio)) {
+                               if (refcnt_release(&hio->hio_writecount) == 0) {
+                                       if (hio->hio_errors[0] == 0)
+                                               write_complete(res, hio);
+                               }
+                       }
                }
                if (refcnt_release(&hio->hio_countdown) > 0)
                        continue;
@@ -1768,6 +1751,7 @@ remote_recv_thread(void *arg)
                            hio_next[ncomp]);
                        hio_recv_list_size[ncomp]--;
                        mtx_unlock(&hio_recv_list_lock[ncomp]);
+                       hio->hio_errors[ncomp] = ENOTCONN;
                        goto done_queue;
                }
                if (hast_proto_recv_hdr(res->hr_remotein, &nv) == -1) {
@@ -1841,82 +1825,34 @@ remote_recv_thread(void *arg)
                hio->hio_errors[ncomp] = 0;
                nv_free(nv);
 done_queue:
-               if (hio->hio_replication != HAST_REPLICATION_MEMSYNC ||
-                   hio->hio_ggio.gctl_cmd != BIO_WRITE || ISSYNCREQ(hio)) {
-                       if (refcnt_release(&hio->hio_countdown) > 0)
-                               continue;
-               } else {
-                       /*
-                        * Depending on hio_countdown value, requests finished
-                        * in the following order:
-                        *
-                        * 0: local write, remote memsync, remote final
-                        * or
-                        * 0: remote memsync, local write, remote final
-                        *
-                        * 1: local write, remote memsync, (remote final)
-                        * or
-                        * 1: remote memsync, remote final, (local write)
-                        *
-                        * 2: remote memsync, (local write), (remote final)
-                        * or
-                        * 2: remote memsync, (remote final), (local write)
-                        */
-                       switch (refcnt_release(&hio->hio_countdown)) {
-                       case 0:
-                               /*
-                                * Remote final reply arrived.
-                                */
-                               PJDLOG_ASSERT(!memsyncack);
-                               break;
-                       case 1:
-                               if (memsyncack) {
-                                       /*
-                                        * Local request already finished, so we
-                                        * can complete the write.
-                                        */
+               if (ISMEMSYNCWRITE(hio)) {
+                       if (!hio->hio_memsyncacked) {
+                               PJDLOG_ASSERT(memsyncack ||
+                                   hio->hio_errors[ncomp] != 0);
+                               /* Remote ack arrived. */
+                               if (refcnt_release(&hio->hio_writecount) == 0) {
                                        if (hio->hio_errors[0] == 0)
                                                write_complete(res, hio);
-                                       /*
-                                        * We still need to wait for final
-                                        * remote reply.
-                                        */
+                               }
+                               hio->hio_memsyncacked = true;
+                               if (hio->hio_errors[ncomp] == 0) {
                                        pjdlog_debug(2,
-                                           "remote_recv: (%p) Moving request 
back to the recv queue.",
-                                           hio);
+                                           "remote_recv: (%p) Moving request "
+                                           "back to the recv queue.", hio);
                                        mtx_lock(&hio_recv_list_lock[ncomp]);
                                        TAILQ_INSERT_TAIL(&hio_recv_list[ncomp],
                                            hio, hio_next[ncomp]);
                                        hio_recv_list_size[ncomp]++;
                                        mtx_unlock(&hio_recv_list_lock[ncomp]);
-                               } else {
-                                       /*
-                                        * Remote final reply arrived before
-                                        * local write finished.
-                                        * Nothing to do in such case.
-                                        */
+                                       continue;
                                }
-                               continue;
-                       case 2:
-                               /*
-                                * We received remote memsync reply even before
-                                * local write finished.
-                                */
-                               PJDLOG_ASSERT(memsyncack);
-
-                               pjdlog_debug(2,
-                                   "remote_recv: (%p) Moving request back to 
the recv queue.",
-                                   hio);
-                               mtx_lock(&hio_recv_list_lock[ncomp]);
-                               TAILQ_INSERT_TAIL(&hio_recv_list[ncomp], hio,
-                                   hio_next[ncomp]);
-                               hio_recv_list_size[ncomp]++;
-                               mtx_unlock(&hio_recv_list_lock[ncomp]);
-                               continue;
-                       default:
-                               PJDLOG_ABORT("Invalid hio_countdown.");
+                       } else {
+                               PJDLOG_ASSERT(!memsyncack);
+                               /* Remote final reply arrived. */
                        }
                }
+               if (refcnt_release(&hio->hio_countdown) > 0)
+                       continue;
                if (ISSYNCREQ(hio)) {
                        mtx_lock(&sync_lock);
                        SYNCREQDONE(hio);
_______________________________________________
svn-src-head@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"

Reply via email to