The commit is pushed to "branch-rh9-5.14.0-284.25.1.vz9.30.x-ovz" and will 
appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh9-5.14.0-284.25.1.vz9.30.8
------>
commit 3202fa19f30e1e189addc784afc249fd9a9f1b59
Author: Alexey Kuznetsov <kuz...@virtuozzo.com>
Date:   Fri Oct 6 18:43:07 2023 +0800

    fuse: a protocol to reenable optimizations after replication finished
    
    When replication starts we have to disable some optimizations,
    including kernel CS offload and fanout submissions. The problem is that
    our protocol does not notify clients and other CSes when replication
    finishes and the optimizations can be re-enabled.
    
    This patch introduces a protocol that allows the disabled
    optimizations to be recovered. The idea is to add a new flag
    (PCS_CS_IO_CLEAR) to write messages. If the write passes through a
    replicating CS, that CS clears the flag. All the nodes, both the
    client and the other CSes in the chain, can then see this flag in
    responses. If it is still set, replication has actually finished.
    
    Admittedly, adding such things to the data path is not a good idea,
    but this is a small addition. The advantage is that we do not have to
    burden the MDS with additional expensive protocol details.
    
    https://pmc.acronis.work/browse/VSTOR-54040
    
    Signed-off-by: Alexey Kuznetsov <kuz...@acronis.com>
    
    Feature: vStorage
---
 fs/fuse/kio/pcs/fuse_ktrace_prot.h |  2 +-
 fs/fuse/kio/pcs/pcs_cs.c           | 14 +++++++-------
 fs/fuse/kio/pcs/pcs_cs_accel.c     |  6 +++++-
 fs/fuse/kio/pcs/pcs_map.c          | 26 ++++++++++++++++++++++----
 fs/fuse/kio/pcs/pcs_map.h          |  6 ++++--
 5 files changed, 39 insertions(+), 15 deletions(-)

diff --git a/fs/fuse/kio/pcs/fuse_ktrace_prot.h 
b/fs/fuse/kio/pcs/fuse_ktrace_prot.h
index dc1ade958980..3c985f92c8c6 100644
--- a/fs/fuse/kio/pcs/fuse_ktrace_prot.h
+++ b/fs/fuse/kio/pcs/fuse_ktrace_prot.h
@@ -36,7 +36,7 @@ struct fuse_tr_iotimes_hdr
        __u16   type;
        __u8    cses;
        __u8    __pad;
-       __u32   __pad1;
+       __u32   chid;
 };
 
 struct fuse_tr_iotimes_cs
diff --git a/fs/fuse/kio/pcs/pcs_cs.c b/fs/fuse/kio/pcs/pcs_cs.c
index 59b752dfdc8c..d298be995320 100644
--- a/fs/fuse/kio/pcs/pcs_cs.c
+++ b/fs/fuse/kio/pcs/pcs_cs.c
@@ -299,11 +299,7 @@ void cs_log_io_times(struct pcs_int_request * ireq, struct 
pcs_msg * resp, unsig
                        struct pcs_int_request * parent = 
ireq->iochunk.parent_N;
 
                        parent->iochunk.fo.io_times[idx].csid = 
resp->rpc->peer_id.val;
-                       /* XXX kio does not implement flow detection (for now) 
and does
-                        * not use flag PCS_CS_IO_SEQ. So, use it here to 
indicate
-                        * performed fanout.
-                        */
-                       parent->iochunk.fo.io_times[idx].misc = h->sync.misc | 
PCS_CS_IO_SEQ;
+                       parent->iochunk.fo.io_times[idx].misc = h->sync.misc;
                        parent->iochunk.fo.io_times[idx].ts_net = 
h->sync.ts_net;
                        parent->iochunk.fo.io_times[idx].ts_io = h->sync.ts_io;
 
@@ -340,6 +336,8 @@ void cs_log_io_times(struct pcs_int_request * ireq, struct 
pcs_msg * resp, unsig
                        th->ino = ireq->dentry->fileinfo.attr.id;
                        th->type = h->hdr.type;
                        th->cses = 1;
+                       th->__pad = 0;
+                       th->chid = (unsigned int)h->uid;
 
                        ch->csid = resp->rpc->peer_id.val;
                        ch->misc = h->sync.misc;
@@ -765,7 +763,9 @@ static void do_cs_submit(struct pcs_cs *cs, struct 
pcs_int_request *ireq)
        if ((ireq->dentry->fileinfo.attr.attrib & PCS_FATTR_IMMEDIATE_WRITE) || 
ireq->dentry->no_write_delay)
                ioh->sync.misc |= PCS_CS_IO_SYNC;
        if (ireq->flags & IREQ_F_FANOUT)
-               ioh->sync.misc = PCS_CS_IO_FANOUT;
+               ioh->sync.misc |= PCS_CS_IO_FANOUT;
+       if (pcs_cs_fanout(storage_version))
+               ioh->sync.misc |= PCS_CS_IO_CLEAR;
 
        msg->size = ioh->hdr.len;
        msg->rpc = NULL;
@@ -817,7 +817,7 @@ static inline int eligible_for_fanout(struct 
pcs_int_request * ireq)
        return (cs_enable_fanout && 
pcs_cs_fanout(atomic_read(&ireq->cc->storage_version)) &&
                ireq->iochunk.csl->nsrv <= PCS_MAP_MAX_FO_CS &&
                ireq->iochunk.cs_index + 1 < ireq->iochunk.csl->nsrv &&
-               !(ireq->iochunk.csl->flags & CS_FL_REPLICATING));
+               !test_bit(CSL_SF_HAS_REPLICATING, 
&ireq->iochunk.csl->state_flags));
 }
 
 void pcs_cs_submit(struct pcs_cs *cs, struct pcs_int_request *ireq)
diff --git a/fs/fuse/kio/pcs/pcs_cs_accel.c b/fs/fuse/kio/pcs/pcs_cs_accel.c
index 3b75571e9b94..5d44862e46aa 100644
--- a/fs/fuse/kio/pcs/pcs_cs_accel.c
+++ b/fs/fuse/kio/pcs/pcs_cs_accel.c
@@ -415,6 +415,8 @@ static void __pcs_csa_final_completion(struct pcs_aio_req 
*areq)
                                th->ino = ireq->dentry->fileinfo.attr.id;
                                th->type = PCS_CS_READ_RESP;
                                th->cses = 1;
+                               th->__pad = 0;
+                               th->chid = (unsigned int)ireq->iochunk.map->id;
 
                                ch->csid = 
ireq->iochunk.csl->cs[ireq->iochunk.cs_index].info.id.val | PCS_NODE_ALT_MASK;
                                ch->misc = ktime_to_us(ireq->ts_sent);
@@ -756,7 +758,6 @@ static void __complete_acr_work(struct work_struct * w)
 
                WARN_ON(!(ireq->flags & IREQ_F_FANOUT));
                parent->iochunk.fo.io_times[idx] = 
ireq->iochunk.acr.io_times[idx];
-               parent->iochunk.fo.io_times[idx].misc |= PCS_CS_IO_SEQ;
        } else {
                struct fuse_conn * fc = container_of(ireq->cc, struct 
pcs_fuse_cluster, cc)->fc;
 
@@ -782,6 +783,8 @@ static void __complete_acr_work(struct work_struct * w)
                                th->ino = ireq->dentry->fileinfo.attr.id;
                                th->type = PCS_CS_WRITE_AL_RESP;
                                th->cses = n;
+                               th->__pad = 0;
+                               th->chid = (unsigned int)ireq->iochunk.map->id;
 
                                for (i = 0; i < n; i++, ch++)
                                        *ch = ireq->iochunk.acr.io_times[i];
@@ -818,6 +821,7 @@ static void __pcs_csa_write_final_completion(struct 
pcs_accel_write_req *areq)
                th->ts_net = 0;
                th->ts_io = ktime_to_us(ktime_get()) - th->misc;
                th->misc &= PCS_CS_TS_MASK;
+               th->misc |= PCS_CS_IO_CLEAR | PCS_CS_IO_FANOUT;
        }
 
        csa_complete_acr(ireq);
diff --git a/fs/fuse/kio/pcs/pcs_map.c b/fs/fuse/kio/pcs/pcs_map.c
index 29ed4fab4290..59833ebc1db7 100644
--- a/fs/fuse/kio/pcs/pcs_map.c
+++ b/fs/fuse/kio/pcs/pcs_map.c
@@ -959,7 +959,7 @@ struct pcs_cs_list* cslist_alloc( struct pcs_cs_set *css, 
struct pcs_cs_info *re
        atomic_set(&cs_list->refcnt, 1);
        atomic_set(&cs_list->seq_read_in_flight, 0);
        cs_list->read_index = -1;
-       cs_list->flags = 0;
+       cs_list->state_flags = 0;
        cs_list->serno = atomic64_inc_return(&css->csl_serno_gen);
        cs_list->blacklist = 0;
        cs_list->read_timeout = (read_tout * HZ) / 1000;
@@ -980,7 +980,7 @@ struct pcs_cs_list* cslist_alloc( struct pcs_cs_set *css, 
struct pcs_cs_info *re
 
                if (cs_list->cs[i].info.flags & CS_FL_REPLICATING) {
                        __set_bit(i, &cs_list->blacklist);
-                       cs_list->flags |= CS_FL_REPLICATING;
+                       set_bit(CSL_SF_HAS_REPLICATING, &cs_list->state_flags);
                        cs_list->blacklist_expires = jiffies + 
PCS_REPLICATION_BLACKLIST_TIMEOUT;
                }
 
@@ -1005,7 +1005,7 @@ struct pcs_cs_list* cslist_alloc( struct pcs_cs_set *css, 
struct pcs_cs_info *re
                cs->mds_flags = cs_list->cs[i].info.flags;
                if (cs->mds_flags & CS_FL_LOCAL) {
                        set_bit(CS_SF_LOCAL, &cs->state);
-                       cs_list->flags |= CSL_FL_HAS_LOCAL;
+                       set_bit(CSL_SF_HAS_LOCAL, &cs_list->state_flags);
                } else if (test_bit(CS_SF_LOCAL, &cs->state))
                        clear_bit(CS_SF_LOCAL, &cs->state);
 
@@ -1961,7 +1961,7 @@ static int pcs_cslist_submit_read(struct pcs_int_request 
*ireq, struct pcs_cs_li
                    (now > selected + PCS_MAP_MIN_REBALANCE_TIMEOUT &&
                     (!is_seq || get_io_locality(cc) < 0 ||
                      (!csl_seq &&
-                      !(test_bit(CS_SF_LOCAL, &cs->state)) && (csl->flags & 
CSL_FL_HAS_LOCAL))))) {
+                      !(test_bit(CS_SF_LOCAL, &cs->state)) && 
test_bit(CSL_SF_HAS_LOCAL, &csl->state_flags))))) {
                        i = -1;
                        WRITE_ONCE(csl->read_index, -1);
                }
@@ -2832,6 +2832,24 @@ static int commit_sync_info(struct pcs_int_request *req,
                        if (max_iolat < srec->sync.ts_io)
                                max_iolat = srec->sync.ts_io;
                }
+
+               /* If we got CLEAR response to a write to csl replication has 
already stopped */
+               if (test_bit(CSL_SF_HAS_REPLICATING, &csl->state_flags) && 
(h->sync.misc & PCS_CS_IO_CLEAR) &&
+                   (h->hdr.type == PCS_CS_WRITE_RESP || h->hdr.type == 
PCS_CS_WRITE_AL_RESP))
+                       clear_bit(CSL_SF_HAS_REPLICATING, &csl->state_flags);
+       } else {
+               /* In case we did successful read on would-be replicating CS 
resync the state */
+               if (test_bit(CSL_SF_HAS_REPLICATING, &csl->state_flags) &&
+                   test_bit(CS_SF_REPLICATING, 
&csl->cs[req->iochunk.cs_index].cslink.cs->state)) {
+                       int idx;
+                       clear_bit(CS_SF_REPLICATING, 
&csl->cs[req->iochunk.cs_index].cslink.cs->state);
+                       for (idx = csl->nsrv - 1; idx >= 0; idx++) {
+                               if (test_bit(CS_SF_REPLICATING, 
&csl->cs[idx].cslink.cs->state))
+                                       break;
+                       }
+                       if (idx < 0)
+                               clear_bit(CSL_SF_HAS_REPLICATING, 
&csl->state_flags);
+               }
        }
        pcs_fuse_stat_io_count(req, resp, max_iolat, net_lat);
        cs_log_io_times(req, resp, max_iolat);
diff --git a/fs/fuse/kio/pcs/pcs_map.h b/fs/fuse/kio/pcs/pcs_map.h
index 8cc7dfefc17e..6d771387d0cc 100644
--- a/fs/fuse/kio/pcs/pcs_map.h
+++ b/fs/fuse/kio/pcs/pcs_map.h
@@ -112,9 +112,11 @@ struct pcs_cs_list
        unsigned long           blacklist;              /* Atomic bit field */
        abs_time_t              blacklist_expires;      /* volatile blacklist 
stamp */
        abs_time_t              select_stamp;           /* volatile read hint 
stamp */
+       /* Semi-immutable. No locking, but can be modified unlocked */
+       unsigned long           state_flags;
+#define CSL_SF_HAS_LOCAL       0
+#define CSL_SF_HAS_REPLICATING 1
        /* members below are immutable accross cslist life time */
-#define CSL_FL_HAS_LOCAL       1
-       unsigned int            flags;
        u64                     serno;
        int                     read_timeout;
        int                     write_timeout;
_______________________________________________
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

Reply via email to