Hi Liu,

could you please check your patches with checkpatch.pl before sending?

I would appreciate it a lot.
Thank you.
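
By the way, checkpatch.pl can also be run directly on a commit, without saving the diff to a file first (assuming the patch is the top commit):

(khorenko@alma9)/ttt/git/vzkernel:scripts/checkpatch.pl -g HEAD

Here is what I get on your patch: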



(khorenko@alma9)/ttt/git/vzkernel:git show > /tmp/diff-1

(khorenko@alma9)/ttt/git/vzkernel:scripts/checkpatch.pl /tmp/diff-1
WARNING: 'timout' may be misspelled - perhaps 'timeout'?
#18:
    As a result, these requests will be killed due to calendar timout.
                                                               ^^^^^^

WARNING: Possible unwrapped commit description (prefer a maximum 75 chars per line)
#19:
    However when these responses arrives later in form of RIO_MSG_RDMA_READ_REQ

WARNING: 'recevied' may be misspelled - perhaps 'received'?
#27:
    recevied and in order to avoid crashing csd. However it can't address
    ^^^^^^^^

WARNING: Do not use whitespace before Signed-off-by:
#36:
    Signed-off-by: Liu Kui <kui....@acronis.com>

WARNING: Do not use whitespace before Acked-by:
#37:
    Acked-by: Alexey Kuznetsov <kuz...@acronis.com>

WARNING: Block comments should align the * on each line
#101: FILE: fs/fuse/kio/pcs/pcs_rdma_io.c:515:
+       /*
+       * Return RDMA_READ_ACK directly if the original request msg had been killed,
WARNING: line length of 112 exceeds 100 columns
#133: FILE: fs/fuse/kio/pcs/pcs_rdma_io.c:856:
+ * We must Ack every RDMA_READ_REQ received from our peer in order even it's going to be dropped.

WARNING: Block comments should align the * on each line
#133: FILE: fs/fuse/kio/pcs/pcs_rdma_io.c:856:
+               /*
+ * We must Ack every RDMA_READ_REQ received from our peer in order even it's going to be dropped.

WARNING: line length of 104 exceeds 100 columns
#134: FILE: fs/fuse/kio/pcs/pcs_rdma_io.c:857:
+               * Missing ack will result in out of order ACK to our peer, which will cause it to crash.

WARNING: line length of 111 exceeds 100 columns
#135: FILE: fs/fuse/kio/pcs/pcs_rdma_io.c:858:
+ * So we setup a job to ack this msg however it can only be sent out after all ongoing RDMA READ

ERROR: Missing Signed-off-by: line by nominal patch author ''

total: 1 errors, 10 warnings, 129 lines checked
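
Also, regarding the two "Block comments should align the * on each line" warnings: kernel style expects each continuation '*' to line up under the first '*' of the opening "/*", i.e. one extra space before it, e.g. (reusing the comment text from your patch):

        /*
         * Return RDMA_READ_ACK directly if the original request msg had been killed,
         * however must wait until all previous RDMA_READ_REQs have been acked.
         */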

--
Best regards,

Konstantin Khorenko,
Virtuozzo Linux Kernel Team

On 23.11.2023 08:26, Kui Liu wrote:
In our userspace RDMA implementation, it is required that every
RIO_MSG_RDMA_READ_REQ msg must be acked strictly in order. However
this rule can be broken due to a bug in kio, which, however, is
triggered only by very abnormal hardware behaviour: it can take a
very long time (>10s) for a WR to complete.

This happens in a read workload with large block size, where the
client needs to issue an RDMA READ wr to pull the data portion
of a response msg returned by csd. When this operation takes a very
long time to complete for a msg, it will block the responses to requests
after it from being sent out by csd for as long as it takes.
As a result, these requests will be killed due to calendar timout.
However when these responses arrives later in form of RIO_MSG_RDMA_READ_REQ
msg, they will be ignored silently due to the missing request msg, without
the corresponding RIO_MSG_RDMA_READ_ACK being returned, which breaks the
expectation of ordered acks on the side of csd. Since the rio connection
is still in a working state, a later valid msg exchange will trigger
the BUG_ON check of rb->xid in csd, causing it to crash.

This patch makes sure the client will always ack every RIO_MSG_RDMA_READ_REQ
recevied and in order to avoid crashing csd. However it can't address
any performance impact due to the strange hardware behaviour that it
takes an abnormally long time for a WR to complete.
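
In essence the fix enforces the ordering with two counters. A simplified
sketch of the rule (the real code is in the diff below):

        /* A new RDMA READ must not be issued while an ack for a dropped
         * msg is still pending, and such an ack must not be sent while
         * any RDMA READ is still in flight.
         */
        if (rio->n_rdma_read_ack_pending)       /* in rio_submit_rdma_read() */
                return -EAGAIN;
        ...
        if (rio->n_rdma_read_ongoing)           /* in rio_submit_rdma_read_ack() */
                return -EAGAIN;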

https://pmc.acronis.work/browse/VSTOR-76834
https://pmc.acronis.work/browse/VSTOR-70758
https://pmc.acronis.work/browse/VSTOR-60807
https://pmc.acronis.work/browse/VSTOR-57903

Signed-off-by: Liu Kui <kui....@acronis.com>
Acked-by: Alexey Kuznetsov <kuz...@acronis.com>

---

 fs/fuse/kio/pcs/pcs_rdma_io.c | 58 +++++++++++++++++++++++++++++++----
 fs/fuse/kio/pcs/pcs_rdma_io.h |  3 ++
 2 files changed, 55 insertions(+), 6 deletions(-)

diff --git a/fs/fuse/kio/pcs/pcs_rdma_io.c b/fs/fuse/kio/pcs/pcs_rdma_io.c
index 62d138c8b611..c78126ab1d79 100644
--- a/fs/fuse/kio/pcs/pcs_rdma_io.c
+++ b/fs/fuse/kio/pcs/pcs_rdma_io.c
@@ -130,6 +130,8 @@ static void rio_abort(struct pcs_rdmaio *rio, int error);
 static void rio_rx_done(struct rio_cqe *cqe, bool sync_mode);
 static void rio_tx_done(struct rio_cqe *cqe, bool sync_mode);
 static void rio_tx_err_occured(struct rio_cqe *cqe, bool sync_mode);
+static int rio_submit(struct pcs_rdmaio *rio, struct pcs_msg *msg, int type, u64 xid, int status,
+                      bool allow_again);
 
 /* Only called when rio->write_queue is not empty */
 static struct pcs_msg *rio_dequeue_msg(struct pcs_rdmaio *rio)
@@ -424,6 +426,10 @@ static int rio_submit_rdma_read(struct pcs_rdmaio *rio, struct pcs_msg *msg,
         struct pcs_rdma_device *dev = rio->dev;
         struct rio_tx *tx;
 
+        /* Blocked until after pending RDMA_READ_ACKs are sent out to keep ACK in order */
+        if (rio->n_rdma_read_ack_pending)
+                return -EAGAIN;
+
         tx = RE_NULL(rio_get_tx(dev));
         if (!tx) {
                 if (allow_again)
@@ -467,6 +473,8 @@ static int rio_submit_rdma_read(struct pcs_rdmaio *rio, struct pcs_msg *msg,
                 }
         }
 
+        rio->n_rdma_read_ongoing++;
+
         return 0;
 
 fail:
@@ -478,6 +486,21 @@ static int rio_submit_rdma_read(struct pcs_rdmaio *rio, struct pcs_msg *msg,
         return -EIO;
 }
 
+static int rio_submit_rdma_read_ack(struct pcs_rdmaio *rio, u64 xid)
+{
+        int ret;
+
+        /* Can only be sent after all ongoing RDMA_READ_REQs complete to keep ack in order */
+        if (rio->n_rdma_read_ongoing)
+                return -EAGAIN;
+
+        ret = rio_submit(rio, NULL, SUBMIT_RDMA_READ_ACK, xid, 0, true);
+        if (!ret)
+                rio->n_rdma_read_ack_pending--;
+
+        return ret;
+}
+
 static int rio_rdma_read_job_work(struct rio_job *j)
 {
         struct rio_rdma_read_job *job = container_of(j, struct rio_rdma_read_job, job);
@@ -488,8 +511,15 @@ static int rio_rdma_read_job_work(struct rio_job *j)
                 return 0;
         }
 
-        return rio_submit_rdma_read(rio, job->msg, job->offset,
-                                    &job->rb, true);
+        /*
+        * Return RDMA_READ_ACK directly if the original request msg had been killed,
+        * however must wait until all previous RDMA_READ_REQs have been acked.
+        */
+        if (job->msg == PCS_TRASH_MSG)
+                return rio_submit_rdma_read_ack(rio, job->rb.xid);
+        else
+                return rio_submit_rdma_read(rio, job->msg, job->offset,
+                                            &job->rb, true);
 }
 
 static void rio_rdma_read_job_destroy(struct rio_job *j)
@@ -766,6 +796,7 @@ static void rio_handle_tx(struct pcs_rdmaio *rio, struct rio_tx *tx, int ok)
         case TX_SUBMIT_RDMA_READ_ACK:
                 rio_put_tx(rio->dev, tx);
                 rio_submit(rio, NULL, SUBMIT_RDMA_READ_ACK, xid, !ok, false);
+                rio->n_rdma_read_ongoing--;
                 break;
         case TX_WAIT_FOR_TX_COMPL:
         case TX_WAIT_FOR_READ_ACK:
@@ -798,6 +829,7 @@ static int rio_handle_rx_immediate(struct pcs_rdmaio *rio, char *buf, int len,
         u32 msg_size;
         int offset = rio->hdr_size;
         struct iov_iter it;
+        struct rio_rdma_read_job *job;
 
         if (rio->throttled) {
                 *throttle = 1;
@@ -820,6 +852,19 @@ static int rio_handle_rx_immediate(struct pcs_rdmaio *rio, char *buf, int len,
                 return err;
         } else if (msg == PCS_TRASH_MSG) {
                 TRACE("rio drop trash msg: %u, rio: 0x%p\n", msg_size, rio);
+                /*
+                * We must Ack every RDMA_READ_REQ received from our peer in order even it's going to be dropped.
+                * Missing ack will result in out of order ACK to our peer, which will cause it to crash.
+                * So we setup a job to ack this msg however it can only be sent out after all ongoing RDMA READ
+                * completes and will block future RDMA READ being issued.
+                */
+                if (rb) {
+                        job = rio_rdma_read_job_alloc(rio, msg, 0, rb);
+                        if (!job)
+                                return PCS_ERR_NOMEM;
+                        rio_post_tx_job(rio, &job->job);
+                        rio->n_rdma_read_ack_pending++;
+                }
                 return 0;
         }
 
@@ -852,12 +897,10 @@ static int rio_handle_rx_immediate(struct pcs_rdmaio *rio, char *buf, int len,
         if (len == msg->size) {
                 msg->done(msg);
         } else if (rio_submit_rdma_read(rio, msg, offset, rb, true) == -EAGAIN) {
-                struct rio_rdma_read_job *job;
                 job = rio_rdma_read_job_alloc(rio, msg, offset, rb);
                 if (!job)
-                        rio_submit_rdma_read(rio, msg, offset, rb, false);
-                else
-                        rio_post_tx_job(rio, &job->job);
+                        return PCS_ERR_NOMEM;
+                rio_post_tx_job(rio, &job->job);
         }
 
         return 0;
@@ -1228,6 +1271,9 @@ struct pcs_rdmaio* pcs_rdma_create(int hdr_size, struct rdma_cm_id *cmid,
         rio->n_os_credits = 0;
         rio->n_th_credits = queue_depth / 2;
 
+        rio->n_rdma_read_ongoing = 0;
+        rio->n_rdma_read_ack_pending = 0;
+
         rio->cmid = cmid;
 
         INIT_LIST_HEAD(&rio->write_queue);
diff --git a/fs/fuse/kio/pcs/pcs_rdma_io.h b/fs/fuse/kio/pcs/pcs_rdma_io.h
index 18962208e4a2..c5109cbc5fe1 100644
--- a/fs/fuse/kio/pcs/pcs_rdma_io.h
+++ b/fs/fuse/kio/pcs/pcs_rdma_io.h
@@ -90,6 +90,9 @@ struct pcs_rdmaio
         int n_th_credits;   /* threshold: when to return outstanding
                             * credits urgently */
 
+        int n_rdma_read_ongoing;     /* number of ongoing RDMA_READ. */
+        int n_rdma_read_ack_pending; /* number of RDMA_READ_ACK to be submitted */
+
         struct pcs_rdma_device *dev;
         struct rdma_cm_id *cmid;
         struct ib_cq *cq;
--
2.32.0 (Apple Git-132)

_______________________________________________
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel
