Hi Liu,
can you please check your patches with checkpatch.pl before sending?
I would appreciate it a lot.
Thank you.
(khorenko@alma9)/ttt/git/vzkernel:git show > /tmp/diff-1
(khorenko@alma9)/ttt/git/vzkernel:scripts/checkpatch.pl /tmp/diff-1
WARNING: 'timout' may be misspelled - perhaps 'timeout'?
#18:
As a result, these requests will be killed due to calendar timout.
^^^^^^
WARNING: Possible unwrapped commit description (prefer a maximum 75 chars per line)
#19:
However when these responses arrives later in form of RIO_MSG_RDMA_READ_REQ
WARNING: 'recevied' may be misspelled - perhaps 'received'?
#27:
recevied and in order to avoid crashing csd. However it can't address
^^^^^^^^
WARNING: Do not use whitespace before Signed-off-by:
#36:
Signed-off-by: Liu Kui <kui....@acronis.com>
WARNING: Do not use whitespace before Acked-by:
#37:
Acked-by: Alexey Kuznetsov <kuz...@acronis.com>
WARNING: Block comments should align the * on each line
#101: FILE: fs/fuse/kio/pcs/pcs_rdma_io.c:515:
+	/*
+	 * Return RDMA_READ_ACK directly if the original request msg had been killed,
WARNING: line length of 112 exceeds 100 columns
#133: FILE: fs/fuse/kio/pcs/pcs_rdma_io.c:856:
+	 * We must Ack every RDMA_READ_REQ received from our peer in order even it's going to be dropped.
WARNING: Block comments should align the * on each line
#133: FILE: fs/fuse/kio/pcs/pcs_rdma_io.c:856:
+	/*
+	 * We must Ack every RDMA_READ_REQ received from our peer in order even it's going to be dropped.
WARNING: line length of 104 exceeds 100 columns
#134: FILE: fs/fuse/kio/pcs/pcs_rdma_io.c:857:
+	 * Missing ack will result in out of order ACK to our peer, which will cause it to crash.
WARNING: line length of 111 exceeds 100 columns
#135: FILE: fs/fuse/kio/pcs/pcs_rdma_io.c:858:
+	 * So we setup a job to ack this msg however it can only be sent out after all ongoing RDMA READ
ERROR: Missing Signed-off-by: line by nominal patch author ''
total: 1 errors, 10 warnings, 129 lines checked
--
Best regards,
Konstantin Khorenko,
Virtuozzo Linux Kernel Team
On 23.11.2023 08:26, Kui Liu wrote:
In our userspace RDMA implementation, every RIO_MSG_RDMA_READ_REQ msg
must be acked strictly in order. However, this rule can be broken by a
bug in kio, although it is only triggered by very abnormal hardware
behaviour where a WR can take a very long time (>10s) to complete.
This happens in read workloads with large block sizes, where the
client needs to issue an RDMA READ wr to pull the data portion of a
response msg returned by csd. When this operation takes a very long
time to complete for a msg, it blocks the responses to subsequent
requests from being sent out by csd for as long as it takes. As a
result, these requests will be killed due to calendar timeout.
However, when these responses arrive later in the form of
RIO_MSG_RDMA_READ_REQ msgs, they are silently ignored because the
request msg is missing, and no corresponding RIO_MSG_RDMA_READ_ACK is
returned, which breaks the expectation of ordered acks on the csd
side. Since the rio connection is still in working state, a later
valid msg exchange will trigger the BUG_ON check of rb->xid in csd,
causing it to crash.
This patch makes sure the client always acks every
RIO_MSG_RDMA_READ_REQ received, and in order, to avoid crashing csd.
However, it cannot address the performance impact of the strange
hardware behaviour where a WR takes an abnormally long time to
complete.
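
To illustrate the invariant (a simplified userspace sketch of the
two-counter gating this patch adds; struct conn and its helpers are
hypothetical stand-ins for the rio fields, not the actual kernel
code):

#include <stdio.h>
#include <stdbool.h>

/* Hypothetical model of the two counters added to struct pcs_rdmaio. */
struct conn {
	int n_rdma_read_ongoing;	/* RDMA READs still in flight */
	int n_rdma_read_ack_pending;	/* acks for dropped READ_REQs not yet sent */
};

/* A new RDMA READ may start only if no ack for a dropped msg is queued ahead of it. */
static bool can_submit_rdma_read(const struct conn *c)
{
	return c->n_rdma_read_ack_pending == 0;
}

/* An ack for a dropped READ_REQ may go out only once no READ is still in flight. */
static bool can_submit_read_ack(const struct conn *c)
{
	return c->n_rdma_read_ongoing == 0;
}

int main(void)
{
	struct conn c = { .n_rdma_read_ongoing = 1, .n_rdma_read_ack_pending = 1 };

	/* Both are blocked: the ack must wait for the in-flight READ... */
	printf("read %d, ack %d\n", can_submit_rdma_read(&c), can_submit_read_ack(&c));

	/* ...and once that READ completes, the pending ack goes out first. */
	c.n_rdma_read_ongoing = 0;
	printf("ack %d\n", can_submit_read_ack(&c));
	return 0;
}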
https://pmc.acronis.work/browse/VSTOR-76834
https://pmc.acronis.work/browse/VSTOR-70758
https://pmc.acronis.work/browse/VSTOR-60807
https://pmc.acronis.work/browse/VSTOR-57903
Signed-off-by: Liu Kui <kui....@acronis.com>
---
fs/fuse/kio/pcs/pcs_rdma_io.c | 58 +++++++++++++++++++++++++++++++----
fs/fuse/kio/pcs/pcs_rdma_io.h | 3 ++
2 files changed, 55 insertions(+), 6 deletions(-)
diff --git a/fs/fuse/kio/pcs/pcs_rdma_io.c b/fs/fuse/kio/pcs/pcs_rdma_io.c
index 62d138c8b611..c78126ab1d79 100644
--- a/fs/fuse/kio/pcs/pcs_rdma_io.c
+++ b/fs/fuse/kio/pcs/pcs_rdma_io.c
@@ -130,6 +130,8 @@ static void rio_abort(struct pcs_rdmaio *rio, int error);
static void rio_rx_done(struct rio_cqe *cqe, bool sync_mode);
static void rio_tx_done(struct rio_cqe *cqe, bool sync_mode);
static void rio_tx_err_occured(struct rio_cqe *cqe, bool sync_mode);
+static int rio_submit(struct pcs_rdmaio *rio, struct pcs_msg *msg, int type, u64 xid, int status,
+		      bool allow_again);
/* Only called when rio->write_queue is not empty */
static struct pcs_msg *rio_dequeue_msg(struct pcs_rdmaio *rio)
@@ -424,6 +426,10 @@ static int rio_submit_rdma_read(struct pcs_rdmaio *rio, struct pcs_msg *msg,
	struct pcs_rdma_device *dev = rio->dev;
	struct rio_tx *tx;

+	/* Block until all pending RDMA_READ_ACKs are sent out, to keep acks in order */
+	if (rio->n_rdma_read_ack_pending)
+		return -EAGAIN;
+
	tx = RE_NULL(rio_get_tx(dev));
	if (!tx) {
		if (allow_again)
@@ -467,6 +473,8 @@ static int rio_submit_rdma_read(struct pcs_rdmaio *rio, struct pcs_msg *msg,
		}
	}

+	rio->n_rdma_read_ongoing++;
+
	return 0;
fail:
@@ -478,6 +486,21 @@ static int rio_submit_rdma_read(struct pcs_rdmaio *rio, struct pcs_msg *msg,
	return -EIO;
}
+static int rio_submit_rdma_read_ack(struct pcs_rdmaio *rio, u64 xid)
+{
+	int ret;
+
+	/* Can only be sent after all ongoing RDMA_READ_REQs complete, to keep acks in order */
+	if (rio->n_rdma_read_ongoing)
+		return -EAGAIN;
+
+	ret = rio_submit(rio, NULL, SUBMIT_RDMA_READ_ACK, xid, 0, true);
+	if (!ret)
+		rio->n_rdma_read_ack_pending--;
+
+	return ret;
+}
+
static int rio_rdma_read_job_work(struct rio_job *j)
{
	struct rio_rdma_read_job *job = container_of(j, struct rio_rdma_read_job, job);
@@ -488,8 +511,15 @@ static int rio_rdma_read_job_work(struct rio_job *j)
		return 0;
	}
-	return rio_submit_rdma_read(rio, job->msg, job->offset,
-				    &job->rb, true);
+	/*
+	 * Return RDMA_READ_ACK directly if the original request msg had been
+	 * killed; however, we must wait until all previous RDMA_READ_REQs
+	 * have been acked.
+	 */
+	if (job->msg == PCS_TRASH_MSG)
+		return rio_submit_rdma_read_ack(rio, job->rb.xid);
+	else
+		return rio_submit_rdma_read(rio, job->msg, job->offset,
+					    &job->rb, true);
}
static void rio_rdma_read_job_destroy(struct rio_job *j)
@@ -766,6 +796,7 @@ static void rio_handle_tx(struct pcs_rdmaio *rio, struct rio_tx *tx, int ok)
	case TX_SUBMIT_RDMA_READ_ACK:
		rio_put_tx(rio->dev, tx);
		rio_submit(rio, NULL, SUBMIT_RDMA_READ_ACK, xid, !ok, false);
+		rio->n_rdma_read_ongoing--;
		break;
	case TX_WAIT_FOR_TX_COMPL:
	case TX_WAIT_FOR_READ_ACK:
@@ -798,6 +829,7 @@ static int rio_handle_rx_immediate(struct pcs_rdmaio *rio, char *buf, int len,
	u32 msg_size;
	int offset = rio->hdr_size;
	struct iov_iter it;
+	struct rio_rdma_read_job *job;
	if (rio->throttled) {
		*throttle = 1;
@@ -820,6 +852,19 @@ static int rio_handle_rx_immediate(struct pcs_rdmaio *rio, char *buf, int len,
			return err;
	} else if (msg == PCS_TRASH_MSG) {
		TRACE("rio drop trash msg: %u, rio: 0x%p\n", msg_size, rio);
+		/*
+		 * We must ack every RDMA_READ_REQ received from our peer, in
+		 * order, even if it's going to be dropped. A missing ack will
+		 * result in an out-of-order ACK to our peer, which will cause
+		 * it to crash. So we set up a job to ack this msg; however,
+		 * it can only be sent out after all ongoing RDMA READs have
+		 * completed, and it will block future RDMA READs from being
+		 * issued.
+		 */
+		if (rb) {
+			job = rio_rdma_read_job_alloc(rio, msg, 0, rb);
+			if (!job)
+				return PCS_ERR_NOMEM;
+			rio_post_tx_job(rio, &job->job);
+			rio->n_rdma_read_ack_pending++;
+		}
		return 0;
	}
@@ -852,12 +897,10 @@ static int rio_handle_rx_immediate(struct pcs_rdmaio *rio, char *buf, int len,
	if (len == msg->size) {
		msg->done(msg);
	} else if (rio_submit_rdma_read(rio, msg, offset, rb, true) == -EAGAIN) {
-		struct rio_rdma_read_job *job;
		job = rio_rdma_read_job_alloc(rio, msg, offset, rb);
		if (!job)
-			rio_submit_rdma_read(rio, msg, offset, rb, false);
-		else
-			rio_post_tx_job(rio, &job->job);
+			return PCS_ERR_NOMEM;
+		rio_post_tx_job(rio, &job->job);
	}

	return 0;
@@ -1228,6 +1271,9 @@ struct pcs_rdmaio* pcs_rdma_create(int hdr_size, struct rdma_cm_id *cmid,
	rio->n_os_credits = 0;
	rio->n_th_credits = queue_depth / 2;

+	rio->n_rdma_read_ongoing = 0;
+	rio->n_rdma_read_ack_pending = 0;
+
	rio->cmid = cmid;

	INIT_LIST_HEAD(&rio->write_queue);
diff --git a/fs/fuse/kio/pcs/pcs_rdma_io.h b/fs/fuse/kio/pcs/pcs_rdma_io.h
index 18962208e4a2..c5109cbc5fe1 100644
--- a/fs/fuse/kio/pcs/pcs_rdma_io.h
+++ b/fs/fuse/kio/pcs/pcs_rdma_io.h
@@ -90,6 +90,9 @@ struct pcs_rdmaio
	int n_th_credits;		/* threshold: when to return outstanding
					 * credits urgently */

+	int n_rdma_read_ongoing;	/* number of ongoing RDMA_READs */
+	int n_rdma_read_ack_pending;	/* number of RDMA_READ_ACKs to be submitted */
+
	struct pcs_rdma_device *dev;
	struct rdma_cm_id *cmid;
	struct ib_cq *cq;
--
2.32.0 (Apple Git-132)
_______________________________________________
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel