Hi Ben,

I like this approach, but I have a few questions.

On Thu, 2025-07-10 at 14:10 -0400, Benjamin Marzinski wrote:
> The workaround for releasing a reservation held by a failed path
> (clearing all persistent reservation data and then reregistering all
> the
> keys) has several problems. It requires devices that support the READ
> FULL STATUS command and are capable of specifying TransportIDs with
> REGISTRATION commands (SIP_C), neither of which are mandatory to
> support
> SCSI Persistent Reservations.

While I have made similar experiences, I wonder where the SCSI specs
state which service actions and which parameters are mandatory for
Persistent Reservation commands.

I've tried to figure it out from SPC-6, but I couldn't.

>  Non SIP_C devices will be left with no
> registered keys. Also, not all cleared keys are registered, just the
> ones going to the Target Port receiving the REGISTRATION command. To
> reregister all the keys, the code would have to also use the "All
> Target
> Ports" flag to register keys on different Target Ports, but this
> could
> end up registering keys on paths they aren't supposed to be on (for
> instance if one of the registered keys was only there because the
> path
> was down and it couldn't be released).
> 
> The redesign avoids these issues by only using mandatory Persistent
> Reservation commands, without extra optional parameters or flags, and
> only effects the keys of the multipath device it is being issued on.
> The new workaround is:
> 1. Suspend the multipath device to prevent I/O
> 2. Preempt the key to move the reservation to an available path. This
>    also removes the registered keys from every path except the path
>    issuing the PREEMPT command. Since the device is suspended, not
> I/O
>    can go to these unregisted paths and fail.
> 3. Release the reservation on the path that now holds it.

Why that? Can't the reservation just be kept?

> 4. Resume the device (since it no longer matters that most of the
> paths
>    no longer have a registered key)
> 5. Reregister the keys on all the paths.

Why not re-register before resuming?

> 
> If steps 3 or 4 fail, the code will attempt to reregister the keys,
> and
> then attempt (or possibly re-attempt) the resume.
> 
> Signed-off-by: Benjamin Marzinski <bmarz...@redhat.com>
> ---
>  libmpathpersist/mpath_persist_int.c | 173 ++++++++++++--------------
> --
>  libmultipath/libmultipath.version   |   1 +
>  2 files changed, 76 insertions(+), 98 deletions(-)
> 
> diff --git a/libmpathpersist/mpath_persist_int.c
> b/libmpathpersist/mpath_persist_int.c
> index 13829742..7fb08b2e 100644
> --- a/libmpathpersist/mpath_persist_int.c
> +++ b/libmpathpersist/mpath_persist_int.c
> @@ -366,7 +366,8 @@ static int mpath_prout_reg(struct multipath
> *mpp,int rq_servact, int rq_scope,
>  
>  static int mpath_prout_common(struct multipath *mpp,int rq_servact,
> int rq_scope,
>                             unsigned int rq_type,
> -                           struct prout_param_descriptor* paramp,
> int noisy)
> +                           struct prout_param_descriptor* paramp,
> int noisy,
> +                           struct path **pptr)
>  {
>       int i,j, ret;
>       struct pathgroup *pgp = NULL;
> @@ -385,6 +386,8 @@ static int mpath_prout_common(struct multipath
> *mpp,int rq_servact, int rq_scope
>                       found = true;
>                       ret = prout_do_scsi_ioctl(pp->dev,
> rq_servact, rq_scope,
>                                                 rq_type, paramp,
> noisy);
> +                     if (ret == MPATH_PR_SUCCESS && pptr)
> +                             *pptr = pp;
>                       if (ret != MPATH_PR_RETRYABLE_ERROR)
>                               return ret;
>               }
> @@ -405,14 +408,12 @@ static int mpath_prout_rel(struct multipath
> *mpp,int rq_servact, int rq_scope,
>       struct path *pp = NULL;
>       int active_pathcount = 0;
>       pthread_attr_t attr;
> -     int rc, found = 0;
> +     int rc;
>       int count = 0;
>       int status = MPATH_PR_SUCCESS;
> -     struct prin_resp resp;
> -     struct prout_param_descriptor *pamp = NULL;
> -     struct prin_resp *pr_buff;
> -     int length;
> -     struct transportid *pptr = NULL;
> +     struct prin_resp resp = {0};
> +     uint16_t udev_flags = (mpp->skip_kpartx) ?
> MPATH_UDEV_NO_KPARTX_FLAG : 0;
> +     bool did_resume = false;
>  
>       if (!mpp)
>               return MPATH_PR_DMMP_ERROR;
> @@ -502,104 +503,78 @@ static int mpath_prout_rel(struct multipath
> *mpp,int rq_servact, int rq_scope,
>       }
>  
>       condlog (2, "%s: Path holding reservation is not
> available.", mpp->wwid);
> -
> -     pr_buff =  mpath_alloc_prin_response(MPATH_PRIN_RFSTAT_SA);
> -     if (!pr_buff){
> -             condlog (0, "%s: failed to  alloc pr in response
> buffer.", mpp->wwid);
> +     /*
> +      * Cannot free the reservation because the path that is
> holding it
> +      * is not usable. Workaround this by:
> +      * 1. Suspending the device
> +      * 2. Preempting the reservation to move it to a usable path
> +      *    (this removes the registered keys on all paths except
> the
> +      *    preempting one. Since the device is suspended, no IO
> can
> +      *    go to these unregistered paths and fail).
> +      * 3. Releasing the reservation on the path that now holds
> it.
> +      * 4. Resuming the device (since it no longer matters that
> most of
> +      *    that paths no longer have a registered key)
> +      * 5. Reregistering keys on all the paths
> +      */
> +
> +     if (!dm_simplecmd_noflush(DM_DEVICE_SUSPEND, mpp->alias, 0))
> {
> +             condlog(0, "%s: release: failed to suspend dm
> device.",

Why do you use dm_simplecmd_noflush() here? Shouldn't queued IO be
flushed from the dm device to avoid it being sent to paths that are
going to be unregistered?


> mpp->wwid);
>               return MPATH_PR_OTHER;
>       }
>  
> -     status = mpath_prin_activepath (mpp, MPATH_PRIN_RFSTAT_SA,
> pr_buff, noisy);
> -
> -     if (status != MPATH_PR_SUCCESS){
> -             condlog (0,  "%s: pr in read full status command
> failed.",  mpp->wwid);
> -             goto out;
> -     }
> -
> -     num = pr_buff-
> >prin_descriptor.prin_readfd.number_of_descriptor;
> -     if (0 == num){
> -             goto out;
> -     }
> -     length = sizeof (struct prout_param_descriptor) + (sizeof
> (struct transportid *));
> -
> -     pamp = (struct prout_param_descriptor *)malloc (length);
> -     if (!pamp){
> -             condlog (0, "%s: failed to alloc pr out parameter.",
> mpp->wwid);
> -             goto out;
> -     }
> -
> -     memset(pamp, 0, length);
> -
> -     pamp->trnptid_list[0] = (struct transportid *) malloc
> (sizeof (struct transportid));
> -     if (!pamp->trnptid_list[0]){
> -             condlog (0, "%s: failed to alloc pr out
> transportid.", mpp->wwid);
> -             goto out1;
> -     }
> -     pptr = pamp->trnptid_list[0];
> -
> -     if (get_be64(mpp->reservation_key)){
> -             memcpy (pamp->key, &mpp->reservation_key, 8);
> -             condlog (3, "%s: reservation key set.", mpp->wwid);
> +     memset(paramp, 0, sizeof(*paramp));
> +     memcpy(paramp->key, &mpp->reservation_key, 8);
> +     memcpy(paramp->sa_key, &mpp->reservation_key, 8);
> +     status = mpath_prout_common(mpp, MPATH_PROUT_PREE_SA,
> rq_scope, rq_type,
> +                                 paramp, noisy, &pp);
> +     if (status != MPATH_PR_SUCCESS) {
> +             condlog(0, "%s: release: pr preempt command
> failed.", mpp->wwid);
> +             goto fail_resume;
>       }
>  
> -     status = mpath_prout_common (mpp, MPATH_PROUT_CLEAR_SA,
> -                                  rq_scope, rq_type, pamp,
> noisy);
> -
> -     if (status) {
> -             condlog(0, "%s: failed to send CLEAR_SA", mpp-
> >wwid);
> -             goto out2;
> +     memset(paramp, 0, sizeof(*paramp));
> +     memcpy(paramp->key, &mpp->reservation_key, 8);
> +     status = prout_do_scsi_ioctl(pp->dev, MPATH_PROUT_REL_SA,
> rq_scope,
> +                                  rq_type, paramp, noisy);
> +     if (status != MPATH_PR_SUCCESS) {
> +             condlog(0, "%s: release on alternate path failed.",
> mpp->wwid);
> +             goto out_reregister;
>       }
>  
> -     pamp->num_transportid = 1;
> -
> -     for (i = 0; i < num; i++){
> -             if (get_be64(mpp->reservation_key) &&
> -                     memcmp(pr_buff-
> >prin_descriptor.prin_readfd.descriptors[i]->key,
> -                            &mpp->reservation_key, 8)){
> -                     /*register with transport id*/
> -                     memset(pamp, 0, length);
> -                     pamp->trnptid_list[0] = pptr;
> -                     memset (pamp->trnptid_list[0], 0, sizeof
> (struct transportid));
> -                     memcpy (pamp->sa_key,
> -                                     pr_buff-
> >prin_descriptor.prin_readfd.descriptors[i]->key, 8);
> -                     pamp->sa_flags = MPATH_F_SPEC_I_PT_MASK;
> -                     pamp->num_transportid = 1;
> -
> -                     memcpy (pamp->trnptid_list[0],
> -                                     &pr_buff-
> >prin_descriptor.prin_readfd.descriptors[i]->trnptid,
> -                                     sizeof (struct
> transportid));
> -                     status = mpath_prout_common (mpp,
> MPATH_PROUT_REG_SA, 0, rq_type,
> -                                     pamp, noisy);
> -
> -                     pamp->sa_flags = 0;
> -                     memcpy (pamp->key, pr_buff-
> >prin_descriptor.prin_readfd.descriptors[i]->key, 8);
> -                     memset (pamp->sa_key, 0, 8);
> -                     pamp->num_transportid = 0;
> -                     status = mpath_prout_common (mpp,
> MPATH_PROUT_REG_SA, 0, rq_type,
> -                                     pamp, noisy);
> -             }
> -             else
> -             {
> -                     if (get_be64(mpp->reservation_key))
> -                             found = 1;
> -             }
> -
> -
> -     }
> -
> -     if (found){
> -             memset (pamp, 0, length);
> -             memcpy (pamp->sa_key, &mpp->reservation_key, 8);
> -             memset (pamp->key, 0, 8);
> -             status = mpath_prout_reg(mpp, MPATH_PROUT_REG_SA,
> rq_scope, rq_type, pamp, noisy);
> +     if (!dm_simplecmd_noflush(DM_DEVICE_RESUME, mpp->alias,
> udev_flags)) {
> +             condlog(0, "%s release: failed to resume dm
> device.", mpp->wwid);
> +             /*
> +              * leave status set to MPATH_PR_SUCCESS, we will
> have another
> +              * chance to resume the device.
> +              */
> +             goto out_reregister;
> +     }
> +     did_resume = true;
> +
> +out_reregister:
> +     memset(paramp, 0, sizeof(*paramp));
> +     memcpy(paramp->sa_key, &mpp->reservation_key, 8);
> +     rc = mpath_prout_reg(mpp, MPATH_PROUT_REG_IGN_SA, rq_scope,
> rq_type,
> +                          paramp, noisy);
> +     if (rc != MPATH_PR_SUCCESS)
> +             condlog(0, "%s: release: failed to reregister
> paths.", mpp->wwid);
> +
> +     /*
> +      * If we failed releasing the reservation or resuming
> earlier
> +      * try resuming now. Otherwise, return with the
> reregistering status
> +      * This means we will report failure, even though the
> resevation
> +      * has been released, since the keys were not reregistered.
> +      */
> +     if (did_resume)
> +             return rc;
> +     else if (status == MPATH_PR_SUCCESS)
> +             status = rc;
> +fail_resume:
> +     if (!dm_simplecmd_noflush(DM_DEVICE_RESUME, mpp->alias,
> udev_flags)) {
> +             condlog(0, "%s: release: failed to resume dm
> device.", mpp->wwid);
> +             if (status == MPATH_PR_SUCCESS)
> +                     status = MPATH_PR_OTHER;
>       }
> -
> -out2:
> -     free(pptr);
> -out1:
> -     free (pamp);
> -out:
> -     free (pr_buff);
>       return (status == MPATH_PR_RETRYABLE_ERROR) ? MPATH_PR_OTHER
> : status;
>  }
>  
> @@ -620,6 +595,7 @@ int do_mpath_persistent_reserve_out(vector curmp,
> vector pathvec, int fd,
>       mpp->mpe = find_mpe(conf->mptable, mpp->wwid);
>       select_reservation_key(conf, mpp);
>       select_all_tg_pt(conf, mpp);
> +     select_skip_kpartx(conf, mpp);

I understand that this is because of the DM_DEVICE_RESUME, but it
deserves a comment here as it seems a bit out of place in this code
that deals only with persistent reservations. Actually, we may want to
move this logic to dm_simplecmd().

Thanks,
Martin

>       put_multipath_config(conf);
>  
>       memcpy(&prkey, paramp->sa_key, 8);
> @@ -661,7 +637,8 @@ int do_mpath_persistent_reserve_out(vector curmp,
> vector pathvec, int fd,
>       case MPATH_PROUT_PREE_SA :
>       case MPATH_PROUT_PREE_AB_SA :
>       case MPATH_PROUT_CLEAR_SA:
> -             ret = mpath_prout_common(mpp, rq_servact, rq_scope,
> rq_type, paramp, noisy);
> +             ret = mpath_prout_common(mpp, rq_servact, rq_scope,
> rq_type,
> +                                      paramp, noisy, NULL);
>               break;
>       case MPATH_PROUT_REL_SA:
>               ret = mpath_prout_rel(mpp, rq_servact, rq_scope,
> rq_type, paramp, noisy);
> diff --git a/libmultipath/libmultipath.version
> b/libmultipath/libmultipath.version
> index a6718355..2f47b3ad 100644
> --- a/libmultipath/libmultipath.version
> +++ b/libmultipath/libmultipath.version
> @@ -249,4 +249,5 @@ local:
>  LIBMULTIPATH_29.1.0 {
>  global:
>       can_recheck_wwid;
> +     select_skip_kpartx;
>  } LIBMULTIPATH_29.0.0;

Reply via email to