Re: [PATCH v6 1/3] cxlflash: Base error recovery support

Matthew R. Ochs Mon, 17 Aug 2015 17:30:47 -0700

Hi Brian,

Thanks for reviewing.


All good suggestions that I am fine with implementing. As these are fairly 
minor,
would you be okay with these being made in a separate ‘fix' patch series?

-matt

> On Aug 17, 2015, at 9:38 AM, Brian King <brk...@linux.vnet.ibm.com> wrote:
> 
> On 08/13/2015 09:47 PM, Matthew R. Ochs wrote:
>> --- a/drivers/scsi/cxlflash/common.h
>> +++ b/drivers/scsi/cxlflash/common.h
>> @@ -76,6 +76,12 @@ enum cxlflash_init_state {
>>      INIT_STATE_SCSI
>> };
>> 
>> +enum cxlflash_state {
>> +    STATE_NORMAL,   /* Normal running state, everything good */
>> +    STATE_LIMBO,    /* Limbo running state, trying to reset/recover */
> 
> Might be more clear to call this STATE_RESETTING or STATE_RECOVERY
> 
>> +    STATE_FAILTERM  /* Failed/terminating state, error out users/threads */
>> +};
>> +
>> /*
>>  * Each context has its own set of resource handles that is visible
>>  * only from that context.
> 
>> @@ -105,7 +109,8 @@ struct cxlflash_cfg {
>> 
>>      wait_queue_head_t tmf_waitq;
>>      bool tmf_active;
>> -    u8 err_recovery_active:1;
>> +    wait_queue_head_t limbo_waitq;
> 
> How about reset_waitq instead?
> 
>> +    enum cxlflash_state state;
>> };
>> 
>> struct afu_cmd {
> 
>> @@ -455,9 +471,21 @@ static int cxlflash_eh_device_reset_handler(struct 
>> scsi_cmnd *scp)
>>               get_unaligned_be32(&((u32 *)scp->cmnd)[2]),
>>               get_unaligned_be32(&((u32 *)scp->cmnd)[3]));
>> 
>> -    rcr = send_tmf(afu, scp, TMF_LUN_RESET);
>> -    if (unlikely(rcr))
>> +    switch (cfg->state) {
>> +    case STATE_NORMAL:
>> +            rcr = send_tmf(afu, scp, TMF_LUN_RESET);
>> +            if (unlikely(rcr))
>> +                    rc = FAILED;
>> +            break;
>> +    case STATE_LIMBO:
>> +            wait_event(cfg->limbo_waitq, cfg->state != STATE_LIMBO);
>> +            if (cfg->state == STATE_NORMAL)
>> +                    break;
> 
> In this case you've been asked to do a LUN reset but didn't actually reset 
> the LUN. I'd suggest
> restructuring this switch statement to send the LUN reset TMF in the case 
> where you had
> to wait for an AFU reset to complete.
> 
>> +            /* fall through */
>> +    default:
>>              rc = FAILED;
>> +            break;
>> +    }
>> 
>>      pr_debug("%s: returning rc=%d\n", __func__, rc);
>>      return rc;
>> @@ -487,11 +515,29 @@ static int cxlflash_eh_host_reset_handler(struct 
>> scsi_cmnd *scp)
>>               get_unaligned_be32(&((u32 *)scp->cmnd)[2]),
>>               get_unaligned_be32(&((u32 *)scp->cmnd)[3]));
>> 
>> -    rcr = cxlflash_afu_reset(cfg);
>> -    if (rcr == 0)
>> -            rc = SUCCESS;
>> -    else
>> +    switch (cfg->state) {
>> +    case STATE_NORMAL:
>> +            cfg->state = STATE_LIMBO;
>> +            scsi_block_requests(cfg->host);
>> +
>> +            rcr = cxlflash_afu_reset(cfg);
>> +            if (rcr) {
>> +                    rc = FAILED;
>> +                    cfg->state = STATE_FAILTERM;
>> +            } else
>> +                    cfg->state = STATE_NORMAL;
>> +            wake_up_all(&cfg->limbo_waitq);
>> +            scsi_unblock_requests(cfg->host);
> 
> The scsi_block_requests / scsi_unblock_requests is not necessary in this 
> path, since SCSI EH
> will already be preventing any new commands being issued via queuecommand.
> 
>> +            break;
>> +    case STATE_LIMBO:
>> +            wait_event(cfg->limbo_waitq, cfg->state != STATE_LIMBO);
>> +            if (cfg->state == STATE_NORMAL)
>> +                    break;
>> +            /* fall through */
>> +    default:
>>              rc = FAILED;
>> +            break;
>> +    }
>> 
>>      pr_debug("%s: returning rc=%d\n", __func__, rc);
>>      return rc;
>> @@ -642,7 +688,7 @@ static void cxlflash_wait_for_pci_err_recovery(struct 
>> cxlflash_cfg *cfg)
>>      struct pci_dev *pdev = cfg->dev;
>> 
>>      if (pci_channel_offline(pdev))
>> -            wait_event_timeout(cfg->eeh_waitq,
>> +            wait_event_timeout(cfg->limbo_waitq,
>>                                 !pci_channel_offline(pdev),
>>                                 CXLFLASH_PCI_ERROR_RECOVERY_TIMEOUT);
>> }
>> @@ -825,6 +871,8 @@ static void cxlflash_remove(struct pci_dev *pdev)
>>                                                  !cfg->tmf_active);
>>      spin_unlock_irqrestore(&cfg->tmf_waitq.lock, lock_flags);
>> 
>> +    cfg->state = STATE_FAILTERM;
> 
> I don't see any locking around the setting or reading of this flag. What are 
> the
> implications if the processor reorders the store to change this state either 
> here
> or elsewhere. Same goes for the load associated with checking the state.
> 
>> +
>>      switch (cfg->init_state) {
>>      case INIT_STATE_SCSI:
>>              scsi_remove_host(cfg->host);
>> @@ -1879,6 +1927,8 @@ static int init_afu(struct cxlflash_cfg *cfg)
>>      struct afu *afu = cfg->afu;
>>      struct device *dev = &cfg->dev->dev;
>> 
>> +    cxl_perst_reloads_same_image(cfg->cxl_afu, true);
>> +
>>      rc = init_mc(cfg);
>>      if (rc) {
>>              dev_err(dev, "%s: call to init_mc failed, rc=%d!\n",
> 
> Reviewed-by: Brian King <brk...@linux.vnet.ibm.com>
> 
> -- 
> Brian King
> Power Linux I/O
> IBM Linux Technology Center
> 
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-scsi" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v6 1/3] cxlflash: Base error recovery support

Reply via email to