Hi Keith,

Thanks for the patch, few comments below mostly nitpicks.

On 5/10/19 2:35 PM, Keith Busch wrote:
> The nvme pci driver prepares its devices for power loss during suspend
> by shutting down the controllers, and the power setting is deferred to
> pci driver's power management before the platform removes power. The
> suspend-to-idle mode, however, does not remove power.
> 
> NVMe devices that implement host managed power settings can achieve
> lower power and better transition latencies than using generic PCI
> power settings. Try to use this feature if the platform is not involved
> with the suspend. If successful, restore the previous power state on
> resume.
> 
> Cc: Mario Limonciello <mario.limoncie...@dell.com>
> Cc: Kai Heng Feng <kai.heng.f...@canonical.com>
> Signed-off-by: Keith Busch <keith.bu...@intel.com>
> ---
> Disclaimer: I've tested only on emulation faking support for the feature.
> 
> General question: different devices potentially have divergent values
> for power consumption and transition latencies. Would it be useful to
> allow a user tunable setting to select the desired target power state
> instead of assuming the lowest one?
> 
>   drivers/nvme/host/core.c | 27 ++++++++++++++++++++++++
>   drivers/nvme/host/nvme.h |  2 ++
>   drivers/nvme/host/pci.c  | 53 
> ++++++++++++++++++++++++++++++++++++++++++++++++
>   3 files changed, 82 insertions(+)
> 
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index a6644a2c3ef7..eb3640fd8838 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -1132,6 +1132,33 @@ static int nvme_set_features(struct nvme_ctrl *dev, 
> unsigned fid, unsigned dword
>       return ret;
>   }
>   
> +int nvme_set_power(struct nvme_ctrl *ctrl, unsigned ps)
dev->ctrl.npss is u8 can we use same data type here ?
If this is due to last_ps we use as a result and then call set_power may 
be we can change the type of last_ps ?
OR
can we please use unsigned int to avoid possible warnings ?
> +{
> +     return nvme_set_features(ctrl, NVME_FEAT_POWER_MGMT, ps, NULL, 0, NULL);
> +}
> +EXPORT_SYMBOL_GPL(nvme_set_power);
> +
> +int nvme_get_power(struct nvme_ctrl *ctrl, u32 *result)
> +{
> +     struct nvme_command c;
May be use struct nvme_command c {} so we can get rid of the memset() call.
> +     union nvme_result res;
> +     int ret;
> +
> +     if (!result)
> +             return -EINVAL;
> +
> +     memset(&c, 0, sizeof(c));
> +     c.features.opcode = nvme_admin_get_features;
> +     c.features.fid = cpu_to_le32(NVME_FEAT_POWER_MGMT);
> +
> +     ret = __nvme_submit_sync_cmd(ctrl->admin_q, &c, &res,
> +                     NULL, 0, 0, NVME_QID_ANY, 0, 0, false);
> +     if (ret >= 0)
> +             *result = le32_to_cpu(res.u32);
May be add a check for result here in above if before deref pointer :-
        if (ret >= 0 && result)

> +     return ret;
> +}
> +EXPORT_SYMBOL_GPL(nvme_get_power);
> +
>   int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count)
>   {
>       u32 q_count = (*count - 1) | ((*count - 1) << 16);
> diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
> index 5ee75b5ff83f..eaa571ac06d2 100644
> --- a/drivers/nvme/host/nvme.h
> +++ b/drivers/nvme/host/nvme.h
> @@ -459,6 +459,8 @@ int __nvme_submit_sync_cmd(struct request_queue *q, 
> struct nvme_command *cmd,
>               unsigned timeout, int qid, int at_head,
>               blk_mq_req_flags_t flags, bool poll);
>   int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count);
> +int nvme_set_power(struct nvme_ctrl *ctrl, unsigned ps);
> +int nvme_get_power(struct nvme_ctrl *ctrl, u32 *result);
>   void nvme_stop_keep_alive(struct nvme_ctrl *ctrl);
>   int nvme_reset_ctrl(struct nvme_ctrl *ctrl);
>   int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl);
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index 3e4fb891a95a..0d5d91e5b293 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -18,6 +18,7 @@
>   #include <linux/mutex.h>
>   #include <linux/once.h>
>   #include <linux/pci.h>
> +#include <linux/suspend.h>
>   #include <linux/t10-pi.h>
>   #include <linux/types.h>
>   #include <linux/io-64-nonatomic-lo-hi.h>
> @@ -116,6 +117,7 @@ struct nvme_dev {
>       u32 cmbsz;
>       u32 cmbloc;
>       struct nvme_ctrl ctrl;
> +     u32 last_ps;
>   
>       mempool_t *iod_mempool;
>   
> @@ -2828,11 +2830,59 @@ static void nvme_remove(struct pci_dev *pdev)
>   }
>   
>   #ifdef CONFIG_PM_SLEEP
> +static int nvme_deep_state(struct nvme_dev *dev)
> +{
> +     struct pci_dev *pdev = to_pci_dev(dev->dev);
> +     int ret;
> +
> +     /*
> +      * Save the current power state in case a user tool set a power policy
> +      * for this device. We'll restore that state on resume.
> +      */
> +     dev->last_ps = 0;
Should we only overwrite dev->last_ps value after nvme_get_power() is 
successful and use temp variable to fetch the current power value in the 
following call ?
> +     ret = nvme_get_power(&dev->ctrl, &dev->last_ps);
> +
> +     /*
> +      * Return the error to halt suspend if the driver either couldn't
> +      * submit a command or didn't see a response.
> +      */
> +     if (ret < 0)
> +             return ret;
> +
> +     ret = nvme_set_power(&dev->ctrl, dev->ctrl.npss);
> +     if (ret < 0)
> +             return ret;

> +
> +     if (!ret) {
> +             /*
> +              * A saved state prevents pci pm from generically controlling
> +              * the device's power. We're using protocol specific settings
> +              * so we don't want pci interfering.
> +              */
> +             pci_save_state(pdev);
> +     } else {
> +             /*
> +              * The drive failed the low power request. Fallback to device
> +              * shutdown and clear npss to force a controller reset on
> +              * resume. The value will be rediscovered during reset.
> +              */
> +             dev->ctrl.npss = 0;
> +             nvme_dev_disable(dev, true);
> +     }
> +     return 0;
> +}
> +
>   static int nvme_suspend(struct device *dev)
>   {
>       struct pci_dev *pdev = to_pci_dev(dev);
>       struct nvme_dev *ndev = pci_get_drvdata(pdev);
>   
> +     /*
> +      * Try to use nvme if the device supports host managed power settings
> +      * and platform firmware is not involved.
> +      */
> +     if (!pm_suspend_via_firmware() && ndev->ctrl.npss)
> +             return nvme_deep_state(ndev);
>       nvme_dev_disable(ndev, true);
>       return 0;
>   }
> @@ -2842,6 +2892,9 @@ static int nvme_resume(struct device *dev)
>       struct pci_dev *pdev = to_pci_dev(dev);
>       struct nvme_dev *ndev = pci_get_drvdata(pdev);
>   
> +     if (!pm_resume_via_firmware() && ndev->ctrl.npss)
> +             if (nvme_set_power(&ndev->ctrl, ndev->last_ps) == 0)
> +                     return 0;
>       nvme_reset_ctrl(&ndev->ctrl);
>       return 0;
>   }
> 

Reply via email to