On 15.03.2024 16:39, Jeffrey Hugo wrote:
> On 3/14/2024 5:41 AM, Jacek Lawrynowicz wrote:
>> Hi,
>>
>> On 11.03.2024 17:58, Jeffrey Hugo wrote:
>>> During the boot process of AIC100, the bootloaders (PBL and SBL) log
>>> messages to device RAM. During SBL, if the host opens the QAIC_LOGGING
>>> channel, SBL will offload the contents of the log buffer to the host,
>>> and stream any new messages that SBL logs.
>>>
>>> This log of the boot process can be very useful for an initial triage of
>>> any boot related issues. For example, if SBL rejects one of the runtime
>>> firmware images for a validation failure, SBL will log a reason why.
>>>
>>> Add the ability of the driver to open the logging channel, receive the
>>> messages, and store them. Also define a debugfs entry called "bootlog"
>>> by hooking into the DRM debugfs framework. When the bootlog debugfs
>>> entry is read, the current contents of the log that the host is caching
>>> are displayed to the user. The driver will retain the cache until it
>>> detects that the device has rebooted.  At that point, the cache will be
>>> freed, and the driver will wait for a new log. With this scheme, the
>>> driver will only have a cache of the log from the current device boot.
>>> Note that if the driver initializes a device and it is already in the
>>> runtime state (QSM), no bootlog will be available through this mechanism
>>> because the driver and SBL have not communicated.
>>>
>>> Signed-off-by: Jeffrey Hugo <quic_jh...@quicinc.com>
>>> Reviewed-by: Carl Vanderlip <quic_ca...@quicinc.com>
>>> Reviewed-by: Pranjal Ramajor Asha Kanojiya <quic_pkano...@quicinc.com>
>>> ---
>>>   drivers/accel/qaic/Makefile       |   2 +
>>>   drivers/accel/qaic/qaic.h         |   8 +
>>>   drivers/accel/qaic/qaic_debugfs.c | 271 ++++++++++++++++++++++++++++++
>>>   drivers/accel/qaic/qaic_debugfs.h |  20 +++
>>>   drivers/accel/qaic/qaic_drv.c     |  16 +-
>>>   5 files changed, 316 insertions(+), 1 deletion(-)
>>>   create mode 100644 drivers/accel/qaic/qaic_debugfs.c
>>>   create mode 100644 drivers/accel/qaic/qaic_debugfs.h
>>>
>>> diff --git a/drivers/accel/qaic/Makefile b/drivers/accel/qaic/Makefile
>>> index 3f7f6dfde7f2..2cadcc1baa0e 100644
>>> --- a/drivers/accel/qaic/Makefile
>>> +++ b/drivers/accel/qaic/Makefile
>>> @@ -11,3 +11,5 @@ qaic-y := \
>>>       qaic_data.o \
>>>       qaic_drv.o \
>>>       qaic_timesync.o
>>> +
>>> +qaic-$(CONFIG_DEBUG_FS) += qaic_debugfs.o
>>> diff --git a/drivers/accel/qaic/qaic.h b/drivers/accel/qaic/qaic.h
>>> index 9256653b3036..03d9c9fbffb3 100644
>>> --- a/drivers/accel/qaic/qaic.h
>>> +++ b/drivers/accel/qaic/qaic.h
>>> @@ -153,6 +153,14 @@ struct qaic_device {
>>>       struct mhi_device    *qts_ch;
>>>       /* Work queue for tasks related to MHI "QAIC_TIMESYNC" channel */
>>>       struct workqueue_struct    *qts_wq;
>>> +    /* Head of list of page allocated by MHI bootlog device */
>>> +    struct list_head        bootlog;
>>> +    /* MHI bootlog channel device */
>>> +    struct mhi_device       *bootlog_ch;
>>> +    /* Work queue for tasks related to MHI bootlog device */
>>> +    struct workqueue_struct *bootlog_wq;
>>> +    /* Synchronizes access of pages in MHI bootlog device */
>>> +    struct mutex            bootlog_mutex;
>>>   };
>>>     struct qaic_drm_device {
>>> diff --git a/drivers/accel/qaic/qaic_debugfs.c 
>>> b/drivers/accel/qaic/qaic_debugfs.c
>>> new file mode 100644
>>> index 000000000000..4f87fe29be1a
>>> --- /dev/null
>>> +++ b/drivers/accel/qaic/qaic_debugfs.c
>>> @@ -0,0 +1,271 @@
>>> +// SPDX-License-Identifier: GPL-2.0-only
>>> +
>>> +/* Copyright (c) 2020, The Linux Foundation. All rights reserved. */
>>> +/* Copyright (c) 2021-2024 Qualcomm Innovation Center, Inc. All rights 
>>> reserved. */
>>> +
>>> +#include <linux/debugfs.h>
>>> +#include <linux/device.h>
>>> +#include <linux/fs.h>
>>> +#include <linux/list.h>
>>> +#include <linux/mhi.h>
>>> +#include <linux/mutex.h>
>>> +#include <linux/pci.h>
>>> +#include <linux/seq_file.h>
>>> +#include <linux/string.h>
>>> +#include <linux/types.h>
>>> +#include <linux/workqueue.h>
>>> +
>>> +#include "qaic.h"
>>> +#include "qaic_debugfs.h"
>>> +
>>> +#define BOOTLOG_POOL_SIZE        16
>>> +#define BOOTLOG_MSG_SIZE        512
>>> +
>>> +struct bootlog_msg {
>>> +    /* Buffer for bootlog messages */
>>> +    char str[BOOTLOG_MSG_SIZE];
>>> +    /* Root struct of device, used to access device resources */
>>> +    struct qaic_device *qdev;
>>> +    /* Work struct to schedule work coming on QAIC_LOGGING channel */
>>> +    struct work_struct work;
>>> +};
>>> +
>>> +struct bootlog_page {
>>> +    /* Node in list of bootlog pages maintained by root device struct */
>>> +    struct list_head node;
>>> +    /* Total size of the buffer that holds the bootlogs. It is PAGE_SIZE */
>>> +    unsigned int size;
>>> +    /* Offset for the next bootlog */
>>> +    unsigned int offset;
>>> +};
>>> +
>>> +static int bootlog_show(struct seq_file *s, void *unused)
>>> +{
>>> +    struct bootlog_page *page;
>>> +    struct qaic_device *qdev;
>>> +    void *page_end;
>>> +    void *log;
>>> +
>>> +    qdev = s->private;
>>> +    mutex_lock(&qdev->bootlog_mutex);
>>> +    list_for_each_entry(page, &qdev->bootlog, node) {
>>> +        log = page + 1;
>>> +        page_end = (void *)page + page->offset;
>>> +        while (log < page_end) {
>>> +            seq_printf(s, "%s", (char *)log);
>>> +            log += strlen(log) + 1;
>>> +        }
>>> +    }
>>> +    mutex_unlock(&qdev->bootlog_mutex);
>>> +
>>> +    return 0;
>>> +}
>>> +
>>> +static int bootlog_fops_open(struct inode *inode, struct file *file)
>>> +{
>>> +    return single_open(file, bootlog_show, inode->i_private);
>>> +}
>>> +
>>> +static const struct file_operations bootlog_fops = {
>>> +    .owner = THIS_MODULE,
>>> +    .open = bootlog_fops_open,
>>> +    .read = seq_read,
>>> +    .llseek = seq_lseek,
>>> +    .release = single_release,
>>> +};
>>> +
>>> +void qaic_debugfs_init(struct qaic_drm_device *qddev)
>>> +{
>>> +    struct qaic_device *qdev = qddev->qdev;
>>> +    struct dentry *debugfs_root;
>>> +
>>> +    debugfs_root = to_drm(qddev)->debugfs_root;
>>> +
>>> +    debugfs_create_file("bootlog", 0400, debugfs_root, qdev, 
>>> &bootlog_fops);
>>> +}
>>> +
>>> +static struct bootlog_page *alloc_bootlog_page(struct qaic_device *qdev)
>>> +{
>>> +    struct bootlog_page *page;
>>> +
>>> +    page = (struct bootlog_page *)devm_get_free_pages(&qdev->pdev->dev, 
>>> GFP_KERNEL, 0);
>>> +    if (!page)
>>> +        return page;
>>> +
>>> +    page->size = PAGE_SIZE;
>>> +    page->offset = sizeof(*page);
>>> +    list_add_tail(&page->node, &qdev->bootlog);
>>> +
>>> +    return page;
>>> +}
>>> +
>>> +static int reset_bootlog(struct qaic_device *qdev)
>>> +{
>>> +    struct bootlog_page *page;
>>> +    struct bootlog_page *i;
>>> +
>>> +    list_for_each_entry_safe(page, i, &qdev->bootlog, node) {
>>> +        list_del(&page->node);
>>> +        devm_free_pages(&qdev->pdev->dev, (unsigned long)page);
>>> +    }
>> This is currently dead code. reset is only used to init the bootlog. You may 
>> consider making this init_bootlog() if you are not planning to actually 
>> reset the bootlog.
> 
> No, it's not dead code.
> 
> We boot the device the first time.  qaic_bootlog_mhi_probe() is called, which 
> calls reset_bootlog().  This code does not execute as the list is empty.  For 
> this instance, reset_bootlog() is "init_bootlog".  We collect a bootlog and 
> store it in the list.  The device finishes booting, and 
> qaic_bootlog_mhi_remove() is called.  We do not clear the list at that time.  
> This allows the log to be dumped at a later time.
> 
> Now, let's assume the device crashes.  The device will reboot and go through 
> the boot flow again.  In this example, this will be boot instance 2, but this 
> also applies to boot instance N+1.
> 
> qaic_bootlog_mhi_probe() is called again.  reset_bootlog() will be called, 
> and now this code will execute because the list is non-empty and contains 
> data from the previous boot.  As we are only storing the current bootlog, 
> this loop clears the list and frees the resources. Then we collect the new 
> log for the current boot, and qaic_bootlog_mhi_remove() is triggered again.

OK, makes sense.

>>> +
>>> +    page = alloc_bootlog_page(qdev);
>>> +    if (!page)
>>> +        return -ENOMEM;
>>> +
>>> +    return 0;
>>> +}
>>> +
>>> +static void *bootlog_get_space(struct qaic_device *qdev, unsigned int size)
>>> +{
>>> +    struct bootlog_page *page;
>>> +
>>> +    page = list_last_entry(&qdev->bootlog, struct bootlog_page, node);
>>> +
>>> +    if (size > page->size - sizeof(*page))
>> Not critical but would be safer to use this condition: "sizeof(*page) + size 
>> > page->size"
> 
> I disagree.  Your suggestion would appear to have the potential to overflow 
> because it is doing a calculation based on an untrusted value (the size 
> parameter).  The current code restructures the check to avoid this.
> 
> What would be safer is to utilize size_add(), which I think is better than 
> either the current code, or your suggestion, and is what I will implement.

Yeah, size_add() seems to be the best solution here.

>>
>>> +        return NULL;
>>> +
>>> +    if (page->offset + size > page->size) {
>>> +        page = alloc_bootlog_page(qdev);
>>> +        if (!page)
>>> +            return NULL;
>>> +    }
>>> +
>>> +    return (void *)page + page->offset;
>>> +}
>>> +
>>> +static void bootlog_commit(struct qaic_device *qdev, unsigned int size)
>>> +{
>>> +    struct bootlog_page *page;
>>> +
>>> +    page = list_last_entry(&qdev->bootlog, struct bootlog_page, node);
>>> +
>>> +    page->offset += size;
>>> +}
>>> +
>>> +static void bootlog_log(struct work_struct *work)
>>> +{
>>> +    struct bootlog_msg *msg = container_of(work, struct bootlog_msg, work);
>>> +    unsigned int len = strlen(msg->str) + 1;
>>> +    struct qaic_device *qdev = msg->qdev;
>>> +    void *log;
>>> +
>>> +    mutex_lock(&qdev->bootlog_mutex);
>>> +    log = bootlog_get_space(qdev, len);
>>> +    if (log) {
>>> +        memcpy(log, msg, len);
>>> +        bootlog_commit(qdev, len);
>>> +    }
>>> +    mutex_unlock(&qdev->bootlog_mutex);
>>> +
>>> +    if (mhi_queue_buf(qdev->bootlog_ch, DMA_FROM_DEVICE, msg, 
>>> BOOTLOG_MSG_SIZE, MHI_EOT))
>>> +        devm_kfree(&qdev->pdev->dev, msg);
>> You are freeing `struct work` while still in work callback. This is unsafe.
>> See https://elixir.bootlin.com/linux/v6.8/source/kernel/workqueue.c#L2564.
>> Work ptr is kept in busy_hash after the callback has finished and may 
>> still be accessed.
> 
> Documentation says that is permitted - 
> https://elixir.bootlin.com/linux/v6.8/source/kernel/workqueue.c#L2548
> 
> Also, the framework code documents that the struct work cannot be accessed 
> after the callback is invoked - 
> https://elixir.bootlin.com/linux/v6.8/source/kernel/workqueue.c#L2635

It looks to me as if find_worker_executing_work() may access the work data, but 
maybe I'm misinterpreting the code.
Anyway, this would be a kernel bug rather than yours.

>>
>>> +}
>>> +
>>> +static int qaic_bootlog_mhi_probe(struct mhi_device *mhi_dev, const struct 
>>> mhi_device_id *id)
>>> +{
>>> +    struct qaic_device *qdev = 
>>> pci_get_drvdata(to_pci_dev(mhi_dev->mhi_cntrl->cntrl_dev));
>>> +    struct bootlog_msg *msg;
>>> +    int i, ret;
>>> +
>>> +    qdev->bootlog_wq = alloc_ordered_workqueue("qaic_bootlog", 0);
>>> +    if (!qdev->bootlog_wq) {
>>> +        ret = -ENOMEM;
>>> +        goto out;
>>> +    }
>>> +
>>> +    mutex_lock(&qdev->bootlog_mutex);
>> Looks like locking should be inside reset_bootlog(), like in other places.
> 
> Will do.
> 
>>
>>> +    ret = reset_bootlog(qdev);
>>> +    mutex_unlock(&qdev->bootlog_mutex);
>>> +    if (ret)
>>> +        goto destroy_workqueue;
>>> +
>>> +    ret = mhi_prepare_for_transfer(mhi_dev);
>>> +    if (ret)
>>> +        goto destroy_workqueue;
>>> +
>>> +    for (i = 0; i < BOOTLOG_POOL_SIZE; i++) {
>>> +        msg = devm_kzalloc(&qdev->pdev->dev, sizeof(*msg), GFP_KERNEL);
>>> +        if (!msg) {
>>> +            ret = -ENOMEM;
>>> +            goto mhi_unprepare;
>>> +        }
>>> +
>>> +        msg->qdev = qdev;
>>> +        INIT_WORK(&msg->work, bootlog_log);
>>> +
>>> +        ret = mhi_queue_buf(mhi_dev, DMA_FROM_DEVICE, msg, 
>>> BOOTLOG_MSG_SIZE, MHI_EOT);
>>> +        if (ret)
>>> +            goto mhi_unprepare;
>>> +    }
>>> +
>>> +    dev_set_drvdata(&mhi_dev->dev, qdev);
>>> +    qdev->bootlog_ch = mhi_dev;
>>> +    return 0;
>>> +
>>> +mhi_unprepare:
>>> +    mhi_unprepare_from_transfer(mhi_dev);
>>> +destroy_workqueue:
>>> +    flush_workqueue(qdev->bootlog_wq);
>>> +    destroy_workqueue(qdev->bootlog_wq);
>>> +out:
>>> +    return ret;
>>> +}
>>> +
>>> +static void qaic_bootlog_mhi_remove(struct mhi_device *mhi_dev)
>>> +{
>>> +    struct qaic_device *qdev;
>>> +
>>> +    qdev = dev_get_drvdata(&mhi_dev->dev);
>>> +
>>> +    mhi_unprepare_from_transfer(qdev->bootlog_ch);
>>> +    flush_workqueue(qdev->bootlog_wq);
>>> +    destroy_workqueue(qdev->bootlog_wq);
>>> +    qdev->bootlog_ch = NULL;
>>> +}
>>> +
>>> +static void qaic_bootlog_mhi_ul_xfer_cb(struct mhi_device *mhi_dev, struct 
>>> mhi_result *mhi_result)
>>> +{
>>> +}
>>> +
>>> +static void qaic_bootlog_mhi_dl_xfer_cb(struct mhi_device *mhi_dev, struct 
>>> mhi_result *mhi_result)
>>> +{
>>> +    struct qaic_device *qdev = dev_get_drvdata(&mhi_dev->dev);
>>> +    struct bootlog_msg *msg = mhi_result->buf_addr;
>>> +
>>> +    if (mhi_result->transaction_status) {
>>> +        devm_kfree(&qdev->pdev->dev, msg);
>>> +        return;
>>> +    }
>>> +
>>> +    /* Force a null at the end of the transferred string */
>>> +    msg->str[mhi_result->bytes_xferd - 1] = 0;
>> Is it guaranteed that bytes_xferd will always be within valid range here?
> 
> Yes.  We provide the buffer size when we queue it to MHI.  When the buffer 
> comes back, before this callback, MHI will clamp the transferred size to the 
> buffer size.
> 
>>
>>> +
>>> +    queue_work(qdev->bootlog_wq, &msg->work);
>>> +}
>>> +
>>> +static const struct mhi_device_id qaic_bootlog_mhi_match_table[] = {
>>> +    { .chan = "QAIC_LOGGING", },
>>> +    {},
>>> +};
>>> +
>>> +static struct mhi_driver qaic_bootlog_mhi_driver = {
>>> +    .id_table = qaic_bootlog_mhi_match_table,
>>> +    .remove = qaic_bootlog_mhi_remove,
>>> +    .probe = qaic_bootlog_mhi_probe,
>>> +    .ul_xfer_cb = qaic_bootlog_mhi_ul_xfer_cb,
>>> +    .dl_xfer_cb = qaic_bootlog_mhi_dl_xfer_cb,
>>> +    .driver = {
>>> +        .name = "qaic_bootlog",
>>> +    },
>>> +};
>>> +
>>> +int qaic_bootlog_register(void)
>>> +{
>>> +    return mhi_driver_register(&qaic_bootlog_mhi_driver);
>>> +}
>>> +
>>> +void qaic_bootlog_unregister(void)
>>> +{
>>> +    mhi_driver_unregister(&qaic_bootlog_mhi_driver);
>>> +}
>>> diff --git a/drivers/accel/qaic/qaic_debugfs.h 
>>> b/drivers/accel/qaic/qaic_debugfs.h
>>> new file mode 100644
>>> index 000000000000..ea3fd1a88405
>>> --- /dev/null
>>> +++ b/drivers/accel/qaic/qaic_debugfs.h
>>> @@ -0,0 +1,20 @@
>>> +/* SPDX-License-Identifier: GPL-2.0-only */
>>> +
>>> +/* Copyright (c) 2020, The Linux Foundation. All rights reserved. */
>>> +/* Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights 
>>> reserved. */
>>> +
>>> +#ifndef __QAIC_DEBUGFS_H__
>>> +#define __QAIC_DEBUGFS_H__
>>> +
>>> +#include <drm/drm_file.h>
>>> +
>>> +#ifdef CONFIG_DEBUG_FS
>>> +int qaic_bootlog_register(void);
>>> +void qaic_bootlog_unregister(void);
>>> +void qaic_debugfs_init(struct qaic_drm_device *qddev);
>>> +#else
>>> +int qaic_bootlog_register(void) { return 0; }
>>> +void qaic_bootlog_unregister(void) {}
>>> +void qaic_debugfs_init(struct qaic_drm_device *qddev) {}
>>> +#endif /* CONFIG_DEBUG_FS */
>>> +#endif /* __QAIC_DEBUGFS_H__ */
>>> diff --git a/drivers/accel/qaic/qaic_drv.c b/drivers/accel/qaic/qaic_drv.c
>>> index d1a632dbaec6..f072edb74f22 100644
>>> --- a/drivers/accel/qaic/qaic_drv.c
>>> +++ b/drivers/accel/qaic/qaic_drv.c
>>> @@ -28,6 +28,7 @@
>>>     #include "mhi_controller.h"
>>>   #include "qaic.h"
>>> +#include "qaic_debugfs.h"
>>>   #include "qaic_timesync.h"
>>>     MODULE_IMPORT_NS(DMA_BUF);
>>> @@ -229,8 +230,12 @@ static int qaic_create_drm_device(struct qaic_device 
>>> *qdev, s32 partition_id)
>>>       qddev->partition_id = partition_id;
>>>         ret = drm_dev_register(drm, 0);
>>> -    if (ret)
>>> +    if (ret) {
>>>           pci_dbg(qdev->pdev, "drm_dev_register failed %d\n", ret);
>>> +        return ret;
>>> +    }
>>> +
>>> +    qaic_debugfs_init(qddev);
>>>         return ret;
>>>   }
>>> @@ -380,6 +385,9 @@ static struct qaic_device *create_qdev(struct pci_dev 
>>> *pdev, const struct pci_de
>>>       if (ret)
>>>           return NULL;
>>>       ret = drmm_mutex_init(drm, &qdev->cntl_mutex);
>>> +    if (ret)
>>> +        return NULL;
>>> +    ret = drmm_mutex_init(drm, &qdev->bootlog_mutex);
>>>       if (ret)
>>>           return NULL;
>>>   @@ -399,6 +407,7 @@ static struct qaic_device *create_qdev(struct pci_dev 
>>> *pdev, const struct pci_de
>>>       qddev->qdev = qdev;
>>>         INIT_LIST_HEAD(&qdev->cntl_xfer_list);
>>> +    INIT_LIST_HEAD(&qdev->bootlog);
>>>       INIT_LIST_HEAD(&qddev->users);
>>>         for (i = 0; i < qdev->num_dbc; ++i) {
>>> @@ -639,6 +648,10 @@ static int __init qaic_init(void)
>>>       if (ret)
>>>           pr_debug("qaic: qaic_timesync_init failed %d\n", ret);
>>>   +    ret = qaic_bootlog_register();
>>> +    if (ret)
>>> +        pr_debug("qaic: qaic_bootlog_register failed %d\n", ret);
>>> +
>>>       return 0;
>>>     free_pci:
>>> @@ -664,6 +677,7 @@ static void __exit qaic_exit(void)
>>>        * reinitializing the link_up state after the cleanup is done.
>>>        */
>>>       link_up = true;
>>> +    qaic_bootlog_unregister();
>>>       qaic_timesync_deinit();
>>>       mhi_driver_unregister(&qaic_mhi_driver);
>>>       pci_unregister_driver(&qaic_pci_driver);
> 

Reviewed-by: Jacek Lawrynowicz <jacek.lawrynow...@linux.intel.com>

Reply via email to