Hello, Please find my comments below:
On 6/16/2026 11:01 AM, Mathieu Poirier wrote: > On Wed, Jun 10, 2026 at 11:27:11AM -0700, Tanmay Shah wrote: >> Remote processor will report the crash reason via the resource table >> and notify the host via mailbox notification. The host checks this >> crash reason on every mailbox notification from the remote and report >> to the rproc core framework. Then the rproc core framework will start >> the recovery process. >> >> Signed-off-by: Tanmay Shah <[email protected]> >> --- >> >> changes in v5 >> - use local variable to access crash report pointer >> - End crash report string with '\0' without relying on fw >> >> drivers/remoteproc/xlnx_r5_remoteproc.c | 73 ++++++++++++++++++++++++- >> 1 file changed, 72 insertions(+), 1 deletion(-) >> >> diff --git a/drivers/remoteproc/xlnx_r5_remoteproc.c >> b/drivers/remoteproc/xlnx_r5_remoteproc.c >> index 3349d1877751..86afff9f3b40 100644 >> --- a/drivers/remoteproc/xlnx_r5_remoteproc.c >> +++ b/drivers/remoteproc/xlnx_r5_remoteproc.c >> @@ -112,6 +112,10 @@ struct rsc_tbl_data { >> const uintptr_t rsc_tbl; >> } __packed; >> >> +enum xlnx_rproc_fw_rsc { >> + XLNX_RPROC_FW_CRASH_REASON = RSC_VENDOR_START, > > I would call this XLNX_RPROC_FW_CRASH_REPORT > Ack. >> +}; >> + >> /* >> * Hardcoded TCM bank values. This will stay in driver to maintain backward >> * compatibility with device-tree that does not have TCM information. >> @@ -131,9 +135,27 @@ static const struct mem_bank_data >> zynqmp_tcm_banks_lockstep[] = { >> {0xffe30000UL, 0x30000, 0x10000UL, PD_R5_1_BTCM, "btcm1"}, >> }; >> >> +#define CRASH_REASON_STR_LEN 16 >> + >> +/** >> + * struct xlnx_rproc_crash_report - resource to know crash status and reason >> + * >> + * @version: version of this resource >> + * @crash_state: if true, the rproc is notifying crash, time to recover >> + * @crash_reason: number to describe reason of crash >> + * @crash_reason_str: short string description of crash reason >> + */ >> +struct xlnx_rproc_crash_report { >> + u8 version; >> + u8 crash_state; >> + u8 crash_reason; > > Do you need 2 variables? Would it be possible for @crash_reason != 0 to > indicate a crash, making @cash_state irrelevant? > Actually initially I had defined crash_reason only, but when I looked at how crash reason/type is defined in the kernel, I got idea to keep crash_state separate than crash_reason: https://git.kernel.org/pub/scm/linux/kernel/git/remoteproc/linux.git/tree/include/linux/remoteproc.h?h=for-next#n183 enum rproc_crash_type { RPROC_MMUFAULT, RPROC_WATCHDOG, RPROC_FATAL_ERROR, }; So, if crash_reason is defined like above, then it's easy to make mistake and not define no_crash = 0. Different firmware projects can treat crash_reason differently. I would like to keep crash_state and crash_reason separate and not impose policy on firmware projects on how to define crash_reason. >> + char crash_reason_str[CRASH_REASON_STR_LEN]; >> +} __packed; >> + >> /** >> * struct zynqmp_r5_core - remoteproc core's internal data >> * >> + * @crash_report: rproc crash state and reason >> * @rsc_tbl_va: resource table virtual address >> * @sram: Array of sram memories assigned to this core >> * @num_sram: number of sram for this core >> @@ -147,6 +169,7 @@ static const struct mem_bank_data >> zynqmp_tcm_banks_lockstep[] = { >> * @ipi: pointer to mailbox information >> */ >> struct zynqmp_r5_core { >> + struct xlnx_rproc_crash_report *crash_report; >> void __iomem *rsc_tbl_va; >> struct zynqmp_sram_bank *sram; >> int num_sram; >> @@ -204,11 +227,27 @@ static int event_notified_idr_cb(int id, void *ptr, >> void *data) >> */ >> static void handle_event_notified(struct work_struct *work) >> { >> + struct xlnx_rproc_crash_report *report; >> + struct zynqmp_r5_core *r5_core; >> struct mbox_info *ipi; >> struct rproc *rproc; >> >> ipi = container_of(work, struct mbox_info, mbox_work); >> rproc = ipi->r5_core->rproc; >> + r5_core = ipi->r5_core; >> + report = r5_core->crash_report; >> + >> + /* report crash only if expected */ >> + if (report && report->crash_state) { >> + if (rproc->state == RPROC_ATTACHED || rproc->state == >> RPROC_RUNNING) { >> + report->crash_reason_str[CRASH_REASON_STR_LEN - 1] = >> '\0'; >> + dev_warn(&rproc->dev, "crash reason id: %d %s\n", >> + report->crash_reason, >> report->crash_reason_str); >> + rproc_report_crash(rproc, RPROC_FATAL_ERROR); >> + report->crash_state = false; >> + return; >> + } >> + } >> >> /* >> * We only use IPI for interrupt. The RPU firmware side may or may >> @@ -448,6 +487,13 @@ static int zynqmp_r5_rproc_stop(struct rproc *rproc) >> if (ret) >> dev_err(r5_core->dev, "core force power down failed\n"); >> >> + /* >> + * Clear attach on recovery flag during stop operation. The next state >> + * of the remote processor is expected to be "Running" state. In this >> + * state boot recovery method must take place over attach on recovery. >> + */ >> + test_and_clear_bit(RPROC_FEAT_ATTACH_ON_RECOVERY, rproc->features); > > Why is there a need to play set/clear the RPROC_FEAT_ATTACH_ON_RECOVERY flag > here and in zynqmp_r5_attach() and zynqmp_r5_detach()? I think we talked > about > that before but can't remember the outcome of that conversation. > The remote processor can go through following life cycle: attach() -> stop() -> start(). When this happens, ATTACH_ON_RECOVERY is not valid anymore. At this point, it should be BOOT_RECOVERY which is indicated by clearing ATTACH_RECOVERY flag. That is why it is important to clear this bit on stop(). Now in detach() it is not needed. I am just clearing the flag as part of a cleanup sequence. >> + >> return ret; >> } >> >> @@ -869,6 +915,9 @@ static int zynqmp_r5_get_rsc_table_va(struct >> zynqmp_r5_core *r5_core) >> >> static int zynqmp_r5_attach(struct rproc *rproc) >> { >> + /* Enable attach on recovery method. Clear it during rproc stop. */ >> + rproc_set_feature(rproc, RPROC_FEAT_ATTACH_ON_RECOVERY); >> + >> dev_dbg(&rproc->dev, "rproc %d attached\n", rproc->index); >> >> return 0; >> @@ -883,9 +932,30 @@ static int zynqmp_r5_detach(struct rproc *rproc) >> */ >> zynqmp_r5_rproc_kick(rproc, 0); >> >> + clear_bit(RPROC_FEAT_ATTACH_ON_RECOVERY, rproc->features); >> + >> return 0; >> } >> >> +static int zynqmp_r5_handle_rsc(struct rproc *rproc, u32 rsc_type, void >> *rsc, >> + int offset, int avail) >> +{ >> + struct zynqmp_r5_core *r5_core = rproc->priv; >> + void *rsc_offset = (r5_core->rsc_tbl_va + offset); >> + > > if (rsc_type != XLNX_RPROC_FW_CRASH_REASON) > return RSC_IGNORED; > Ack. >> + if (rsc_type == XLNX_RPROC_FW_CRASH_REASON) { >> + r5_core->crash_report = rsc_offset; >> + /* reset all values */ >> + r5_core->crash_report->crash_state = false; >> + r5_core->crash_report->crash_reason = 0; >> + r5_core->crash_report->crash_reason_str[0] = '\0'; >> + } else { >> + return RSC_IGNORED; >> + } >> + >> + return RSC_HANDLED; >> +} >> + >> static const struct rproc_ops zynqmp_r5_rproc_ops = { >> .prepare = zynqmp_r5_rproc_prepare, >> .unprepare = zynqmp_r5_rproc_unprepare, >> @@ -900,6 +970,7 @@ static const struct rproc_ops zynqmp_r5_rproc_ops = { >> .get_loaded_rsc_table = zynqmp_r5_get_loaded_rsc_table, >> .attach = zynqmp_r5_attach, >> .detach = zynqmp_r5_detach, >> + .handle_rsc = zynqmp_r5_handle_rsc, >> }; >> >> /** >> @@ -939,7 +1010,7 @@ static struct zynqmp_r5_core >> *zynqmp_r5_alloc_rproc_core(struct device *cdev) >> >> rproc_coredump_set_elf_info(r5_rproc, ELFCLASS32, EM_ARM); >> >> - r5_rproc->recovery_disabled = true; >> + r5_rproc->recovery_disabled = false; > > I'm good with the overall architecture of this set. > Okay. If above comments looks good, I will send v6 soon. > Regards, > Mathieu > >> r5_rproc->has_iommu = false; >> r5_rproc->auto_boot = false; >> >> -- >> 2.34.1 >>

