On Tue, 16 Jun 2026 at 13:52, Shah, Tanmay <[email protected]> wrote:
>
> Hello,
>
> Please find my comments below:
>
> On 6/16/2026 11:01 AM, Mathieu Poirier wrote:
> > On Wed, Jun 10, 2026 at 11:27:11AM -0700, Tanmay Shah wrote:
> >> Remote processor will report the crash reason via the resource table
> >> and notify the host via mailbox notification. The host checks this
> >> crash reason on every mailbox notification from the remote and report
> >> to the rproc core framework. Then the rproc core framework will start
> >> the recovery process.
> >>
> >> Signed-off-by: Tanmay Shah <[email protected]>
> >> ---
> >>
> >> changes in v5
> >>   - use local variable to access crash report pointer
> >>   - End crash report string with '\0' without relying on fw
> >>
> >>  drivers/remoteproc/xlnx_r5_remoteproc.c | 73 ++++++++++++++++++++++++-
> >>  1 file changed, 72 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/drivers/remoteproc/xlnx_r5_remoteproc.c 
> >> b/drivers/remoteproc/xlnx_r5_remoteproc.c
> >> index 3349d1877751..86afff9f3b40 100644
> >> --- a/drivers/remoteproc/xlnx_r5_remoteproc.c
> >> +++ b/drivers/remoteproc/xlnx_r5_remoteproc.c
> >> @@ -112,6 +112,10 @@ struct rsc_tbl_data {
> >>      const uintptr_t rsc_tbl;
> >>  } __packed;
> >>
> >> +enum xlnx_rproc_fw_rsc {
> >> +    XLNX_RPROC_FW_CRASH_REASON = RSC_VENDOR_START,
> >
> > I would call this XLNX_RPROC_FW_CRASH_REPORT
> >
>
> Ack.
>
> >> +};
> >> +
> >>  /*
> >>   * Hardcoded TCM bank values. This will stay in driver to maintain 
> >> backward
> >>   * compatibility with device-tree that does not have TCM information.
> >> @@ -131,9 +135,27 @@ static const struct mem_bank_data 
> >> zynqmp_tcm_banks_lockstep[] = {
> >>      {0xffe30000UL, 0x30000, 0x10000UL, PD_R5_1_BTCM, "btcm1"},
> >>  };
> >>
> >> +#define CRASH_REASON_STR_LEN 16
> >> +
> >> +/**
> >> + * struct xlnx_rproc_crash_report - resource to know crash status and 
> >> reason
> >> + *
> >> + * @version: version of this resource
> >> + * @crash_state: if true, the rproc is notifying crash, time to recover
> >> + * @crash_reason: number to describe reason of crash
> >> + * @crash_reason_str: short string description of crash reason
> >> + */
> >> +struct xlnx_rproc_crash_report {
> >> +    u8 version;
> >> +    u8 crash_state;
> >> +    u8 crash_reason;
> >
> > Do you need 2 variables?  Would it be possible for @crash_reason != 0 to
> > indicate a crash, making @cash_state irrelevant?
> >
>
> Actually initially I had defined crash_reason only, but when I looked at
> how crash reason/type is defined in the kernel, I got idea to keep
> crash_state separate than crash_reason:
>
> https://git.kernel.org/pub/scm/linux/kernel/git/remoteproc/linux.git/tree/include/linux/remoteproc.h?h=for-next#n183
>
> enum rproc_crash_type {
>         RPROC_MMUFAULT,
>         RPROC_WATCHDOG,
>         RPROC_FATAL_ERROR,
> };
>
> So, if crash_reason is defined like above, then it's easy to make
> mistake and not define no_crash = 0. Different firmware projects can
> treat crash_reason differently. I would like to keep crash_state and
> crash_reason separate and not impose policy on firmware projects on how
> to define crash_reason.
>

Thanks for the explanation.  I'll think about that a little longer.

> >> +    char crash_reason_str[CRASH_REASON_STR_LEN];
> >> +} __packed;
> >> +
> >>  /**
> >>   * struct zynqmp_r5_core - remoteproc core's internal data
> >>   *
> >> + * @crash_report: rproc crash state and reason
> >>   * @rsc_tbl_va: resource table virtual address
> >>   * @sram: Array of sram memories assigned to this core
> >>   * @num_sram: number of sram for this core
> >> @@ -147,6 +169,7 @@ static const struct mem_bank_data 
> >> zynqmp_tcm_banks_lockstep[] = {
> >>   * @ipi: pointer to mailbox information
> >>   */
> >>  struct zynqmp_r5_core {
> >> +    struct xlnx_rproc_crash_report *crash_report;
> >>      void __iomem *rsc_tbl_va;
> >>      struct zynqmp_sram_bank *sram;
> >>      int num_sram;
> >> @@ -204,11 +227,27 @@ static int event_notified_idr_cb(int id, void *ptr, 
> >> void *data)
> >>   */
> >>  static void handle_event_notified(struct work_struct *work)
> >>  {
> >> +    struct xlnx_rproc_crash_report *report;
> >> +    struct zynqmp_r5_core *r5_core;
> >>      struct mbox_info *ipi;
> >>      struct rproc *rproc;
> >>
> >>      ipi = container_of(work, struct mbox_info, mbox_work);
> >>      rproc = ipi->r5_core->rproc;
> >> +    r5_core = ipi->r5_core;
> >> +    report = r5_core->crash_report;
> >> +
> >> +    /* report crash only if expected */
> >> +    if (report && report->crash_state) {
> >> +            if (rproc->state == RPROC_ATTACHED || rproc->state == 
> >> RPROC_RUNNING) {
> >> +                    report->crash_reason_str[CRASH_REASON_STR_LEN - 1] = 
> >> '\0';
> >> +                    dev_warn(&rproc->dev, "crash reason id: %d %s\n",
> >> +                             report->crash_reason, 
> >> report->crash_reason_str);
> >> +                    rproc_report_crash(rproc, RPROC_FATAL_ERROR);
> >> +                    report->crash_state = false;
> >> +                    return;
> >> +            }
> >> +    }
> >>
> >>      /*
> >>       * We only use IPI for interrupt. The RPU firmware side may or may
> >> @@ -448,6 +487,13 @@ static int zynqmp_r5_rproc_stop(struct rproc *rproc)
> >>      if (ret)
> >>              dev_err(r5_core->dev, "core force power down failed\n");
> >>
> >> +    /*
> >> +     * Clear attach on recovery flag during stop operation. The next state
> >> +     * of the remote processor is expected to be "Running" state. In this
> >> +     * state boot recovery method must take place over attach on recovery.
> >> +     */
> >> +    test_and_clear_bit(RPROC_FEAT_ATTACH_ON_RECOVERY, rproc->features);
> >
> > Why is there a need to play set/clear the RPROC_FEAT_ATTACH_ON_RECOVERY flag
> > here and in zynqmp_r5_attach() and zynqmp_r5_detach()?  I think we talked 
> > about
> > that before but can't remember the outcome of that conversation.
> >
>
> The remote processor can go through following life cycle:
>
> attach() -> stop() -> start(). When this happens, ATTACH_ON_RECOVERY is
> not valid anymore. At this point, it should be BOOT_RECOVERY which is
> indicated by clearing ATTACH_RECOVERY flag. That is why it is important
> to clear this bit on stop().
>

Here too I'd like to review the current state machine.  I'll get back
to you shortly.

> Now in detach() it is not needed. I am just clearing the flag as part of
> a cleanup sequence.
>
>  >> +
> >>      return ret;
> >>  }
> >>
> >> @@ -869,6 +915,9 @@ static int zynqmp_r5_get_rsc_table_va(struct 
> >> zynqmp_r5_core *r5_core)
> >>
> >>  static int zynqmp_r5_attach(struct rproc *rproc)
> >>  {
> >> +    /* Enable attach on recovery method. Clear it during rproc stop. */
> >> +    rproc_set_feature(rproc, RPROC_FEAT_ATTACH_ON_RECOVERY);
> >> +
> >>      dev_dbg(&rproc->dev, "rproc %d attached\n", rproc->index);
> >>
> >>      return 0;
> >> @@ -883,9 +932,30 @@ static int zynqmp_r5_detach(struct rproc *rproc)
> >>       */
> >>      zynqmp_r5_rproc_kick(rproc, 0);
> >>
> >> +    clear_bit(RPROC_FEAT_ATTACH_ON_RECOVERY, rproc->features);
> >> +
> >>      return 0;
> >>  }
> >>
> >> +static int zynqmp_r5_handle_rsc(struct rproc *rproc, u32 rsc_type, void 
> >> *rsc,
> >> +                            int offset, int avail)
> >> +{
> >> +    struct zynqmp_r5_core *r5_core = rproc->priv;
> >> +    void *rsc_offset = (r5_core->rsc_tbl_va + offset);
> >> +
> >
> >         if (rsc_type != XLNX_RPROC_FW_CRASH_REASON)
> >                 return RSC_IGNORED;
> >
>
> Ack.
>
> >> +    if (rsc_type == XLNX_RPROC_FW_CRASH_REASON) {
> >> +            r5_core->crash_report = rsc_offset;
> >> +            /* reset all values */
> >> +            r5_core->crash_report->crash_state = false;
> >> +            r5_core->crash_report->crash_reason = 0;
> >> +            r5_core->crash_report->crash_reason_str[0] = '\0';
> >> +    } else {
> >> +            return RSC_IGNORED;
> >> +    }
> >> +
> >> +    return RSC_HANDLED;
> >> +}
> >> +
> >>  static const struct rproc_ops zynqmp_r5_rproc_ops = {
> >>      .prepare        = zynqmp_r5_rproc_prepare,
> >>      .unprepare      = zynqmp_r5_rproc_unprepare,
> >> @@ -900,6 +970,7 @@ static const struct rproc_ops zynqmp_r5_rproc_ops = {
> >>      .get_loaded_rsc_table = zynqmp_r5_get_loaded_rsc_table,
> >>      .attach         = zynqmp_r5_attach,
> >>      .detach         = zynqmp_r5_detach,
> >> +    .handle_rsc     = zynqmp_r5_handle_rsc,
> >>  };
> >>
> >>  /**
> >> @@ -939,7 +1010,7 @@ static struct zynqmp_r5_core 
> >> *zynqmp_r5_alloc_rproc_core(struct device *cdev)
> >>
> >>      rproc_coredump_set_elf_info(r5_rproc, ELFCLASS32, EM_ARM);
> >>
> >> -    r5_rproc->recovery_disabled = true;
> >> +    r5_rproc->recovery_disabled = false;
> >
> > I'm good with the overall architecture of this set.
> >
>
> Okay. If above comments looks good, I will send v6 soon.
>
> > Regards,
> > Mathieu
> >
> >>      r5_rproc->has_iommu = false;
> >>      r5_rproc->auto_boot = false;
> >>
> >> --
> >> 2.34.1
> >>
>

Reply via email to