Problem with ps and kthread: Reproducible crash

2023-05-11 Thread Sebastien Lorquet

Hello,

I have a stm32h7 board, based on the stm32h743zi2 nucleo.

I have activated the watchdog.

The method used to create the kthread was copied from this file:

https://github.com/apache/nuttx/blob/master/boards/arm/stm32/photon/src/stm32_wdt.c#LL146C7-L146C7

The watchdog works and the system is stable.


But now, the nsh 'ps' command ends up with a crash when trying to list 
the kthread that resets the watchdog.



The problem has appeared between current trunk and revision 
13d823f30710e6fabd3d6429a03bc37e1086c9e7



Here is the GDB session, a bit after cmd_ps is entered:

(pid 4 is the watchdog maintenance kthread created like in the file above)


open (path=0x3800c770 "/proc/4/cmdline", oflags=1) at vfs/fs_open.c:447
447   if (fd < 0)
(gdb) n
453   leave_cancellation_point();
(gdb)
454   return fd;
(gdb)
455 }
(gdb)
nsh_readfile (vtbl=0x38005688, cmd=0x807ab7c "ps", filepath=0x3800c770 
"/proc/4/cmdline", buffer=0x380056b0 "  0.0%", buflen=512)

    at nsh_fsutils.c:219
219   if (fd < 0)
(gdb) print fd
$1 = 4
(gdb) n
229   ntotal    = 0;  /* No bytes read yet */
(gdb)
230   *buffer   = '\0';   /* NUL terminate the empty buffer */
(gdb)
231   bufptr    = buffer; /* Working pointer */
(gdb)
232   remaining = buflen - 1; /* Reserve one byte for a NUL 
terminator */

(gdb)
233   ret   = ERROR;  /* Assume failure */
(gdb)
237   nread = read(fd, bufptr, remaining);
(gdb) s
read (fd=4, buf=0x380056b0, nbytes=511) at vfs/fs_read.c:166
166   enter_cancellation_point();
(gdb) n
170   ret = nx_read(fd, buf, nbytes);
(gdb) s
nx_read (fd=4, buf=0x380056b0, nbytes=511) at vfs/fs_read.c:132
132   ret = (ssize_t)fs_getfilep(fd, &filep);
(gdb) n
133   if (ret < 0)
(gdb)
140   return file_read(filep, buf, nbytes);
(gdb) s
file_read (filep=0x38002e40, buf=0x380056b0, nbytes=511) at vfs/fs_read.c:67
67    int ret = -EBADF;
(gdb) n
69    DEBUGASSERT(filep);
(gdb)
70    inode = filep->f_inode;
(gdb)
74    if ((filep->f_oflags & O_RDOK) == 0)
(gdb)
85    else if (inode != NULL && inode->u.i_ops && inode->u.i_ops->read)
(gdb)
92    ret = (int)inode->u.i_ops->read(filep,
(gdb) s
procfs_read (filep=0x38002e40, buffer=0x380056b0 "", buflen=511) at 
procfs/fs_procfs.c:445

445   handler = (FAR struct procfs_file_s *)filep->f_priv;
(gdb) n
446   DEBUGASSERT(handler);
(gdb)
450   return handler->procfsentry->ops->read(filep, buffer, buflen);
(gdb) s
proc_read (filep=0x38002e40, buffer=0x380056b0 "", buflen=511) at 
procfs/fs_procfsproc.c:1552

1552  procfile = (FAR struct proc_file_s *)filep->f_priv;
(gdb) n
1553  DEBUGASSERT(procfile != NULL);
(gdb)
1557  tcb = nxsched_get_tcb(procfile->pid);
(gdb)
1558  if (tcb == NULL)
(gdb) print tcb
$2 = (struct tcb_s *) 0x38003f28
(gdb) print *tcb
$3 = {flink = 0x38008640, blink = 0x0, group = 0x38003fe0, pid = 4, 
sched_priority = 100 'd', init_priority = 100 'd',
  start = 0x802fc91 , entry = {pthread = 0x806ecf1 
, main = 0x806ecf1 }, task_state = 6 '\006',
  flags = 4138, lockcount = 0, cpcount = 0, errcode = 0, timeslice = 
200, waitdog = {next = 0x38005a40, arg = 939540264,
    func = 0x802ecdd , lag = 316}, adj_stack_size = 960, 
stack_alloc_ptr = 0x38004328, stack_base_ptr = 0x38004368, waitobj = 0x0,
  sigprocmask = {_elem = {0, 0}}, sigwaitmask = {_elem = {0, 0}}, 
sigpendactionq = {head = 0x0, tail = 0x0}, sigpostedq = {head = 0x0,
    tail = 0x0}, sigunbinfo = {si_signo = 255 '\377', si_code = 2 
'\002', si_errno = 110 'n', si_value = {sival_int = 0, sival_ptr = 0x0},
    si_user = 0x0}, mhead = 0x0, ticks = 0, xcp = {sigdeliver = 0x0, 
saved_regs = 0x0, regs = 0x3800449c},

  name = "watchdog\000\000\000\000\000\000\000\000"}
(gdb) n
1566  switch (procfile->node->node)
(gdb) n
1573  ret = proc_cmdline(procfile, tcb, buffer, buflen, 
filep->f_pos);

(gdb) s
proc_cmdline (procfile=0x3800caa0, tcb=0x38003f28, buffer=0x380056b0 "", 
buflen=511, offset=0) at procfs/fs_procfsproc.c:664

664   remaining = buflen;
(gdb) n
665   totalsize = 0;
(gdb)
670   name   = tcb->name;
(gdb)
674   linesize   = strlen(name);
(gdb)
675   memcpy(procfile->line, name, linesize);
(gdb) print linesize
$4 = 8
(gdb) n
676   copysize   = procfs_memcpy(procfile->line, linesize, buffer, 
remaining,

(gdb) n
679   totalsize += copysize;
(gdb) print copysize
$5 = 8
(gdb) n
680   buffer    += copysize;
(gdb) n
681   remaining -= copysize;
(gdb) n
683   if (totalsize >= buflen)
(gdb) print remaining
$6 = 503
(gdb) n
690   linesize   = group_argvstr(tcb, procfile->line, remaining);
(gdb) s
group_argvstr (tcb=0x38003f28, args=0x3800caac "watchdog", size=503) at 
group/group_argvstr.c:61

61    size_t n = 0;
(gdb) n
68    if (!tcb || !tcb->group || !tcb->group->tg_info)
(gdb)
84    if ((tcb->flags & TCB_FLAG_TTYPE_MASK) == TCB_FLAG_TTYPE_PTHREAD)
(gdb)
93    FAR char **argv = tcb->group->tg_info->argv + 1

Re: Problem with ps and kthread: Reproducible crash

2023-05-11 Thread raiden00pl
Have you tried increasing the stack size for NSH? A stack overflow in NSH
can cause some really weird hard faults that are not easy to diagnose.


czw., 11 maj 2023 o 18:34 Sebastien Lorquet 
napisał(a):

> Hello,
>
> I have a stm32h7 board, based on the stm32h743zi2 nucleo.
>
> I have activated the watchdog.
>
> The method to do the kthread was copied from this :
>
>
> https://github.com/apache/nuttx/blob/master/boards/arm/stm32/photon/src/stm32_wdt.c#LL146C7-L146C7
>
> The watchdog works, the system is stable
>
>
> But now, the nsh 'ps' command ends up with a crash when trying to list
> the kthread that resets the watchdog.
>
>
> The problem has appeared between current trunk and revision
> 13d823f30710e6fabd3d6429a03bc37e1086c9e7
>
>
> Here is the GDB session, a bit after cmd_ps is entered:
>
> (pid 4 is the watchdog maintenance kthread created like in the file above)
>
>
> open (path=0x3800c770 "/proc/4/cmdline", oflags=1) at vfs/fs_open.c:447
> 447   if (fd < 0)
> (gdb) n
> 453   leave_cancellation_point();
> (gdb)
> 454   return fd;
> (gdb)
> 455 }
> (gdb)
> nsh_readfile (vtbl=0x38005688, cmd=0x807ab7c "ps", filepath=0x3800c770
> "/proc/4/cmdline", buffer=0x380056b0 "  0.0%", buflen=512)
>  at nsh_fsutils.c:219
> 219   if (fd < 0)
> (gdb) print fd
> $1 = 4
> (gdb) n
> 229   ntotal= 0;  /* No bytes read yet */
> (gdb)
> 230   *buffer   = '\0';   /* NUL terminate the empty buffer */
> (gdb)
> 231   bufptr= buffer; /* Working pointer */
> (gdb)
> 232   remaining = buflen - 1; /* Reserve one byte for a NUL
> terminator */
> (gdb)
> 233   ret   = ERROR;  /* Assume failure */
> (gdb)
> 237   nread = read(fd, bufptr, remaining);
> (gdb) s
> read (fd=4, buf=0x380056b0, nbytes=511) at vfs/fs_read.c:166
> 166   enter_cancellation_point();
> (gdb) n
> 170   ret = nx_read(fd, buf, nbytes);
> (gdb) s
> nx_read (fd=4, buf=0x380056b0, nbytes=511) at vfs/fs_read.c:132
> 132   ret = (ssize_t)fs_getfilep(fd, &filep);
> (gdb) n
> 133   if (ret < 0)
> (gdb)
> 140   return file_read(filep, buf, nbytes);
> (gdb) s
> file_read (filep=0x38002e40, buf=0x380056b0, nbytes=511) at
> vfs/fs_read.c:67
> 67int ret = -EBADF;
> (gdb) n
> 69DEBUGASSERT(filep);
> (gdb)
> 70inode = filep->f_inode;
> (gdb)
> 74if ((filep->f_oflags & O_RDOK) == 0)
> (gdb)
> 85else if (inode != NULL && inode->u.i_ops && inode->u.i_ops->read)
> (gdb)
> 92ret = (int)inode->u.i_ops->read(filep,
> (gdb) s
> procfs_read (filep=0x38002e40, buffer=0x380056b0 "", buflen=511) at
> procfs/fs_procfs.c:445
> 445   handler = (FAR struct procfs_file_s *)filep->f_priv;
> (gdb) n
> 446   DEBUGASSERT(handler);
> (gdb)
> 450   return handler->procfsentry->ops->read(filep, buffer, buflen);
> (gdb) s
> proc_read (filep=0x38002e40, buffer=0x380056b0 "", buflen=511) at
> procfs/fs_procfsproc.c:1552
> 1552  procfile = (FAR struct proc_file_s *)filep->f_priv;
> (gdb) n
> 1553  DEBUGASSERT(procfile != NULL);
> (gdb)
> 1557  tcb = nxsched_get_tcb(procfile->pid);
> (gdb)
> 1558  if (tcb == NULL)
> (gdb) print tcb
> $2 = (struct tcb_s *) 0x38003f28
> (gdb) print *tcb
> $3 = {flink = 0x38008640, blink = 0x0, group = 0x38003fe0, pid = 4,
> sched_priority = 100 'd', init_priority = 100 'd',
>start = 0x802fc91 , entry = {pthread = 0x806ecf1
> , main = 0x806ecf1 }, task_state = 6 '\006',
>flags = 4138, lockcount = 0, cpcount = 0, errcode = 0, timeslice =
> 200, waitdog = {next = 0x38005a40, arg = 939540264,
>  func = 0x802ecdd , lag = 316}, adj_stack_size = 960,
> stack_alloc_ptr = 0x38004328, stack_base_ptr = 0x38004368, waitobj = 0x0,
>sigprocmask = {_elem = {0, 0}}, sigwaitmask = {_elem = {0, 0}},
> sigpendactionq = {head = 0x0, tail = 0x0}, sigpostedq = {head = 0x0,
>  tail = 0x0}, sigunbinfo = {si_signo = 255 '\377', si_code = 2
> '\002', si_errno = 110 'n', si_value = {sival_int = 0, sival_ptr = 0x0},
>  si_user = 0x0}, mhead = 0x0, ticks = 0, xcp = {sigdeliver = 0x0,
> saved_regs = 0x0, regs = 0x3800449c},
>name = "watchdog\000\000\000\000\000\000\000\000"}
> (gdb) n
> 1566  switch (procfile->node->node)
> (gdb) n
> 1573  ret = proc_cmdline(procfile, tcb, buffer, buflen,
> filep->f_pos);
> (gdb) s
> proc_cmdline (procfile=0x3800caa0, tcb=0x38003f28, buffer=0x380056b0 "",
> buflen=511, offset=0) at procfs/fs_procfsproc.c:664
> 664   remaining = buflen;
> (gdb) n
> 665   totalsize = 0;
> (gdb)
> 670   name   = tcb->name;
> (gdb)
> 674   linesize   = strlen(name);
> (gdb)
> 675   memcpy(procfile->line, name, linesize);
> (gdb) print linesize
> $4 = 8
> (gdb) n
> 676   copysize   = procfs_memcpy(procfile->line, linesize, buffer,
> remaining,
> (gdb) n
> 679   totalsize += copysize;
> (gdb) print copysize
> $5 = 8
> (gdb) n
> 680   buffer+= copysize;
> (gdb) n
> 681   remaining -= copysize;
> (gdb) n
> 683   if (totalsize 

Re: Problem with ps and kthread: Reproducible crash

2023-05-11 Thread Sebastien Lorquet

The stack is a good candidate. I see that I reported

stack_alloc_ptr = 0x38004328, stack_base_ptr = 0x38004368

but at the time of the crash we have:

R2: 38003a1c

SP: 380038c0

I will try that tomorrow, thanks for the idea.


Sebastien

On 5/11/23 19:39, raiden00pl wrote:

Have you tried to increase the stack size for NSH ? Stack overflow in NSH
can cause some really weird hard faults, not that easy to diagnose.


czw., 11 maj 2023 o 18:34 Sebastien Lorquet 
napisał(a):


Hello,

I have a stm32h7 board, based on the stm32h743zi2 nucleo.

I have activated the watchdog.

The method to do the kthread was copied from this :


https://github.com/apache/nuttx/blob/master/boards/arm/stm32/photon/src/stm32_wdt.c#LL146C7-L146C7

The watchdog works, the system is stable


But now, the nsh 'ps' command ends up with a crash when trying to list
the kthread that resets the watchdog.


The problem has appeared between current trunk and revision
13d823f30710e6fabd3d6429a03bc37e1086c9e7


Here is the GDB session, a bit after cmd_ps is entered:

(pid 4 is the watchdog maintenance kthread created like in the file above)


open (path=0x3800c770 "/proc/4/cmdline", oflags=1) at vfs/fs_open.c:447
447   if (fd < 0)
(gdb) n
453   leave_cancellation_point();
(gdb)
454   return fd;
(gdb)
455 }
(gdb)
nsh_readfile (vtbl=0x38005688, cmd=0x807ab7c "ps", filepath=0x3800c770
"/proc/4/cmdline", buffer=0x380056b0 "  0.0%", buflen=512)
  at nsh_fsutils.c:219
219   if (fd < 0)
(gdb) print fd
$1 = 4
(gdb) n
229   ntotal= 0;  /* No bytes read yet */
(gdb)
230   *buffer   = '\0';   /* NUL terminate the empty buffer */
(gdb)
231   bufptr= buffer; /* Working pointer */
(gdb)
232   remaining = buflen - 1; /* Reserve one byte for a NUL
terminator */
(gdb)
233   ret   = ERROR;  /* Assume failure */
(gdb)
237   nread = read(fd, bufptr, remaining);
(gdb) s
read (fd=4, buf=0x380056b0, nbytes=511) at vfs/fs_read.c:166
166   enter_cancellation_point();
(gdb) n
170   ret = nx_read(fd, buf, nbytes);
(gdb) s
nx_read (fd=4, buf=0x380056b0, nbytes=511) at vfs/fs_read.c:132
132   ret = (ssize_t)fs_getfilep(fd, &filep);
(gdb) n
133   if (ret < 0)
(gdb)
140   return file_read(filep, buf, nbytes);
(gdb) s
file_read (filep=0x38002e40, buf=0x380056b0, nbytes=511) at
vfs/fs_read.c:67
67int ret = -EBADF;
(gdb) n
69DEBUGASSERT(filep);
(gdb)
70inode = filep->f_inode;
(gdb)
74if ((filep->f_oflags & O_RDOK) == 0)
(gdb)
85else if (inode != NULL && inode->u.i_ops && inode->u.i_ops->read)
(gdb)
92ret = (int)inode->u.i_ops->read(filep,
(gdb) s
procfs_read (filep=0x38002e40, buffer=0x380056b0 "", buflen=511) at
procfs/fs_procfs.c:445
445   handler = (FAR struct procfs_file_s *)filep->f_priv;
(gdb) n
446   DEBUGASSERT(handler);
(gdb)
450   return handler->procfsentry->ops->read(filep, buffer, buflen);
(gdb) s
proc_read (filep=0x38002e40, buffer=0x380056b0 "", buflen=511) at
procfs/fs_procfsproc.c:1552
1552  procfile = (FAR struct proc_file_s *)filep->f_priv;
(gdb) n
1553  DEBUGASSERT(procfile != NULL);
(gdb)
1557  tcb = nxsched_get_tcb(procfile->pid);
(gdb)
1558  if (tcb == NULL)
(gdb) print tcb
$2 = (struct tcb_s *) 0x38003f28
(gdb) print *tcb
$3 = {flink = 0x38008640, blink = 0x0, group = 0x38003fe0, pid = 4,
sched_priority = 100 'd', init_priority = 100 'd',
start = 0x802fc91 , entry = {pthread = 0x806ecf1
, main = 0x806ecf1 }, task_state = 6 '\006',
flags = 4138, lockcount = 0, cpcount = 0, errcode = 0, timeslice =
200, waitdog = {next = 0x38005a40, arg = 939540264,
  func = 0x802ecdd , lag = 316}, adj_stack_size = 960,
stack_alloc_ptr = 0x38004328, stack_base_ptr = 0x38004368, waitobj = 0x0,
sigprocmask = {_elem = {0, 0}}, sigwaitmask = {_elem = {0, 0}},
sigpendactionq = {head = 0x0, tail = 0x0}, sigpostedq = {head = 0x0,
  tail = 0x0}, sigunbinfo = {si_signo = 255 '\377', si_code = 2
'\002', si_errno = 110 'n', si_value = {sival_int = 0, sival_ptr = 0x0},
  si_user = 0x0}, mhead = 0x0, ticks = 0, xcp = {sigdeliver = 0x0,
saved_regs = 0x0, regs = 0x3800449c},
name = "watchdog\000\000\000\000\000\000\000\000"}
(gdb) n
1566  switch (procfile->node->node)
(gdb) n
1573  ret = proc_cmdline(procfile, tcb, buffer, buflen,
filep->f_pos);
(gdb) s
proc_cmdline (procfile=0x3800caa0, tcb=0x38003f28, buffer=0x380056b0 "",
buflen=511, offset=0) at procfs/fs_procfsproc.c:664
664   remaining = buflen;
(gdb) n
665   totalsize = 0;
(gdb)
670   name   = tcb->name;
(gdb)
674   linesize   = strlen(name);
(gdb)
675   memcpy(procfile->line, name, linesize);
(gdb) print linesize
$4 = 8
(gdb) n
676   copysize   = procfs_memcpy(procfile->line, linesize, buffer,
remaining,
(gdb) n
679   totalsize += copysize;
(gdb) print copysize
$5 = 8
(gdb) n
680   buffer+= copysize;
(gdb) n
681   remaining -= copysize;
(gdb) n
683   if (totalsize >= bufl