Dear tech,

I have a machine with some kind of hardware defect.
Smartctl shows that one SSD regularly has an unexpected power loss
(Attribute 174):

174 Unknown_Attribute       0x0032   100   100   000    Old_age
Always       -       284.

On the console I see the following output:

root on sd0a (b75e60dd651bd99e.a) swap on sd0b dump on sd0b
ahci0: device didn't come ready after reset, TFD: 0x441<ERR>
ahci0: unrecoverable errors (IS: 10<UFS>), disabling port.
ahci0: device didn't come ready after reset, TFD: 0x1d0<BSY>
ahci0: stopping the port, softreset slot 31 was still active.
ahci0: device didn't come ready after reset, TFD: 0x441<ERR>
ahci0: device didn't come ready after reset, TFD: 0x441<ERR>
ahci0: stopping the port, softreset slot 31 was still active.
ahci0: device didn't come ready after reset, TFD: 0x441<ERR>
ahci0: stopping the port, softreset slot 31 was still active.
ahci0: stopping the port, softreset slot 31 was still active.
ahci0: stopping the port, softreset slot 31 was still active.
ahci0: stopping the port, softreset slot 31 was still active.
ahci0: stopping the port, softreset slot 31 was still active.
ahci0: stopping the port, softreset slot 31 was still active.
ahci0: device didn't come ready after reset, TFD: 0x441<ERR>
ahci0: device didn't come ready after reset, TFD: 0x441<ERR>
mode = 0100644, inum = 7731315, fs = /var
panic: ffs_valloc: dup alloc
Stopped at      db_enter+0x10:  popq    %rbp
    TID    PID    UID     PRFLAGS     PFLAGS  CPU  COMMAND
 175271  46390   1001         0x2  0x4000000    0  rustc
 345274  46390   1001         0x2  0x4000000    1  rustc
 224769  52321   1001         0x2  0x4000000    5  rustc
*414814  59107   1001      0x1002  0x4080000    2K rustc
  87951  59107   1001      0x1002  0x4080000    4  rustc
db_enter() at db_enter+0x10
panic(ffffffff81f194f6) at panic+0xbf
ffs_inode_alloc(fffffd87b0b8c1e0,81a4,fffffd8fc6cfc5d0,ffff800020a7a948) at ffs
_inode_alloc+0x42e
ufs_makeinode(81a4,fffffd87ab540438,ffff800020a7ac40,ffff800020a7ac70) at ufs_m
akeinode+0x79
ufs_create(ffff800020a7a9f8) at ufs_create+0x3c
VOP_CREATE(fffffd87ab540438,ffff800020a7ac40,ffff800020a7ac70,ffff800020a7aa50)
 at VOP_CREATE+0x3f
vn_open(ffff800020a7ac10,10602,1a4) at vn_open+0x162
doopenat(ffff800020879cf0,ffffff9c,246e5646200,10601,1b6,ffff800020a7adf0) at d
oopenat+0x1cd
syscall(ffff800020a7ae60) at syscall+0x35f
Xsyscall() at Xsyscall+0x128
end of kernel

ffs_inode_alloc checks the mode of the newly allocated inode and
if it is not 0 it panics.
I suspect that, because of the SSD's soft resets, I/O operations take
longer and cause trouble due to missing locking.

For example in ffs_nodealloccg it is possible for an FFS2 file
system to sleep due to a getblk. During the sleep, another call to
ffs_nodealloccg might happen with the same ipref.
The function first checks if there is space at the preferred location,
then sleeps, and then reserves it. This is a TOCTOU (time-of-check to
time-of-use) bug.

FreeBSD fixed this in 2013:
https://github.com/freebsd/freebsd-src/commit/94f4ac214c403b99cbc44dd6e9cdf4db2cc9b297

Below is a diff that addresses this with an exclusive lock
using RW_SLEEPFAIL.

Index: ufs/ffs/ffs_alloc.c
===================================================================
RCS file: /cvs/src/sys/ufs/ffs/ffs_alloc.c,v
retrieving revision 1.114
diff -u -p -r1.114 ffs_alloc.c
--- ufs/ffs/ffs_alloc.c 11 Mar 2021 13:31:35 -0000      1.114
+++ ufs/ffs/ffs_alloc.c 10 Mar 2023 15:44:52 -0000
@@ -46,6 +46,7 @@
 #include <sys/buf.h>
 #include <sys/vnode.h>
 #include <sys/mount.h>
+#include <sys/rwlock.h>
 #include <sys/syslog.h>
 #include <sys/stdint.h>
 #include <sys/time.h>
@@ -1089,6 +1090,7 @@ gotit:
 }
 
 /* inode allocation routine */
+struct rwlock ffs_node_lock = RWLOCK_INITIALIZER("indallck");
 daddr_t
 ffs_nodealloccg(struct inode *ip, u_int cg, daddr_t ipref, int mode)
 {
@@ -1115,6 +1117,9 @@ ffs_nodealloccg(struct inode *ip, u_int 
                return (0);
 
        cgp = (struct cg *)bp->b_data;
+#ifdef FFS2
+ check_nifree:
+#endif
        if (cgp->cg_cs.cs_nifree == 0) {
                brelse(bp);
                return (0);
@@ -1201,9 +1206,12 @@ gotit:
            /* Has any inode not been used at least once? */
            cgp->cg_initediblk < cgp->cg_ffs2_niblk) {
 
+               if (rw_enter(&ffs_node_lock, RW_WRITE | RW_SLEEPFAIL))
+                       goto check_nifree;
                 ibp = getblk(ip->i_devvp, fsbtodb(fs,
                     ino_to_fsba(fs, cg * fs->fs_ipg + cgp->cg_initediblk)),
                     (int)fs->fs_bsize, 0, INFSLP);
+               rw_exit(&ffs_node_lock);
 
                 memset(ibp->b_data, 0, fs->fs_bsize);
                 dp2 = (struct ufs2_dinode *)(ibp->b_data);

Reply via email to