On 27/04/24(Sat) 13:44, Visa Hankala wrote: > On Tue, Apr 23, 2024 at 02:48:32PM +0200, Martin Pieuchot wrote: > > [...] > > I agree. Now I'd be very grateful if someone could dig into WITNESS to > > figure out why we see such reports. Are these false positives or are we > > missing data from the code paths that we think are incorrect? > > WITNESS currently cannot show lock cycles longer than two locks. > To fix this, WITNESS needs to do a path search in the lock order graph.
Lovely! > However, there is also something else wrong in WITNESS, possibly > related to situations where the kernel lock comes between two rwlocks > in the lock order. I still need to study this more. I greatly appreciate your dedication in this area. > Below is a patch that adds the cycle search and printing. The patch > also tweaks a few prints to show more context. This is ok mpi@ > With the patch, the nfsnode-vmmaplk reversal looks like this: So the issue here is due to NFS entering the network stack after the VFS. Alexander, Vitaly are we far from a NET_LOCK()-free sosend()? Is something we should consider? On the other side, would that make sense to have a NET_LOCK()-free sysctl path? > witness: lock order reversal: > 1st 0xfffffd8126deacf8 vmmaplk (&map->lock) > 2nd 0xffff800039831948 nfsnode (&np->n_lock) > lock order [1] vmmaplk (&map->lock) -> [2] nfsnode (&np->n_lock) > #0 rw_enter+0x6d > #1 rrw_enter+0x5e > #2 VOP_LOCK+0x5f > #3 vn_lock+0xbc > #4 vn_rdwr+0x83 > #5 vndstrategy+0x2ca > #6 physio+0x204 > #7 spec_write+0x9e > #8 VOP_WRITE+0x6e > #9 vn_write+0x100 > #10 dofilewritev+0x143 > #11 sys_pwrite+0x60 > #12 syscall+0x588 > #13 Xsyscall+0x128 > lock order [2] nfsnode (&np->n_lock) -> [3] netlock (netlock) > #0 rw_enter_read+0x50 > #1 solock_shared+0x3a > #2 sosend+0x10c > #3 nfs_send+0x8d > #4 nfs_request+0x258 > #5 nfs_getattr+0xcb > #6 VOP_GETATTR+0x55 > #7 mountnfs+0x37c > #8 nfs_mount+0x125 > #9 sys_mount+0x343 > #10 syscall+0x561 > #11 Xsyscall+0x128 > lock order [3] netlock (netlock) -> [1] vmmaplk (&map->lock) > #0 rw_enter_read+0x50 > #1 uvmfault_lookup+0x8a > #2 uvm_fault_check+0x36 > #3 uvm_fault+0xfb > #4 kpageflttrap+0x158 > #5 kerntrap+0x94 > #6 alltraps_kern_meltdown+0x7b > #7 _copyin+0x62 > #8 sysctl_bounded_arr+0x83 > #9 tcp_sysctl+0x546 > #10 sys_sysctl+0x17b > #11 syscall+0x561 > #12 Xsyscall+0x128 > > > Index: kern/subr_witness.c > =================================================================== > RCS file: 
src/sys/kern/subr_witness.c,v > retrieving revision 1.50 > diff -u -p -r1.50 subr_witness.c > --- kern/subr_witness.c 30 May 2023 08:30:01 -0000 1.50 > +++ kern/subr_witness.c 27 Apr 2024 13:08:43 -0000 > @@ -369,6 +369,13 @@ static struct witness_lock_order_data *w > struct witness *child); > static void witness_list_lock(struct lock_instance *instance, > int (*prnt)(const char *fmt, ...)); > +static void witness_print_cycle(int(*prnt)(const char *fmt, ...), > + struct witness *parent, struct witness *child); > +static void witness_print_cycle_edge(int(*prnt)(const char *fmt, ...), > + struct witness *parent, struct witness *child, > + int step, int last); > +static int witness_search(struct witness *w, struct witness *target, > + struct witness **path, int depth, int *remaining); > static void witness_setflag(struct lock_object *lock, int flag, int set); > > /* > @@ -652,8 +659,9 @@ witness_ddb_display_descendants(int(*prn > > for (i = 0; i < indent; i++) > prnt(" "); > - prnt("%s (type: %s, depth: %d)", > - w->w_type->lt_name, w->w_class->lc_name, w->w_ddb_level); > + prnt("%s (%s) (type: %s, depth: %d)", > + w->w_subtype, w->w_type->lt_name, > + w->w_class->lc_name, w->w_ddb_level); > if (w->w_displayed) { > prnt(" -- (already displayed)\n"); > return; > @@ -719,7 +727,8 @@ witness_ddb_display(int(*prnt)(const cha > SLIST_FOREACH(w, &w_all, w_list) { > if (w->w_acquired) > continue; > - prnt("%s (type: %s, depth: %d)\n", w->w_type->lt_name, > + prnt("%s (%s) (type: %s, depth: %d)\n", > + w->w_subtype, w->w_type->lt_name, > w->w_class->lc_name, w->w_ddb_level); > } > } > @@ -1066,47 +1075,8 @@ witness_checkorder(struct lock_object *l > printf(" 3rd %p %s (%s)\n", lock, > lock->lo_name, w->w_type->lt_name); > } > - if (witness_watch > 1) { > - struct witness_lock_order_data *wlod1, *wlod2; > - > - mtx_enter(&w_mtx); > - wlod1 = witness_lock_order_get(w, w1); > - wlod2 = witness_lock_order_get(w1, w); > - mtx_leave(&w_mtx); > - > - /* > - * It is safe to access 
saved stack traces, > - * w_type, and w_class without the lock. > - * Once written, they never change. > - */ > - > - if (wlod1 != NULL) { > - printf("lock order \"%s\"(%s) -> " > - "\"%s\"(%s) first seen at:\n", > - w->w_type->lt_name, > - w->w_class->lc_name, > - w1->w_type->lt_name, > - w1->w_class->lc_name); > - stacktrace_print( > - &wlod1->wlod_stack, printf); > - } else { > - printf("lock order data " > - "w2 -> w1 missing\n"); > - } > - if (wlod2 != NULL) { > - printf("lock order \"%s\"(%s) -> " > - "\"%s\"(%s) first seen at:\n", > - w1->w_type->lt_name, > - w1->w_class->lc_name, > - w->w_type->lt_name, > - w->w_class->lc_name); > - stacktrace_print( > - &wlod2->wlod_stack, printf); > - } else { > - printf("lock order data " > - "w1 -> w2 missing\n"); > - } > - } > + if (witness_watch > 1) > + witness_print_cycle(printf, w1, w); > witness_debugger(0); > goto out_splx; > } > @@ -1875,6 +1845,92 @@ witness_list_lock(struct lock_instance * > stacktrace_print(&instance->li_stack->ls_stack, prnt); > } > > +static int > +witness_search(struct witness *w, struct witness *target, > + struct witness **path, int depth, int *remaining) > +{ > + int i, any_remaining; > + > + if (depth == 0) { > + *remaining = 1; > + return (w == target); > + } > + > + any_remaining = 0; > + for (i = 1; i <= w_max_used_index; i++) { > + if (w_rmatrix[w->w_index][i] & WITNESS_PARENT) { > + if (witness_search(&w_data[i], target, path, depth - 1, > + remaining)) { > + path[depth - 1] = &w_data[i]; > + *remaining = 1; > + return 1; > + } > + if (remaining) > + any_remaining = 1; > + } > + } > + *remaining = any_remaining; > + return 0; > +} > + > +static void > +witness_print_cycle_edge(int(*prnt)(const char *fmt, ...), > + struct witness *parent, struct witness *child, int step, int last) > +{ > + struct witness_lock_order_data *wlod; > + int next; > + > + if (last) > + next = 1; > + else > + next = step + 1; > + prnt("lock order [%d] %s (%s) -> [%d] %s (%s)\n", > + step, 
parent->w_subtype, parent->w_type->lt_name, > + next, child->w_subtype, child->w_type->lt_name); > + if (witness_watch > 1) { > + mtx_enter(&w_mtx); > + wlod = witness_lock_order_get(parent, child); > + mtx_leave(&w_mtx); > + > + if (wlod != NULL) > + stacktrace_print(&wlod->wlod_stack, printf); > + else > + prnt("lock order data %p -> %p is missing\n", > + parent->w_type->lt_name, child->w_type->lt_name); > + } > +} > + > +static void > +witness_print_cycle(int(*prnt)(const char *fmt, ...), > + struct witness *parent, struct witness *child) > +{ > + struct witness *path[4]; > + struct witness *w; > + int depth, remaining; > + int step = 0; > + > + /* > + * Use depth-limited search to find the shortest path > + * from child to parent. > + */ > + for (depth = 1; depth < nitems(path); depth++) { > + if (witness_search(child, parent, path, depth, &remaining)) > + goto found; > + if (!remaining) > + break; > + } > + prnt("witness: incomplete path, depth %d\n", depth); > + return; > + > +found: > + witness_print_cycle_edge(prnt, parent, child, ++step, 0); > + for (w = child; depth > 0; depth--) { > + witness_print_cycle_edge(prnt, w, path[depth - 1], ++step, > + depth == 1); > + w = path[depth - 1]; > + } > +} > + > #ifdef DDB > static int > witness_thread_has_locks(struct proc *p) > @@ -2123,9 +2179,6 @@ db_witness_list_all(db_expr_t addr, int > void > witness_print_badstacks(void) > { > - static struct witness tmp_w1, tmp_w2; > - static struct witness_lock_order_data tmp_data1, tmp_data2; > - struct witness_lock_order_data *data1, *data2; > struct witness *w1, *w2; > int error, generation, i, j; > > @@ -2139,11 +2192,6 @@ witness_print_badstacks(void) > } > error = 0; > > - memset(&tmp_w1, 0, sizeof(tmp_w1)); > - memset(&tmp_w2, 0, sizeof(tmp_w2)); > - memset(&tmp_data1, 0, sizeof(tmp_data1)); > - memset(&tmp_data2, 0, sizeof(tmp_data2)); > - > restart: > mtx_enter(&w_mtx); > generation = w_generation; > @@ -2165,12 +2213,9 @@ restart: > mtx_leave(&w_mtx); > continue; 
> } > - > - /* Copy w1 locally so we can release the spin lock. */ > - tmp_w1 = *w1; > mtx_leave(&w_mtx); > > - if (tmp_w1.w_reversed == 0) > + if (w1->w_reversed == 0) > continue; > for (j = 1; j < w_max_used_index; j++) { > if ((w_rmatrix[i][j] & WITNESS_REVERSAL) == 0 || i > j) > @@ -2187,47 +2232,13 @@ restart: > } > > w2 = &w_data[j]; > - data1 = witness_lock_order_get(w1, w2); > - data2 = witness_lock_order_get(w2, w1); > - > - /* > - * Copy information locally so we can release the > - * spin lock. > - */ > - tmp_w2 = *w2; > - > - if (data1) > - tmp_data1.wlod_stack = data1->wlod_stack; > - if (data2 && data2 != data1) > - tmp_data2.wlod_stack = data2->wlod_stack; > mtx_leave(&w_mtx); > > db_printf("\nLock order reversal between \"%s\"(%s) " > "and \"%s\"(%s)!\n", > - tmp_w1.w_type->lt_name, tmp_w1.w_class->lc_name, > - tmp_w2.w_type->lt_name, tmp_w2.w_class->lc_name); > - if (data1) { > - db_printf("Lock order \"%s\"(%s) -> \"%s\"(%s) " > - "first seen at:\n", > - tmp_w1.w_type->lt_name, > - tmp_w1.w_class->lc_name, > - tmp_w2.w_type->lt_name, > - tmp_w2.w_class->lc_name); > - stacktrace_print(&tmp_data1.wlod_stack, > - db_printf); > - db_printf("\n"); > - } > - if (data2 && data2 != data1) { > - db_printf("Lock order \"%s\"(%s) -> \"%s\"(%s) " > - "first seen at:\n", > - tmp_w2.w_type->lt_name, > - tmp_w2.w_class->lc_name, > - tmp_w1.w_type->lt_name, > - tmp_w1.w_class->lc_name); > - stacktrace_print(&tmp_data2.wlod_stack, > - db_printf); > - db_printf("\n"); > - } > + w1->w_type->lt_name, w1->w_class->lc_name, > + w2->w_type->lt_name, w2->w_class->lc_name); > + witness_print_cycle(db_printf, w1, w2); > } > } > mtx_enter(&w_mtx);