Thanks Salvatore to report upstream. Using Chuck Lever advices, I was able to go further in my investigations. I let you send the things you find interesting upstream.
This NFS server hang seems not limited to 6.1 kernels as it appear in some 6.8. But to reproduce the bugs it seems require to have : -> Some NFS client doing some "bad" things... (like backchannel bug) -> Some NFS server load. -> Some NFS server performance (many CPU ? powerfull network cards ?) So bisect between kernel version/patches may no reveal something as the bug may be still present in the 6.12 kernel but not produced by some thread design or inlined functions... When my NFS server hangs, I can see 4 types of messages in system logs : 1) NFSD hangs on nfsd4_destroy_session 2) NFSD hangs on nfsd4_create_session 3) Workqueue: nfsd4 laundromat_main hangs on nfsd4_shutdown_callback 4) Workqueue: nfsd4 nfsd4_state_shrinker_worker hangs on nfsd4_shutdown_callback Here the analysis of each of them. ################################################ 1)---- NFSD hangs on nfsd4_destroy_session ----- ################################################ Some NFSD seems to hangs in nfsd4_destroy_session waiting indefinitely for a "client lock" Mutex to be released. --------- CALL TRACE --------- INFO: task nfsd:1747 blocked for more than 120 seconds. Not tainted 6.1.0-28-amd64 #1 Debian 6.1.119-1 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. task:nfsd state:D stack:0 pid:1747 ppid:2 flags:0x00004000 Call Trace: <TASK> __schedule+0x34d/0x9e0 schedule+0x5a/0xd0 schedule_timeout+0x118/0x150 wait_for_completion+0x86/0x160 __flush_workqueue+0x152/0x420 nfsd4_destroy_session+0x1b6/0x250 [nfsd] nfsd4_proc_compound+0x34c/0x660 [nfsd] nfsd_dispatch+0x1a1/0x2b0 [nfsd] svc_process_common+0x295/0x610 [sunrpc] ? svc_recv+0x491/0x810 [sunrpc] ? nfsd_svc+0x3c0/0x3c0 [nfsd] ? nfsd_inet6addr_event+0x130/0x130 [nfsd] svc_process+0xad/0x100 [sunrpc] nfsd+0x99/0xc0 [nfsd] kthread+0xda/0x100 ? kthread_complete_and_exit+0x20/0x20 ret_from_fork+0x22/0x30 </TASK> ------- END CALL TRACE ------- ------ CALL DECODING --------- ./scripts/faddr2line ./debian/build/build_amd64_none_amd64/fs/nfsd/nfs4state.o nfsd4_destroy_session+0x1b6 nfsd4_destroy_session+0x1b6/0x250: spin_lock à debian/build/build_amd64_none_amd64/include/linux/spinlock.h:351 (inlined by) nfsd4_destroy_session à debian/build/build_amd64_none_amd64/fs/nfsd/nfs4state.c:3861 ------ END CALL DECODING --------- ----- SOURCE REFERENCE ------- ##################### #### nfs4state.c: ### ##################### __be32 nfsd4_destroy_session(struct svc_rqst *r, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfs4_sessionid *sessionid = &u->destroy_session.sessionid; struct nfsd4_session *ses; __be32 status; int ref_held_by_me = 0; struct net *net = SVC_NET(r); struct nfsd_net *nn = net_generic(net, nfsd_net_id); status = nfserr_not_only_op; if (nfsd4_compound_in_session(cstate, sessionid)) { if (!nfsd4_last_compound_op(r)) goto out; ref_held_by_me++; } dump_sessionid(__func__, sessionid); spin_lock(&nn->client_lock); ses = find_in_sessionid_hashtbl(sessionid, net, &status); if (!ses) goto out_client_lock; status = nfserr_wrong_cred; if (!nfsd4_mach_creds_match(ses->se_client, r)) goto out_put_session; status = mark_session_dead_locked(ses, 1 + ref_held_by_me); if (status) goto out_put_session; unhash_session(ses); spin_unlock(&nn->client_lock); nfsd4_probe_callback_sync(ses->se_client); ---- HERE-> spin_lock(&nn->client_lock); ---- status = nfs_ok; out_put_session: nfsd4_put_session_locked(ses); out_client_lock: spin_unlock(&nn->client_lock); out: return status; } ----- END SOURCE REFERENCE ------- ################################################ 2)---- NFSD hangs on nfsd4_create_session ----- ################################################ Same thing for some other NFSD in nfsd4_create_session waiting indefinitely for the same "client lock" Mutex to be released. --------- CALL TRACE --------- INFO: task nfsd:1740 blocked for more than 120 seconds. Not tainted 6.1.0-28-amd64 #1 Debian 6.1.119-1 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. task:nfsd state:D stack:0 pid:1740 ppid:2 flags:0x00004000 Call Trace: <TASK> __schedule+0x34d/0x9e0 schedule+0x5a/0xd0 schedule_timeout+0x118/0x150 wait_for_completion+0x86/0x160 __flush_workqueue+0x152/0x420 nfsd4_create_session+0x79f/0xba0 [nfsd] nfsd4_proc_compound+0x34c/0x660 [nfsd] nfsd_dispatch+0x1a1/0x2b0 [nfsd] svc_process_common+0x295/0x610 [sunrpc] ? svc_recv+0x491/0x810 [sunrpc] ? nfsd_svc+0x3c0/0x3c0 [nfsd] ? nfsd_inet6addr_event+0x130/0x130 [nfsd] svc_process+0xad/0x100 [sunrpc] nfsd+0x99/0xc0 [nfsd] kthread+0xda/0x100 ? kthread_complete_and_exit+0x20/0x20 ret_from_fork+0x22/0x30 </TASK> ------- END CALL TRACE ------- ------ CALL DECODING --------- ./scripts/faddr2line ./debian/build/build_amd64_none_amd64/fs/nfsd/nfs4state.o nfsd4_create_session+0x79f nfsd4_create_session+0x79f/0xba0: nfsd4_put_session à debian/build/build_amd64_none_amd64/fs/nfsd/nfs4state.c:263 (inlined by) nfsd4_create_session à debian/build/build_amd64_none_amd64/fs/nfsd/nfs4state.c:3681 ---- END CALL DECODING ------- ----- SOURCE REFERENCE ------- ##################### #### nfs4state.c: ### ##################### __be32 nfsd4_create_session(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_create_session *cr_ses = &u->create_session; struct sockaddr *sa = svc_addr(rqstp); struct nfs4_client *conf, *unconf; struct nfs4_client *old = NULL; struct nfsd4_session *new; struct nfsd4_conn *conn; struct nfsd4_clid_slot *cs_slot = NULL; __be32 status = 0; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); if (cr_ses->flags & ~SESSION4_FLAG_MASK_A) return nfserr_inval; status = nfsd4_check_cb_sec(&cr_ses->cb_sec); if (status) return status; status = check_forechannel_attrs(&cr_ses->fore_channel, nn); if (status) return status; status = check_backchannel_attrs(&cr_ses->back_channel); if (status) goto out_release_drc_mem; status = nfserr_jukebox; new = alloc_session(&cr_ses->fore_channel, &cr_ses->back_channel); if (!new) goto out_release_drc_mem; conn = alloc_conn_from_crses(rqstp, cr_ses); if (!conn) goto out_free_session; spin_lock(&nn->client_lock); unconf = find_unconfirmed_client(&cr_ses->clientid, true, nn); conf = find_confirmed_client(&cr_ses->clientid, true, nn); WARN_ON_ONCE(conf && unconf); if (conf) { status = nfserr_wrong_cred; if (!nfsd4_mach_creds_match(conf, rqstp)) goto out_free_conn; cs_slot = &conf->cl_cs_slot; status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0); if (status) { if (status == nfserr_replay_cache) status = nfsd4_replay_create_session(cr_ses, cs_slot); goto out_free_conn; } } else if (unconf) { status = nfserr_clid_inuse; if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) || !rpc_cmp_addr(sa, (struct sockaddr *) &unconf->cl_addr)) { trace_nfsd_clid_cred_mismatch(unconf, rqstp); goto out_free_conn; } status = nfserr_wrong_cred; if (!nfsd4_mach_creds_match(unconf, rqstp)) goto out_free_conn; cs_slot = &unconf->cl_cs_slot; status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0); if (status) { /* an unconfirmed replay returns misordered */ status = nfserr_seq_misordered; goto out_free_conn; } old = find_confirmed_client_by_name(&unconf->cl_name, nn); if (old) { status = mark_client_expired_locked(old); if (status) { old = NULL; goto out_free_conn; } trace_nfsd_clid_replaced(&old->cl_clientid); } move_to_confirmed(unconf); conf = unconf; } else { status = nfserr_stale_clientid; goto out_free_conn; } status = nfs_ok; /* Persistent sessions are not supported */ cr_ses->flags &= ~SESSION4_PERSIST; /* Upshifting from TCP to RDMA is not supported */ cr_ses->flags &= ~SESSION4_RDMA; init_session(rqstp, new, conf, cr_ses); nfsd4_get_session_locked(new); memcpy(cr_ses->sessionid.data, new->se_sessionid.data, NFS4_MAX_SESSIONID_LEN); cs_slot->sl_seqid++; cr_ses->seqid = cs_slot->sl_seqid; /* cache solo and embedded create sessions under the client_lock */ nfsd4_cache_create_session(cr_ses, cs_slot, status); spin_unlock(&nn->client_lock); if (conf == unconf) fsnotify_dentry(conf->cl_nfsd_info_dentry, FS_MODIFY); /* init connection and backchannel */ nfsd4_init_conn(rqstp, conn, new); ---- HERE-> nfsd4_put_session(new); ---- if (old) expire_client(old); return status; out_free_conn: spin_unlock(&nn->client_lock); free_conn(conn); if (old) expire_client(old); out_free_session: __free_session(new); out_release_drc_mem: nfsd4_put_drc_mem(&cr_ses->fore_channel); return status; } static void nfsd4_put_session(struct nfsd4_session *ses) { --- struct nfs4_client *clp = ses->se_client; 263-> struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); --- ---- HERE-> spin_lock(&nn->client_lock); ---- nfsd4_put_session_locked(ses); spin_unlock(&nn->client_lock); } ################################################ 3)---- Workqueue: nfsd4 laundromat_main hangs on nfsd4_shutdown_callback ################################################ nfsd4_shutdown_callback in nfs4callback.c is only called from __destroy_client in nfs4state.c. One comment say that nfsd4_shutdown_callback must be called under "state lock". But in __destroy_client the lock is released before. Maybe because there is first a copy of the instance ? static void __destroy_client(struct nfs4_client *clp) { ... INIT_LIST_HEAD(&reaplist); spin_lock(&state_lock); ... spin_unlock(&state_lock); ... nfsd4_return_all_client_layouts(clp); nfsd4_shutdown_copy(clp); nfsd4_shutdown_callback(clp); After the nfsd4_shutdown_callback call, the workqueue seems to hangs on : flush_workqueue(callback_wq) or nfsd41_cb_inflight_wait_complete(clp) I don't know if the line reference is reliable. --------- CALL TRACE --------- INFO: task kworker/u96:2:39983 blocked for more than 120 seconds. Not tainted 6.1.0-28-amd64 #1 Debian 6.1.119-1 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. task:kworker/u96:2 state:D stack:0 pid:39983 ppid:2 flags:0x00004000 Workqueue: nfsd4 laundromat_main [nfsd] Call Trace: <TASK> __schedule+0x34d/0x9e0 schedule+0x5a/0xd0 schedule_timeout+0x118/0x150 wait_for_completion+0x86/0x160 __flush_workqueue+0x152/0x420 nfsd4_shutdown_callback+0x49/0x130 [nfsd] ? _raw_spin_unlock+0xa/0x30 ? nfsd4_return_all_client_layouts+0xc4/0xf0 [nfsd] ? nfsd4_shutdown_copy+0x28/0x130 [nfsd] __destroy_client+0x1f3/0x290 [nfsd] nfs4_process_client_reaplist+0xa2/0x110 [nfsd] laundromat_main+0x1ce/0x880 [nfsd] process_one_work+0x1c7/0x380 worker_thread+0x4d/0x380 ? rescuer_thread+0x3a0/0x3a0 kthread+0xda/0x100 ? kthread_complete_and_exit+0x20/0x20 ret_from_fork+0x22/0x30 </TASK> ------- END CALL TRACE ------- ------ CALL DECODING --------- ./scripts/faddr2line ./debian/build/build_amd64_none_amd64/fs/nfsd/nfs4callback.o nfsd4_shutdown_callback+0x49 nfsd4_shutdown_callback+0x49/0x12f: might_resched à debian/build/build_amd64_none_amd64/include/linux/kernel.h:110 (en ligne par) nfsd41_cb_inflight_wait_complete à debian/build/build_amd64_none_amd64/fs/nfsd/nfs4callback.c:931 (en ligne par) nfsd4_shutdown_callback à debian/build/build_amd64_none_amd64/fs/nfsd/nfs4callback.c:1319 ---- END CALL DECODING ------- ----- SOURCE REFERENCE ------- ##################### #### nfs4state.c: ### ##################### static void __destroy_client(struct nfs4_client *clp) { struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); int i; struct nfs4_openowner *oo; struct nfs4_delegation *dp; struct list_head reaplist; INIT_LIST_HEAD(&reaplist); spin_lock(&state_lock); while (!list_empty(&clp->cl_delegations)) { dp = list_entry(clp->cl_delegations.next, struct nfs4_delegation, dl_perclnt); WARN_ON(!unhash_delegation_locked(dp)); list_add(&dp->dl_recall_lru, &reaplist); } spin_unlock(&state_lock); while (!list_empty(&reaplist)) { dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru); list_del_init(&dp->dl_recall_lru); destroy_unhashed_deleg(dp); } while (!list_empty(&clp->cl_revoked)) { dp = list_entry(clp->cl_revoked.next, struct nfs4_delegation, dl_recall_lru); list_del_init(&dp->dl_recall_lru); nfs4_put_stid(&dp->dl_stid); } while (!list_empty(&clp->cl_openowners)) { oo = list_entry(clp->cl_openowners.next, struct nfs4_openowner, oo_perclient); nfs4_get_stateowner(&oo->oo_owner); release_openowner(oo); } for (i = 0; i < OWNER_HASH_SIZE; i++) { struct nfs4_stateowner *so, *tmp; list_for_each_entry_safe(so, tmp, &clp->cl_ownerstr_hashtbl[i], so_strhash) { /* Should be no openowners at this point */ WARN_ON_ONCE(so->so_is_open_owner); remove_blocked_locks(lockowner(so)); } } nfsd4_return_all_client_layouts(clp); ---- nfsd4_shutdown_copy(clp); HERE-> nfsd4_shutdown_callback(clp); ---- if (clp->cl_cb_conn.cb_xprt) svc_xprt_put(clp->cl_cb_conn.cb_xprt); atomic_add_unless(&nn->nfs4_client_count, -1, 0); nfsd4_dec_courtesy_client_count(nn, clp); free_client(clp); wake_up_all(&expiry_wq); } ##################### #### nfs4state.c: ### ##################### /* must be called under the state lock */ void nfsd4_shutdown_callback(struct nfs4_client *clp) { if (clp->cl_cb_state != NFSD4_CB_UNKNOWN) trace_nfsd_cb_shutdown(clp); set_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags); /* * Note this won't actually result in a null callback; * instead, nfsd4_run_cb_null() will detect the killed * client, destroy the rpc client, and stop: */ nfsd4_run_cb(&clp->cl_cb_null); ????-> flush_workqueue(callback_wq); 1319-> nfsd41_cb_inflight_wait_complete(clp); } static void nfsd41_cb_inflight_wait_complete(struct nfs4_client *clp) { HERE-> wait_var_event(&clp->cl_cb_inflight, !atomic_read(&clp->cl_cb_inflight)); } --- END SOURCE REFERENCE ----- ################################################ 4)---- Workqueue: nfsd4 nfsd4_state_shrinker_worker hangs on nfsd4_shutdown_callback ################################################ Same thing with the nfsd4_state_shrinker_worker workqueue. So may be the hangs caused by the simultaneous call of nfsd4_shutdown_callback by the two workqueues ? nfsd4 laundromat_main and nfsd4_state_shrinker_worker Note the presence of "_raw_spin_unlock" in the call trace. --------- CALL TRACE --------- task:kworker/u96:3 state:D stack:0 pid:40084 ppid:2 flags:0x00004000 Workqueue: nfsd4 nfsd4_state_shrinker_worker [nfsd] Call Trace: <TASK> __schedule+0x34d/0x9e0 schedule+0x5a/0xd0 schedule_timeout+0x118/0x150 wait_for_completion+0x86/0x160 __flush_workqueue+0x152/0x420 nfsd4_shutdown_callback+0x49/0x130 [nfsd] ? _raw_spin_unlock+0xa/0x30 ? nfsd4_return_all_client_layouts+0xc4/0xf0 [nfsd] ? nfsd4_shutdown_copy+0x28/0x130 [nfsd] __destroy_client+0x1f3/0x290 [nfsd] nfs4_process_client_reaplist+0xa2/0x110 [nfsd] nfsd4_state_shrinker_worker+0xf7/0x320 [nfsd] process_one_work+0x1c7/0x380 worker_thread+0x4d/0x380 ? rescuer_thread+0x3a0/0x3a0 kthread+0xda/0x100 ? kthread_complete_and_exit+0x20/0x20 ret_from_fork+0x22/0x30 </TASK> ------- END CALL TRACE ------- Regards, Baptiste.