Thanks Salvatore for reporting this upstream.

Using Chuck Lever's advice, I was able to go further in my
investigation. I will let you send whatever you find
interesting upstream.

This NFS server hang does not seem limited to 6.1 kernels, as it also
appears on some 6.8 kernels. But reproducing the bug seems to require:

-> Some NFS client doing some "bad" things... (like backchannel bug)
-> Some NFS server load.
-> Some NFS server performance (many CPUs? powerful network cards?)

So bisecting between kernel versions/patches may not reveal anything,
as the bug may still be present in the 6.12 kernel but simply not
triggered, due to differences in thread design or inlined functions...

When my NFS server hangs, I can see 4 types of messages in the system logs:

1) NFSD hangs on nfsd4_destroy_session
2) NFSD hangs on nfsd4_create_session
3) Workqueue: nfsd4 laundromat_main hangs on nfsd4_shutdown_callback
4) Workqueue: nfsd4 nfsd4_state_shrinker_worker hangs on nfsd4_shutdown_callback

Here is the analysis of each of them.


################################################
1)---- NFSD hangs on nfsd4_destroy_session -----
################################################

Some NFSD threads seem to hang in nfsd4_destroy_session, waiting
indefinitely for the "client lock" spinlock (nn->client_lock) to be released.

--------- CALL TRACE ---------
INFO: task nfsd:1747 blocked for more than 120 seconds.
 Not tainted 6.1.0-28-amd64 #1 Debian 6.1.119-1
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
task:nfsd state:D stack:0 pid:1747 ppid:2 flags:0x00004000
Call Trace:
<TASK>
__schedule+0x34d/0x9e0
schedule+0x5a/0xd0
schedule_timeout+0x118/0x150
wait_for_completion+0x86/0x160
__flush_workqueue+0x152/0x420
nfsd4_destroy_session+0x1b6/0x250 [nfsd]
nfsd4_proc_compound+0x34c/0x660 [nfsd]
nfsd_dispatch+0x1a1/0x2b0 [nfsd]
svc_process_common+0x295/0x610 [sunrpc]
? svc_recv+0x491/0x810 [sunrpc]
? nfsd_svc+0x3c0/0x3c0 [nfsd]
? nfsd_inet6addr_event+0x130/0x130 [nfsd]
svc_process+0xad/0x100 [sunrpc]
nfsd+0x99/0xc0 [nfsd]
kthread+0xda/0x100
? kthread_complete_and_exit+0x20/0x20
ret_from_fork+0x22/0x30
</TASK>
------- END CALL TRACE -------


------ CALL DECODING ---------
./scripts/faddr2line ./debian/build/build_amd64_none_amd64/fs/nfsd/nfs4state.o 
nfsd4_destroy_session+0x1b6
nfsd4_destroy_session+0x1b6/0x250:
spin_lock à debian/build/build_amd64_none_amd64/include/linux/spinlock.h:351
(inlined by) nfsd4_destroy_session à 
debian/build/build_amd64_none_amd64/fs/nfsd/nfs4state.c:3861
------ END CALL DECODING ---------


----- SOURCE REFERENCE -------

#####################
#### nfs4state.c: ###
#####################

__be32
nfsd4_destroy_session(struct svc_rqst *r, struct nfsd4_compound_state *cstate,
 union nfsd4_op_u *u)
{
 struct nfs4_sessionid *sessionid = &u->destroy_session.sessionid;
 struct nfsd4_session *ses;
 __be32 status;
 int ref_held_by_me = 0;
 struct net *net = SVC_NET(r);
 struct nfsd_net *nn = net_generic(net, nfsd_net_id);

 status = nfserr_not_only_op;
 if (nfsd4_compound_in_session(cstate, sessionid)) {
 if (!nfsd4_last_compound_op(r))
 goto out;
 ref_held_by_me++;
 }
 dump_sessionid(__func__, sessionid);
 spin_lock(&nn->client_lock);
 ses = find_in_sessionid_hashtbl(sessionid, net, &status);
 if (!ses)
 goto out_client_lock;
 status = nfserr_wrong_cred;
 if (!nfsd4_mach_creds_match(ses->se_client, r))
 goto out_put_session;
 status = mark_session_dead_locked(ses, 1 + ref_held_by_me);
 if (status)
 goto out_put_session;
 unhash_session(ses);
 spin_unlock(&nn->client_lock);

 nfsd4_probe_callback_sync(ses->se_client);
----
HERE-> spin_lock(&nn->client_lock);
---- status = nfs_ok;
out_put_session:
 nfsd4_put_session_locked(ses);
out_client_lock:
 spin_unlock(&nn->client_lock);
out:
 return status;
}


----- END SOURCE REFERENCE -------



################################################
2)---- NFSD hangs on nfsd4_create_session -----
################################################

Same thing for some other NFSD threads in nfsd4_create_session, waiting
indefinitely for the same "client lock" spinlock to be released.

--------- CALL TRACE ---------
INFO: task nfsd:1740 blocked for more than 120 seconds.
Not tainted 6.1.0-28-amd64 #1 Debian 6.1.119-1
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
task:nfsd state:D stack:0 pid:1740 ppid:2 flags:0x00004000
Call Trace:
<TASK>
__schedule+0x34d/0x9e0
schedule+0x5a/0xd0
schedule_timeout+0x118/0x150
wait_for_completion+0x86/0x160
__flush_workqueue+0x152/0x420
nfsd4_create_session+0x79f/0xba0 [nfsd]
nfsd4_proc_compound+0x34c/0x660 [nfsd]
nfsd_dispatch+0x1a1/0x2b0 [nfsd]
svc_process_common+0x295/0x610 [sunrpc]
? svc_recv+0x491/0x810 [sunrpc]
? nfsd_svc+0x3c0/0x3c0 [nfsd]
? nfsd_inet6addr_event+0x130/0x130 [nfsd]
svc_process+0xad/0x100 [sunrpc]
nfsd+0x99/0xc0 [nfsd]
kthread+0xda/0x100
? kthread_complete_and_exit+0x20/0x20
ret_from_fork+0x22/0x30
</TASK>
------- END CALL TRACE -------


------ CALL DECODING ---------
./scripts/faddr2line ./debian/build/build_amd64_none_amd64/fs/nfsd/nfs4state.o 
nfsd4_create_session+0x79f
nfsd4_create_session+0x79f/0xba0:
nfsd4_put_session à debian/build/build_amd64_none_amd64/fs/nfsd/nfs4state.c:263
(inlined by) nfsd4_create_session à 
debian/build/build_amd64_none_amd64/fs/nfsd/nfs4state.c:3681
---- END CALL DECODING -------


----- SOURCE REFERENCE -------

#####################
#### nfs4state.c: ###
#####################

__be32
nfsd4_create_session(struct svc_rqst *rqstp,
 struct nfsd4_compound_state *cstate, union nfsd4_op_u *u)
{
 struct nfsd4_create_session *cr_ses = &u->create_session;
 struct sockaddr *sa = svc_addr(rqstp);
 struct nfs4_client *conf, *unconf;
 struct nfs4_client *old = NULL;
 struct nfsd4_session *new;
 struct nfsd4_conn *conn;
 struct nfsd4_clid_slot *cs_slot = NULL;
 __be32 status = 0;
 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);

 if (cr_ses->flags & ~SESSION4_FLAG_MASK_A)
 return nfserr_inval;
 status = nfsd4_check_cb_sec(&cr_ses->cb_sec);
 if (status)
 return status;
 status = check_forechannel_attrs(&cr_ses->fore_channel, nn);
 if (status)
 return status;
 status = check_backchannel_attrs(&cr_ses->back_channel);
 if (status)
 goto out_release_drc_mem;
 status = nfserr_jukebox;
 new = alloc_session(&cr_ses->fore_channel, &cr_ses->back_channel);
 if (!new)
 goto out_release_drc_mem;
 conn = alloc_conn_from_crses(rqstp, cr_ses);
 if (!conn)
 goto out_free_session;

 spin_lock(&nn->client_lock);
 unconf = find_unconfirmed_client(&cr_ses->clientid, true, nn);
 conf = find_confirmed_client(&cr_ses->clientid, true, nn);
 WARN_ON_ONCE(conf && unconf);

 if (conf) {
 status = nfserr_wrong_cred;
 if (!nfsd4_mach_creds_match(conf, rqstp))
 goto out_free_conn;
 cs_slot = &conf->cl_cs_slot;
 status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0);
 if (status) {
 if (status == nfserr_replay_cache)
 status = nfsd4_replay_create_session(cr_ses, cs_slot);
 goto out_free_conn;
 }
 } else if (unconf) {
 status = nfserr_clid_inuse;
 if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) ||
 !rpc_cmp_addr(sa, (struct sockaddr *) &unconf->cl_addr)) {
 trace_nfsd_clid_cred_mismatch(unconf, rqstp);
 goto out_free_conn;
 }
 status = nfserr_wrong_cred;
 if (!nfsd4_mach_creds_match(unconf, rqstp))
 goto out_free_conn;
 cs_slot = &unconf->cl_cs_slot;
 status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0);
 if (status) {
 /* an unconfirmed replay returns misordered */
 status = nfserr_seq_misordered;
 goto out_free_conn;
 }
 old = find_confirmed_client_by_name(&unconf->cl_name, nn);
 if (old) {
 status = mark_client_expired_locked(old);
 if (status) {
 old = NULL;
 goto out_free_conn;
 }
 trace_nfsd_clid_replaced(&old->cl_clientid);
 }
 move_to_confirmed(unconf);
 conf = unconf;
 } else {
 status = nfserr_stale_clientid;
 goto out_free_conn;
 }
 status = nfs_ok;
 /* Persistent sessions are not supported */
 cr_ses->flags &= ~SESSION4_PERSIST;
 /* Upshifting from TCP to RDMA is not supported */
 cr_ses->flags &= ~SESSION4_RDMA;

 init_session(rqstp, new, conf, cr_ses);
 nfsd4_get_session_locked(new);

 memcpy(cr_ses->sessionid.data, new->se_sessionid.data,
 NFS4_MAX_SESSIONID_LEN);
 cs_slot->sl_seqid++;
 cr_ses->seqid = cs_slot->sl_seqid;

 /* cache solo and embedded create sessions under the client_lock */
 nfsd4_cache_create_session(cr_ses, cs_slot, status);
 spin_unlock(&nn->client_lock);
 if (conf == unconf)
 fsnotify_dentry(conf->cl_nfsd_info_dentry, FS_MODIFY);
 /* init connection and backchannel */
 nfsd4_init_conn(rqstp, conn, new);
----
HERE-> nfsd4_put_session(new);
----
 if (old)
 expire_client(old);
 return status;
out_free_conn:
 spin_unlock(&nn->client_lock);
 free_conn(conn);
 if (old)
 expire_client(old);
out_free_session:
 __free_session(new);
out_release_drc_mem:
 nfsd4_put_drc_mem(&cr_ses->fore_channel);
 return status;
}


static void nfsd4_put_session(struct nfsd4_session *ses)
{
--- struct nfs4_client *clp = ses->se_client;
263-> struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
---
----
HERE-> spin_lock(&nn->client_lock);
---- nfsd4_put_session_locked(ses);
 spin_unlock(&nn->client_lock);
}


################################################
3)---- Workqueue: nfsd4 laundromat_main
 hangs on nfsd4_shutdown_callback 
################################################

nfsd4_shutdown_callback in nfs4callback.c is only called
from __destroy_client in nfs4state.c.

One comment says that nfsd4_shutdown_callback must be called
under the "state lock". But in __destroy_client the lock is
released before the call. Maybe because a copy of the
instance is made first?

static void
__destroy_client(struct nfs4_client *clp)
{
...
 INIT_LIST_HEAD(&reaplist);
 spin_lock(&state_lock);
...
 spin_unlock(&state_lock);
...
 nfsd4_return_all_client_layouts(clp);
 nfsd4_shutdown_copy(clp);
 nfsd4_shutdown_callback(clp);

After the nfsd4_shutdown_callback call, the workqueue seems
to hang on:
 flush_workqueue(callback_wq)
or
 nfsd41_cb_inflight_wait_complete(clp)
I don't know whether the line reference is reliable.


--------- CALL TRACE ---------
INFO: task kworker/u96:2:39983 blocked for more than 120 seconds.
Not tainted 6.1.0-28-amd64 #1 Debian 6.1.119-1
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
task:kworker/u96:2 state:D stack:0 pid:39983 ppid:2 flags:0x00004000
Workqueue: nfsd4 laundromat_main [nfsd]
Call Trace:
<TASK>
__schedule+0x34d/0x9e0
schedule+0x5a/0xd0
schedule_timeout+0x118/0x150
wait_for_completion+0x86/0x160
__flush_workqueue+0x152/0x420
nfsd4_shutdown_callback+0x49/0x130 [nfsd]
? _raw_spin_unlock+0xa/0x30
? nfsd4_return_all_client_layouts+0xc4/0xf0 [nfsd]
? nfsd4_shutdown_copy+0x28/0x130 [nfsd]
__destroy_client+0x1f3/0x290 [nfsd]
nfs4_process_client_reaplist+0xa2/0x110 [nfsd]
laundromat_main+0x1ce/0x880 [nfsd]
process_one_work+0x1c7/0x380
worker_thread+0x4d/0x380
? rescuer_thread+0x3a0/0x3a0
kthread+0xda/0x100
? kthread_complete_and_exit+0x20/0x20
ret_from_fork+0x22/0x30
</TASK>
------- END CALL TRACE -------


------ CALL DECODING ---------
./scripts/faddr2line 
./debian/build/build_amd64_none_amd64/fs/nfsd/nfs4callback.o 
nfsd4_shutdown_callback+0x49
nfsd4_shutdown_callback+0x49/0x12f:
might_resched à debian/build/build_amd64_none_amd64/include/linux/kernel.h:110
(en ligne par) nfsd41_cb_inflight_wait_complete à 
debian/build/build_amd64_none_amd64/fs/nfsd/nfs4callback.c:931
(en ligne par) nfsd4_shutdown_callback à 
debian/build/build_amd64_none_amd64/fs/nfsd/nfs4callback.c:1319
---- END CALL DECODING -------


----- SOURCE REFERENCE -------

#####################
#### nfs4state.c: ###
#####################

static void
__destroy_client(struct nfs4_client *clp)
{
 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
 int i;
 struct nfs4_openowner *oo;
 struct nfs4_delegation *dp;
 struct list_head reaplist;

 INIT_LIST_HEAD(&reaplist);
 spin_lock(&state_lock);
 while (!list_empty(&clp->cl_delegations)) {
 dp = list_entry(clp->cl_delegations.next, struct nfs4_delegation, dl_perclnt);
 WARN_ON(!unhash_delegation_locked(dp));
 list_add(&dp->dl_recall_lru, &reaplist);
 }
 spin_unlock(&state_lock);
 while (!list_empty(&reaplist)) {
 dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru);
 list_del_init(&dp->dl_recall_lru);
 destroy_unhashed_deleg(dp);
 }
 while (!list_empty(&clp->cl_revoked)) {
 dp = list_entry(clp->cl_revoked.next, struct nfs4_delegation, dl_recall_lru);
 list_del_init(&dp->dl_recall_lru);
 nfs4_put_stid(&dp->dl_stid);
 }
 while (!list_empty(&clp->cl_openowners)) {
 oo = list_entry(clp->cl_openowners.next, struct nfs4_openowner, oo_perclient);
 nfs4_get_stateowner(&oo->oo_owner);
 release_openowner(oo);
 }
 for (i = 0; i < OWNER_HASH_SIZE; i++) {
 struct nfs4_stateowner *so, *tmp;

 list_for_each_entry_safe(so, tmp, &clp->cl_ownerstr_hashtbl[i],
 so_strhash) {
 /* Should be no openowners at this point */
 WARN_ON_ONCE(so->so_is_open_owner);
 remove_blocked_locks(lockowner(so));
 }
 }
 nfsd4_return_all_client_layouts(clp);
---- nfsd4_shutdown_copy(clp);
HERE-> nfsd4_shutdown_callback(clp);
---- if (clp->cl_cb_conn.cb_xprt)
 svc_xprt_put(clp->cl_cb_conn.cb_xprt);
 atomic_add_unless(&nn->nfs4_client_count, -1, 0);
 nfsd4_dec_courtesy_client_count(nn, clp);
 free_client(clp);
 wake_up_all(&expiry_wq);
}

#####################
#### nfs4state.c: ###
#####################

/* must be called under the state lock */
void nfsd4_shutdown_callback(struct nfs4_client *clp)
{
 if (clp->cl_cb_state != NFSD4_CB_UNKNOWN)
 trace_nfsd_cb_shutdown(clp);

 set_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags);
 /*
 * Note this won't actually result in a null callback;
 * instead, nfsd4_run_cb_null() will detect the killed
 * client, destroy the rpc client, and stop:
 */
 nfsd4_run_cb(&clp->cl_cb_null);
????-> flush_workqueue(callback_wq);
1319-> nfsd41_cb_inflight_wait_complete(clp);
}

static void nfsd41_cb_inflight_wait_complete(struct nfs4_client *clp)
{
HERE-> wait_var_event(&clp->cl_cb_inflight,
 !atomic_read(&clp->cl_cb_inflight));
}

--- END SOURCE REFERENCE -----


################################################
4)---- Workqueue: nfsd4 nfsd4_state_shrinker_worker
 hangs on nfsd4_shutdown_callback 
################################################

Same thing with the nfsd4_state_shrinker_worker workqueue.
So maybe the hang is caused by the simultaneous call of
nfsd4_shutdown_callback by the two workqueues?

nfsd4 laundromat_main
and
nfsd4_state_shrinker_worker

Note the presence of "_raw_spin_unlock" in the call trace.


--------- CALL TRACE ---------
task:kworker/u96:3 state:D stack:0 pid:40084 ppid:2 flags:0x00004000
Workqueue: nfsd4 nfsd4_state_shrinker_worker [nfsd]
Call Trace:
<TASK>
__schedule+0x34d/0x9e0
schedule+0x5a/0xd0
schedule_timeout+0x118/0x150
wait_for_completion+0x86/0x160
__flush_workqueue+0x152/0x420
nfsd4_shutdown_callback+0x49/0x130 [nfsd]
? _raw_spin_unlock+0xa/0x30
? nfsd4_return_all_client_layouts+0xc4/0xf0 [nfsd]
? nfsd4_shutdown_copy+0x28/0x130 [nfsd]
__destroy_client+0x1f3/0x290 [nfsd]
nfs4_process_client_reaplist+0xa2/0x110 [nfsd]
nfsd4_state_shrinker_worker+0xf7/0x320 [nfsd]
process_one_work+0x1c7/0x380
worker_thread+0x4d/0x380
? rescuer_thread+0x3a0/0x3a0
kthread+0xda/0x100
? kthread_complete_and_exit+0x20/0x20
ret_from_fork+0x22/0x30
</TASK>
------- END CALL TRACE -------


Regards,

Baptiste.

Reply via email to