Oleg Nesterov <o...@redhat.com> writes: > On 08/30, Eric W. Biederman wrote: >> >> --- a/kernel/exit.c >> +++ b/kernel/exit.c >> @@ -182,6 +182,24 @@ static void delayed_put_task_struct(struct rcu_head >> *rhp) >> put_task_struct(tsk); >> } >> >> +void put_dead_task_struct(struct task_struct *task) >> +{ >> + bool delay = false; >> + unsigned long flags; >> + >> + /* Is the task both reaped and no longer being scheduled? */ >> + raw_spin_lock_irqsave(&task->pi_lock, flags); >> + if ((task->state == TASK_DEAD) && >> + (cmpxchg(&task->exit_state, EXIT_DEAD, EXIT_RCU) == EXIT_DEAD)) >> + delay = true; >> + raw_spin_unlock_irqrestore(&task->pi_lock, flags); >> + >> + /* If both are true use rcu to delay the put_task_struct */ >> + if (delay) >> + call_rcu(&task->rcu, delayed_put_task_struct); >> + else >> + put_task_struct(task); >> +} >> >> void release_task(struct task_struct *p) >> { >> @@ -222,76 +240,13 @@ void release_task(struct task_struct *p) >> >> write_unlock_irq(&tasklist_lock); >> release_thread(p); >> - call_rcu(&p->rcu, delayed_put_task_struct); >> + put_dead_task_struct(p); > > I had a similar change in mind, see below. This is subjective, but to me > it looks more simple and clean. > > Oleg. 
> > diff --git a/include/linux/sched.h b/include/linux/sched.h > index 8dc1811..1f9b021 100644 > --- a/include/linux/sched.h > +++ b/include/linux/sched.h > @@ -1134,7 +1134,10 @@ struct task_struct { > > struct tlbflush_unmap_batch tlb_ubc; > > - struct rcu_head rcu; > + union { > + bool xxx; > + struct rcu_head rcu; > + }; > > /* Cache last used pipe for splice(): */ > struct pipe_inode_info *splice_pipe; > diff --git a/kernel/exit.c b/kernel/exit.c > index a75b6a7..baacfce 100644 > --- a/kernel/exit.c > +++ b/kernel/exit.c > @@ -182,6 +182,11 @@ static void delayed_put_task_struct(struct rcu_head *rhp) > put_task_struct(tsk); > } > > +void call_delayed_put_task_struct(struct task_struct *p) > +{ > + if (xchg(&p->xxx, 1)) > + call_rcu(&p->rcu, delayed_put_task_struct); > +}
I like using the storage we will later use for the rcu_head. Is the intention that your new variable xxx starts as 0, and then only on the second write does it become 1 and we take action? That should work but it is a funny way to encode a decrement. I think it would be more straightforward to use refcount_dec_and_test. So something like this: diff --git a/include/linux/sched.h b/include/linux/sched.h index 9f51932bd543..99a4518b9b17 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1142,7 +1142,10 @@ struct task_struct { struct tlbflush_unmap_batch tlb_ubc; - struct rcu_head rcu; + union { + refcount_t rcu_users; + struct rcu_head rcu; + }; /* Cache last used pipe for splice(): */ struct pipe_inode_info *splice_pipe; diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 0497091e40c1..8bd51af44bf8 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -115,7 +115,7 @@ static inline void put_task_struct(struct task_struct *t) __put_task_struct(t); } -struct task_struct *task_rcu_dereference(struct task_struct **ptask); +void put_task_struct_rcu_user(struct task_struct *task); #ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT extern int arch_task_struct_size __read_mostly; diff --git a/kernel/exit.c b/kernel/exit.c index 5b4a5dcce8f8..a42fd8889036 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -182,6 +182,11 @@ static void delayed_put_task_struct(struct rcu_head *rhp) put_task_struct(tsk); } +void put_task_struct_rcu_user(struct task_struct *task) +{ + if (refcount_dec_and_test(&task->rcu_users)) + call_rcu(&task->rcu, delayed_put_task_struct); +} void release_task(struct task_struct *p) { @@ -222,10 +227,10 @@ void release_task(struct task_struct *p) write_unlock_irq(&tasklist_lock); release_thread(p); - call_rcu(&p->rcu, delayed_put_task_struct); + put_task_struct_rcu_user(p); p = leader; if (unlikely(zap_leader)) goto repeat; } diff --git a/kernel/fork.c b/kernel/fork.c index 2852d0e76ea3..dc4799210e05 100644 --- 
a/kernel/fork.c +++ b/kernel/fork.c @@ -900,11 +900,15 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) if (orig->cpus_ptr == &orig->cpus_mask) tsk->cpus_ptr = &tsk->cpus_mask; - /* - * One for us, one for whoever does the "release_task()" (usually - * parent) + /* One for the user space visible state that goes away when + * the process's zombie is reaped. + * One for the reference from the scheduler. + * + * The reference count is ignored and free_task is called + * directly until copy_process completes. */ - refcount_set(&tsk->usage, 2); + refcount_set(&tsk->rcu_users, 2); + refcount_set(&tsk->usage, 1); /* One for the rcu users */ #ifdef CONFIG_BLK_DEV_IO_TRACE tsk->btrace_seq = 0; #endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2b037f195473..69015b7c28da 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3135,7 +3135,7 @@ static struct rq *finish_task_switch(struct task_struct *prev) /* Task is done with its stack. */ put_task_stack(prev); - put_task_struct(prev); + put_task_struct_rcu_user(prev); } tick_nohz_task_switch(); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 802b1f3405f2..082f8ba2b1f4 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -892,7 +892,7 @@ struct rq { */ unsigned long nr_uninterruptible; - struct task_struct *curr; + struct task_struct __rcu *curr; struct task_struct *idle; struct task_struct *stop; unsigned long next_balance; Eric