On Fri, 26 Feb 2021 10:28:00 -0800 Wei Wang wrote: > Hi Martin, > Could you help try the following new patch on your setup and let me > know if there are still issues?
FWIW your email got line wrapped for me. > diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h > index ddf4cfc12615..9ed0f89ccdd5 100644 > --- a/include/linux/netdevice.h > +++ b/include/linux/netdevice.h > @@ -357,9 +357,10 @@ enum { > NAPI_STATE_NPSVC, /* Netpoll - don't dequeue > from poll_list */ > NAPI_STATE_LISTED, /* NAPI added to system lists */ > NAPI_STATE_NO_BUSY_POLL, /* Do not add in napi_hash, no > busy polling */ > - NAPI_STATE_IN_BUSY_POLL, /* sk_busy_loop() owns this NAPI */ > + NAPI_STATE_IN_BUSY_POLL, /* sk_busy_loop() grabs SHED nit: SHED -> SCHED > bit and could busy poll */ > NAPI_STATE_PREFER_BUSY_POLL, /* prefer busy-polling over > softirq processing*/ > NAPI_STATE_THREADED, /* The poll is performed > inside its own thread*/ > + NAPI_STATE_SCHED_BUSY_POLL, /* Napi is currently scheduled > in busy poll mode */ nit: Napi -> NAPI ? > }; > > enum { > @@ -372,6 +373,7 @@ enum { > NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL), > NAPIF_STATE_PREFER_BUSY_POLL = BIT(NAPI_STATE_PREFER_BUSY_POLL), > NAPIF_STATE_THREADED = BIT(NAPI_STATE_THREADED), > + NAPIF_STATE_SCHED_BUSY_POLL = BIT(NAPI_STATE_SCHED_BUSY_POLL), > }; > > enum gro_result { > diff --git a/net/core/dev.c b/net/core/dev.c > index 6c5967e80132..c717b67ce137 100644 > --- a/net/core/dev.c > +++ b/net/core/dev.c > @@ -1501,15 +1501,14 @@ static int napi_kthread_create(struct napi_struct *n) > { > int err = 0; > > - /* Create and wake up the kthread once to put it in > - * TASK_INTERRUPTIBLE mode to avoid the blocked task > - * warning and work with loadavg. > + /* Avoid using kthread_run() here to prevent race > + * between softirq and kthread polling. > */ > - n->thread = kthread_run(napi_threaded_poll, n, "napi/%s-%d", > - n->dev->name, n->napi_id); > + n->thread = kthread_create(napi_threaded_poll, n, "napi/%s-%d", > + n->dev->name, n->napi_id); I'm not sure this takes care of rapid: dev_set_threaded(0) # NAPI gets sent to sirq dev_set_threaded(1) since subsequent set_threaded(1) doesn't spawn the thread "afresh". > if (IS_ERR(n->thread)) { > err = PTR_ERR(n->thread); > - pr_err("kthread_run failed with err %d\n", err); > + pr_err("kthread_create failed with err %d\n", err); > n->thread = NULL; > } > > @@ -6486,6 +6485,7 @@ bool napi_complete_done(struct napi_struct *n, > int work_done) > WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED)); > > new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED | > + NAPIF_STATE_SCHED_BUSY_POLL | > NAPIF_STATE_PREFER_BUSY_POLL); > > /* If STATE_MISSED was set, leave STATE_SCHED set, > @@ -6525,6 +6525,7 @@ static struct napi_struct *napi_by_id(unsigned > int napi_id) > > static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule) > { > + clear_bit(NAPI_STATE_SCHED_BUSY_POLL, &napi->state); > if (!skip_schedule) { > gro_normal_list(napi); > __napi_schedule(napi); > @@ -6624,7 +6625,8 @@ void napi_busy_loop(unsigned int napi_id, > } > if (cmpxchg(&napi->state, val, > val | NAPIF_STATE_IN_BUSY_POLL | > - NAPIF_STATE_SCHED) != val) { > + NAPIF_STATE_SCHED | > + NAPIF_STATE_SCHED_BUSY_POLL) != > val) { > if (prefer_busy_poll) > > set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); > goto count; > @@ -6971,7 +6973,10 @@ static int napi_thread_wait(struct napi_struct *napi) > set_current_state(TASK_INTERRUPTIBLE); > > while (!kthread_should_stop() && !napi_disable_pending(napi)) { > - if (test_bit(NAPI_STATE_SCHED, &napi->state)) { > + unsigned long val = READ_ONCE(napi->state); > + > + if (val & NAPIF_STATE_SCHED && > + !(val & NAPIF_STATE_SCHED_BUSY_POLL)) { Again, not protected from the napi_disable() case AFAICT. > WARN_ON(!list_empty(&napi->poll_list)); > __set_current_state(TASK_RUNNING); > return 0;