On Mon, Jul 29, 2013 at 04:19:48PM +0800, Lai Jiangshan wrote:
> On 07/27/2013 07:19 AM, Paul E. McKenney wrote:
> > From: "Paul E. McKenney" <paul...@linux.vnet.ibm.com>
> > 
> > This commit adds the state machine that takes the per-CPU idle data
> > as input and produces a full-system-idle indication as output.  This
> > state machine is driven out of RCU's quiescent-state-forcing
> > mechanism, which invokes rcu_sysidle_check_cpu() to collect per-CPU
> > idle state and then rcu_sysidle_report() to drive the state machine.
> > 
> > The full-system-idle state is sampled using rcu_sys_is_idle(), which
> > also drives the state machine if RCU is idle (and does so by forcing
> > RCU to become non-idle).  This function returns true if all but the
> > timekeeping CPU (tick_do_timer_cpu) are idle and have been idle long
> > enough to avoid memory contention on the full_sysidle_state state
> > variable.  rcu_sysidle_force_exit() may be called externally to
> > reset the state machine back into non-idle state.
> > 
> > Signed-off-by: Paul E. McKenney <paul...@linux.vnet.ibm.com>
> > Cc: Frederic Weisbecker <fweis...@gmail.com>
> > Cc: Steven Rostedt <rost...@goodmis.org>
> > ---
> >  include/linux/rcupdate.h |  18 +++
> >  kernel/rcutree.c         |  16 ++-
> >  kernel/rcutree.h         |   5 +
> >  kernel/rcutree_plugin.h  | 284 ++++++++++++++++++++++++++++++++++++++++++++++-
> >  4 files changed, 316 insertions(+), 7 deletions(-)
> > 
> > diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
> > index 48f1ef9..1aa8d8c 100644
> > --- a/include/linux/rcupdate.h
> > +++ b/include/linux/rcupdate.h
> > @@ -1011,4 +1011,22 @@ static inline bool rcu_is_nocb_cpu(int cpu) { return false; }
> >  #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
> >  
> >  
> > +/* Only for use by adaptive-ticks code. */
> > +#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
> > +extern bool rcu_sys_is_idle(void);
> > +extern void rcu_sysidle_force_exit(void);
> > +#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
> > +
> > +static inline bool rcu_sys_is_idle(void)
> > +{
> > +   return false;
> > +}
> > +
> > +static inline void rcu_sysidle_force_exit(void)
> > +{
> > +}
> > +
> > +#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
> > +
> > +
> >  #endif /* __LINUX_RCUPDATE_H */
> > diff --git a/kernel/rcutree.c b/kernel/rcutree.c
> > index 725524e..aa6d96e 100644
> > --- a/kernel/rcutree.c
> > +++ b/kernel/rcutree.c
> > @@ -718,6 +718,7 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp,
> >                                      bool *isidle, unsigned long *maxj)
> >  {
> >     rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks);
> > +   rcu_sysidle_check_cpu(rdp, isidle, maxj);
> >     return (rdp->dynticks_snap & 0x1) == 0;
> >  }
> >  
> > @@ -1356,11 +1357,17 @@ int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
> >     rsp->n_force_qs++;
> >     if (fqs_state == RCU_SAVE_DYNTICK) {
> >             /* Collect dyntick-idle snapshots. */
> > +           if (is_sysidle_rcu_state(rsp)) {
> > +                   isidle = 1;
> 
> isidle = true;
> The type of isidle is bool.
> 
> > +                   maxj = jiffies - ULONG_MAX / 4;
> > +           }
> >             force_qs_rnp(rsp, dyntick_save_progress_counter,
> >                          &isidle, &maxj);
> > +           rcu_sysidle_report_gp(rsp, isidle, maxj);
> >             fqs_state = RCU_FORCE_QS;
> >     } else {
> >             /* Handle dyntick-idle and offline CPUs. */
> > +           isidle = 0;
> 
> isidle = false;
> 
> >             force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj);
> >     }
> >     /* Clear flag to prevent immediate re-entry. */
> > @@ -2087,9 +2094,12 @@ static void force_qs_rnp(struct rcu_state *rsp,
> >             cpu = rnp->grplo;
> >             bit = 1;
> >             for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
> > -                   if ((rnp->qsmask & bit) != 0 &&
> > -                       f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj))
> > -                           mask |= bit;
> > +                   if ((rnp->qsmask & bit) != 0) {
> > +                           if ((rnp->qsmaskinit & bit) != 0)
> > +                                   *isidle = 0;
> 
> *isidle = false;

All good catches, fixed.

> > +                           if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj))
> > +                                   mask |= bit;
> > +                   }
> >             }
> >             if (mask != 0) {
> >  
> > diff --git a/kernel/rcutree.h b/kernel/rcutree.h
> > index 1895043..e0de5dc 100644
> > --- a/kernel/rcutree.h
> > +++ b/kernel/rcutree.h
> > @@ -555,6 +555,11 @@ static void rcu_kick_nohz_cpu(int cpu);
> >  static bool init_nocb_callback_list(struct rcu_data *rdp);
> >  static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq);
> >  static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq);
> > +static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
> > +                             unsigned long *maxj);
> > +static bool is_sysidle_rcu_state(struct rcu_state *rsp);
> > +static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
> > +                             unsigned long maxj);
> >  static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp);
> >  
> >  #endif /* #ifndef RCU_TREE_NONCORE */
> > diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
> > index 3edae39..ff84bed 100644
> > --- a/kernel/rcutree_plugin.h
> > +++ b/kernel/rcutree_plugin.h
> > @@ -28,7 +28,7 @@
> >  #include <linux/gfp.h>
> >  #include <linux/oom.h>
> >  #include <linux/smpboot.h>
> > -#include <linux/tick.h>
> > +#include "time/tick-internal.h"
> >  
> >  #define RCU_KTHREAD_PRIO 1
> >  
> > @@ -2395,12 +2395,12 @@ static void rcu_kick_nohz_cpu(int cpu)
> >   * most active flavor of RCU.
> >   */
> >  #ifdef CONFIG_PREEMPT_RCU
> > -static struct rcu_state __maybe_unused *rcu_sysidle_state = &rcu_preempt_state;
> > +static struct rcu_state *rcu_sysidle_state = &rcu_preempt_state;
> >  #else /* #ifdef CONFIG_PREEMPT_RCU */
> > -static struct rcu_state __maybe_unused *rcu_sysidle_state = &rcu_sched_state;
> > +static struct rcu_state *rcu_sysidle_state = &rcu_sched_state;
> >  #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
> >  
> > -static int __maybe_unused full_sysidle_state; /* Current system-idle state. */
> > +static int full_sysidle_state;             /* Current system-idle state. */
> >  #define RCU_SYSIDLE_NOT            0       /* Some CPU is not idle. */
> >  #define RCU_SYSIDLE_SHORT  1       /* All CPUs idle for brief period. */
> >  #define RCU_SYSIDLE_LONG   2       /* All CPUs idle for long enough. */
> > @@ -2444,6 +2444,38 @@ static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq)
> >  }
> >  
> >  /*
> > + * Unconditionally force exit from full system-idle state.  This is
> > + * invoked when a normal CPU exits idle, but must be called separately
> > + * for the timekeeping CPU (tick_do_timer_cpu).  The reason for this
> > + * is that the timekeeping CPU is permitted to take scheduling-clock
> > + * interrupts while the system is in system-idle state, and of course
> > + * rcu_sysidle_exit() has no way of distinguishing a scheduling-clock
> > + * interrupt from any other type of interrupt.
> > + */
> > +void rcu_sysidle_force_exit(void)
> > +{
> > +   int oldstate = ACCESS_ONCE(full_sysidle_state);
> > +   int newoldstate;
> > +
> > +   /*
> > +    * Each pass through the following loop attempts to exit full
> > +    * system-idle state.  If contention proves to be a problem,
> > +    * a trylock-based contention tree could be used here.
> > +    */
> > +   while (oldstate > RCU_SYSIDLE_SHORT) {
> > +           newoldstate = cmpxchg(&full_sysidle_state,
> > +                                 oldstate, RCU_SYSIDLE_NOT);
> > +           if (oldstate == newoldstate &&
> > +               oldstate == RCU_SYSIDLE_FULL_NOTED) {
> > +                   rcu_kick_nohz_cpu(tick_do_timer_cpu);
> > +                   return; /* We cleared it, done! */
> > +           }
> > +           oldstate = newoldstate;
> > +   }
> > +   smp_mb(); /* Order initial oldstate fetch vs. later non-idle work. */
> 
> Why do we need this mb()?
> Which mb() is it paired with?

This one is for the case where we didn't do the cmpxchg() above.  The
idea is that if we saw that oldstate was RCU_SYSIDLE_SHORT or
RCU_SYSIDLE_NOT, then anyone attempting to update the value who has
seen any of our later non-idle activity also "sees" the load from
full_sysidle_state, which reduces the state space a bit.  The barrier
pairs with the various cmpxchg() operations that advance
full_sysidle_state.
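
To make the pairing concrete, here is a minimal userspace C11 analogue
(an illustrative sketch only, not the kernel code: smp_mb() is modeled
as a seq_cst fence, cmpxchg() as a seq_cst compare-exchange, and the
later non-idle work as a store to a made-up flag):

	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdio.h>

	#define RCU_SYSIDLE_NOT   0
	#define RCU_SYSIDLE_SHORT 1
	#define RCU_SYSIDLE_LONG  2

	static atomic_int full_sysidle_state;
	static atomic_int nonidle_work; /* stand-in for later non-idle work */

	/* CPU leaving idle: state was low, so no cmpxchg() was executed. */
	static void *exiting_cpu(void *arg)
	{
		int oldstate = atomic_load(&full_sysidle_state);

		if (oldstate <= RCU_SYSIDLE_SHORT) {
			/* The smp_mb() in question: order the fetch
			 * above before the non-idle work below. */
			atomic_thread_fence(memory_order_seq_cst);
			atomic_store(&nonidle_work, 1);
		}
		return NULL;
	}

	/* Scanner: cmpxchg() implies a full barrier in the kernel. */
	static void *scanning_cpu(void *arg)
	{
		int expected = RCU_SYSIDLE_SHORT;

		atomic_compare_exchange_strong(&full_sysidle_state,
					       &expected, RCU_SYSIDLE_LONG);
		/* If we later observe nonidle_work != 0, the exiting
		 * CPU's fetch of full_sysidle_state is ordered before
		 * that work, which rules out some interleavings. */
		return NULL;
	}

	int main(void)
	{
		pthread_t t1, t2;

		atomic_store(&full_sysidle_state, RCU_SYSIDLE_SHORT);
		pthread_create(&t1, NULL, exiting_cpu, NULL);
		pthread_create(&t2, NULL, scanning_cpu, NULL);
		pthread_join(t1, NULL);
		pthread_join(t2, NULL);
		printf("state=%d work=%d\n",
		       atomic_load(&full_sysidle_state),
		       atomic_load(&nonidle_work));
		return 0;
	}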

                                                        Thanx, Paul

> > +}
> > +
> > +/*
> >   * Invoked to note entry to irq or task transition from idle.  Note that
> >   * usermode execution does -not- count as idle here!  The caller must
> >   * have disabled interrupts.
> > @@ -2476,6 +2508,235 @@ static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq)
> >     atomic_inc(&rdtp->dynticks_idle);
> >     smp_mb__after_atomic_inc();
> >     WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks_idle) & 0x1));
> > +
> > +   /*
> > +    * If we are the timekeeping CPU, we are permitted to be non-idle
> > +    * during a system-idle state.  This must be the case, because
> > +    * the timekeeping CPU has to take scheduling-clock interrupts
> > +    * during the time that the system is transitioning to full
> > +    * system-idle state.  This means that the timekeeping CPU must
> > +    * invoke rcu_sysidle_force_exit() directly if it does anything
> > +    * more than take a scheduling-clock interrupt.
> > +    */
> > +   if (smp_processor_id() == tick_do_timer_cpu)
> > +           return;
> > +
> > +   /* Update system-idle state: We are clearly no longer fully idle! */
> > +   rcu_sysidle_force_exit();
> > +}
> > +
> > +/*
> > + * Check to see if the current CPU is idle.  Note that usermode execution
> > + * does not count as idle.  The caller must have disabled interrupts.
> > + */
> > +static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
> > +                             unsigned long *maxj)
> > +{
> > +   int cur;
> > +   unsigned long j;
> > +   struct rcu_dynticks *rdtp = rdp->dynticks;
> > +
> > +   /*
> > +    * If some other CPU has already reported non-idle, if this is
> > +    * not the flavor of RCU that tracks sysidle state, or if this
> > +    * is an offline or the timekeeping CPU, nothing to do.
> > +    */
> > +   if (!*isidle || rdp->rsp != rcu_sysidle_state ||
> > +       cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu)
> > +           return;
> > +   /* WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu); */
> > +
> > +   /* Pick up current idle and NMI-nesting counter and check. */
> > +   cur = atomic_read(&rdtp->dynticks_idle);
> > +   if (cur & 0x1) {
> > +           *isidle = 0; /* We are not idle! */
> 
> *isidle = false;
> 
> And likewise for the other places that use "isidle".
> 
> > +           return;
> > +   }
> > +   smp_mb(); /* Read counters before timestamps. */
> > +
> > +   /* Pick up timestamps. */
> > +   j = ACCESS_ONCE(rdtp->dynticks_idle_jiffies);
> > +   /* If this CPU entered idle more recently, update maxj timestamp. */
> > +   if (ULONG_CMP_LT(*maxj, j))
> > +           *maxj = j;
> > +}
> > +
> > +/*
> > + * Is this the flavor of RCU that is handling full-system idle?
> > + */
> > +static bool is_sysidle_rcu_state(struct rcu_state *rsp)
> > +{
> > +   return rsp == rcu_sysidle_state;
> > +}
> > +
> > +/*
> > + * Return a delay in jiffies based on the number of CPUs, rcu_node
> > + * leaf fanout, and jiffies tick rate.  The idea is to allow larger
> > + * systems more time to transition to full-idle state in order to
> > + * avoid the cache thrashing that would otherwise occur on the state variable.
> > + * Really small systems (less than a couple of tens of CPUs) should
> > + * instead use a single global atomically incremented counter, and later
> > + * versions of this will automatically reconfigure themselves accordingly.
> > + */
> > +static unsigned long rcu_sysidle_delay(void)
> > +{
> > +   if (nr_cpu_ids <= RCU_SYSIDLE_SMALL)
> > +           return 0;
> > +   return DIV_ROUND_UP(nr_cpu_ids * HZ, rcu_fanout_leaf * 1000);
> > +}
> > +
> > +/*
> > + * Advance the full-system-idle state.  This is invoked when all of
> > + * the non-timekeeping CPUs are idle.
> > + */
> > +static void rcu_sysidle(unsigned long j)
> > +{
> > +   /* Check the current state. */
> > +   switch (ACCESS_ONCE(full_sysidle_state)) {
> > +   case RCU_SYSIDLE_NOT:
> > +
> > +           /* First time all are idle, so note a short idle period. */
> > +           ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_SHORT;
> > +           break;
> > +
> > +   case RCU_SYSIDLE_SHORT:
> > +
> > +           /*
> > +            * Idle for a bit, time to advance to next state?
> > +            * cmpxchg failure means race with non-idle, let them win.
> > +            */
> > +           if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay()))
> > +                   (void)cmpxchg(&full_sysidle_state,
> > +                                 RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG);
> > +           break;
> 
> I don't think it will race with anybody.
> I think ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_LONG is enough.
> 
> Note:
> rcu_sysidle_force_exit() doesn't change full_sysidle_state if it is
> RCU_SYSIDLE_SHORT.
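> 
> Something like this (my untested sketch of the simplification):
> 
> 		if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay()))
> 			ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_LONG;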
> 
> > +
> > +   case RCU_SYSIDLE_LONG:
> > +
> > +           /*
> > +            * Do an additional check pass before advancing to full.
> > +            * cmpxchg failure means race with non-idle, let them win.
> > +            */
> > +           if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay()))
> > +                   (void)cmpxchg(&full_sysidle_state,
> > +                                 RCU_SYSIDLE_LONG, RCU_SYSIDLE_FULL);
> > +           break;
> > +
> > +   default:
> > +           break;
> > +   }
> > +}
> > +
> > +/*
> > + * Found a non-idle non-timekeeping CPU, so kick the system-idle state
> > + * back to the beginning.
> > + */
> > +static void rcu_sysidle_cancel(void)
> > +{
> > +   smp_mb();
> > +   ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT;
> > +}
> > +
> > +/*
> > + * Update the sysidle state based on the results of a force-quiescent-state
> > + * scan of the CPUs' dyntick-idle state.
> > + */
> > +static void rcu_sysidle_report(struct rcu_state *rsp, int isidle,
> > +                          unsigned long maxj, bool gpkt)
> > +{
> > +   if (rsp != rcu_sysidle_state)
> > +           return;  /* Wrong flavor, ignore. */
> > +   if (isidle) {
> > +           if (gpkt && nr_cpu_ids > RCU_SYSIDLE_SMALL)
> > +                   rcu_sysidle(maxj);    /* More idle! */
> > +   } else {
> > +           rcu_sysidle_cancel(); /* Idle is over. */
> > +   }
> > +}
> 
> "gpkt" is always equal to "nr_cpu_ids > RCU_SYSIDLE_SMALL",
> 
> so we can remove "gpkt" argument and rcu_sysidle_report_gp().
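> 
> With that, rcu_sysidle_report() could collapse to something like this
> (my untested sketch):
> 
> 	static void rcu_sysidle_report(struct rcu_state *rsp, int isidle,
> 				       unsigned long maxj)
> 	{
> 		if (rsp != rcu_sysidle_state)
> 			return;  /* Wrong flavor, ignore. */
> 		if (isidle) {
> 			if (nr_cpu_ids > RCU_SYSIDLE_SMALL)
> 				rcu_sysidle(maxj);    /* More idle! */
> 		} else {
> 			rcu_sysidle_cancel(); /* Idle is over. */
> 		}
> 	}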
> 
> 
> > +
> > +static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
> > +                             unsigned long maxj)
> > +{
> > +   rcu_sysidle_report(rsp, isidle, maxj, true);
> > +}
> > +
> > +/* Callback and function for forcing an RCU grace period. */
> > +struct rcu_sysidle_head {
> > +   struct rcu_head rh;
> > +   int inuse;
> > +};
> > +
> > +static void rcu_sysidle_cb(struct rcu_head *rhp)
> > +{
> > +   struct rcu_sysidle_head *rshp;
> > +
> > +   smp_mb();  /* grace period precedes setting inuse. */
> 
> Why do we need this mb()?
> 
> > +   rshp = container_of(rhp, struct rcu_sysidle_head, rh);
> > +   ACCESS_ONCE(rshp->inuse) = 0;
> > +}
> > +
> > +/*
> > + * Check to see if the system is fully idle, other than the timekeeping CPU.
> > + * The caller must have disabled interrupts.
> > + */
> > +bool rcu_sys_is_idle(void)
> > +{
> > +   static struct rcu_sysidle_head rsh;
> > +   int rss = ACCESS_ONCE(full_sysidle_state);
> > +
> > +   if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu))
> > +           return false;
> > +
> > +   /* Handle small-system case by doing a full scan of CPUs. */
> > +   if (nr_cpu_ids <= RCU_SYSIDLE_SMALL) {
> > +           int oldrss = rss - 1;
> > +
> > +           /*
> > +            * One pass to advance to each state up to _FULL.
> > +            * Give up if any pass fails to advance the state.
> > +            */
> > +           while (rss < RCU_SYSIDLE_FULL && oldrss < rss) {
> > +                   int cpu;
> > +                   bool isidle = true;
> > +                   unsigned long maxj = jiffies - ULONG_MAX / 4;
> > +                   struct rcu_data *rdp;
> > +
> > +                   /* Scan all the CPUs looking for nonidle CPUs. */
> > +                   for_each_possible_cpu(cpu) {
> > +                           rdp = per_cpu_ptr(rcu_sysidle_state->rda, cpu);
> > +                           rcu_sysidle_check_cpu(rdp, &isidle, &maxj);
> > +                           if (!isidle)
> > +                                   break;
> > +                   }
> > +                   rcu_sysidle_report(rcu_sysidle_state,
> > +                                      isidle, maxj, false);
> > +                   oldrss = rss;
> > +                   rss = ACCESS_ONCE(full_sysidle_state);
> > +           }
> > +   }
> > +
> > +   /* If this is the first observation of an idle period, record it. */
> > +   if (rss == RCU_SYSIDLE_FULL) {
> > +           rss = cmpxchg(&full_sysidle_state,
> > +                         RCU_SYSIDLE_FULL, RCU_SYSIDLE_FULL_NOTED);
> > +           return rss == RCU_SYSIDLE_FULL;
> > +   }
> > +
> > +   smp_mb(); /* ensure rss load happens before later caller actions. */
> > +
> > +   /* If already fully idle, tell the caller (in case of races). */
> > +   if (rss == RCU_SYSIDLE_FULL_NOTED)
> > +           return true;
> > +
> > +   /*
> > +    * If we aren't there yet, and a grace period is not in flight,
> > +    * initiate a grace period.  Either way, tell the caller that
> > +    * we are not there yet.
> > +    */
> > +   if (nr_cpu_ids > RCU_SYSIDLE_SMALL &&
> > +       !rcu_gp_in_progress(rcu_sysidle_state) &&
> > +       !rsh.inuse && xchg(&rsh.inuse, 1) == 0)
> 
> Why do we need to use xchg()?  Who will it race with?
> 
> > +           call_rcu(&rsh.rh, rcu_sysidle_cb);
> > +   return false;
> >  }
> >  
> >  /*
> > @@ -2496,6 +2757,21 @@ static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq)
> >  {
> >  }
> >  
> > +static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
> > +                             unsigned long *maxj)
> > +{
> > +}
> > +
> > +static bool is_sysidle_rcu_state(struct rcu_state *rsp)
> > +{
> > +   return false;
> > +}
> > +
> > +static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
> > +                             unsigned long maxj)
> > +{
> > +}
> > +
> >  static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp)
> >  {
> >  }
> 
