This patch enables the accumulation of kicking and waiting related
PV qspinlock statistics when the new QUEUED_LOCK_STAT configuration
option is selected. It also enables the collection of kicking and
wakeup latencies which have a heavy dependency on the CPUs being used.

The measured latencies for different CPUs are:

        CPU             Wakeup          Kicking
        ---             ------          -------
        Haswell-EX      89.8us           7.4us
        Westmere-EX     67.6us           9.3us

The measured latencies varied a bit from run-to-run. The wakeup
latency is much higher than the kicking latency.

A sample of statistics counts after a kernel build (no CPU overcommit)
was:

hash_count=93
hash_hops_count=93
kick_latencies=6997535456
kick_time_count=755426
lock_kick_count=755354
pending_fail_count=10619
pending_lock_count=2912098
spurious_wakeup=8
unlock_kick_count=87
wait_head_count=91
wait_node_count=755359
wake_latencies=51052626984

Signed-off-by: Waiman Long <waiman.l...@hp.com>
---
 arch/x86/Kconfig                    |    7 ++
 kernel/locking/qspinlock_paravirt.h |  165 ++++++++++++++++++++++++++++++++++-
 2 files changed, 170 insertions(+), 2 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 55bced1..299a1c4 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -679,6 +679,13 @@ config PARAVIRT_SPINLOCKS
 
          If you are unsure how to answer this question, answer Y.
 
+config QUEUED_LOCK_STAT
+       bool "Paravirt queued lock statistics"
+       depends on PARAVIRT && DEBUG_FS && QUEUED_SPINLOCKS
+       ---help---
+         Enable the collection of statistical data on the behavior of
+         paravirtualized queued spinlocks and report them on debugfs.
+
 source "arch/x86/xen/Kconfig"
 
 config KVM_GUEST
diff --git a/kernel/locking/qspinlock_paravirt.h 
b/kernel/locking/qspinlock_paravirt.h
index c43dec7..c8485c4 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -43,6 +43,153 @@ struct pv_node {
 };
 
 /*
+ * PV qspinlock statistics
+ */
+enum pv_qlock_stat {
+       pvstat_wait_head,
+       pvstat_wait_node,
+       pvstat_kick_time,
+       pvstat_lock_kick,
+       pvstat_unlock_kick,
+       pvstat_pend_lock,
+       pvstat_pend_fail,
+       pvstat_spurious,
+       pvstat_hash,
+       pvstat_hops,
+       pvstat_num      /* Total number of statistics counts */
+};
+
+#ifdef CONFIG_QUEUED_LOCK_STAT
+/*
+ * Collect pvqspinlock statiatics
+ */
+#include <linux/debugfs.h>
+#include <linux/sched.h>
+
+static const char * const stat_fsnames[pvstat_num] = {
+       [pvstat_wait_head]   = "wait_head_count",
+       [pvstat_wait_node]   = "wait_node_count",
+       [pvstat_kick_time]   = "kick_time_count",
+       [pvstat_lock_kick]   = "lock_kick_count",
+       [pvstat_unlock_kick] = "unlock_kick_count",
+       [pvstat_pend_lock]   = "pending_lock_count",
+       [pvstat_pend_fail]   = "pending_fail_count",
+       [pvstat_spurious]    = "spurious_wakeup",
+       [pvstat_hash]        = "hash_count",
+       [pvstat_hops]        = "hash_hops_count",
+};
+
+static atomic_t pvstats[pvstat_num];
+
+/*
+ * pv_kick_latencies = sum of all pv_kick latencies in ns
+ * pv_wake_latencies = sum of all wakeup latencies in ns
+ *
+ * Avg kick latency = pv_kick_latencies/(lock_kick_count + unlock_kick_count)
+ * Avg wake latency = pv_wake_latencies/kick_time_count
+ */
+static atomic64_t pv_kick_latencies, pv_wake_latencies;
+static DEFINE_PER_CPU(u64, pv_kick_time);
+
+/*
+ * Reset all the statistics counts if set
+ */
+static bool reset_cnts __read_mostly;
+
+/*
+ * Initialize debugfs for the PV qspinlock statistics
+ */
+static int __init pv_qspinlock_debugfs(void)
+{
+       struct dentry *d_pvqlock = debugfs_create_dir("pv-qspinlock", NULL);
+       int i;
+
+       if (!d_pvqlock)
+               pr_warn("Could not create 'pv-qspinlock' debugfs directory\n");
+
+       for (i = 0; i < pvstat_num; i++)
+               debugfs_create_u32(stat_fsnames[i], 0444, d_pvqlock,
+                                 (u32 *)&pvstats[i]);
+       debugfs_create_u64("kick_latencies", 0444, d_pvqlock,
+                          (u64 *)&pv_kick_latencies);
+       debugfs_create_u64("wake_latencies", 0444, d_pvqlock,
+                          (u64 *)&pv_wake_latencies);
+       debugfs_create_bool("reset_cnts", 0644, d_pvqlock, (u32 *)&reset_cnts);
+       return 0;
+}
+fs_initcall(pv_qspinlock_debugfs);
+
+/*
+ * Reset all the counts
+ */
+static noinline void pvstat_reset(void)
+{
+       int i;
+
+       for (i = 0; i < pvstat_num; i++)
+               atomic_set(&pvstats[i], 0);
+       atomic64_set(&pv_kick_latencies, 0);
+       atomic64_set(&pv_wake_latencies, 0);
+       reset_cnts = 0;
+}
+
+/*
+ * Increment the PV qspinlock statistics counts
+ */
+static inline void pvstat_inc(enum pv_qlock_stat stat)
+{
+       atomic_inc(&pvstats[stat]);
+       if (unlikely(reset_cnts))
+               pvstat_reset();
+}
+
+/*
+ * PV hash hop count
+ */
+static inline void pvstat_hop(int hopcnt)
+{
+       atomic_inc(&pvstats[pvstat_hash]);
+       atomic_add(hopcnt, &pvstats[pvstat_hops]);
+}
+
+/*
+ * Replacement function for pv_kick()
+ */
+static inline void __pv_kick(int cpu)
+{
+       u64 start = sched_clock();
+
+       *per_cpu_ptr(&pv_kick_time, cpu) = start;
+       pv_kick(cpu);
+       atomic64_add(sched_clock() - start, &pv_kick_latencies);
+}
+
+/*
+ * Replacement function for pv_wait()
+ */
+static inline void __pv_wait(u8 *ptr, u8 val)
+{
+       u64 *pkick_time = this_cpu_ptr(&pv_kick_time);
+
+       *pkick_time = 0;
+       pv_wait(ptr, val);
+       if (*pkick_time) {
+               atomic64_add(sched_clock() - *pkick_time, &pv_wake_latencies);
+               pvstat_inc(pvstat_kick_time);
+       }
+}
+
+#define pv_kick(c)     __pv_kick(c)
+#define pv_wait(p, v)  __pv_wait(p, v)
+
+#else /* CONFIG_QUEUED_LOCK_STAT */
+
+static inline void pvstat_inc(enum pv_qlock_stat stat) { }
+static inline void pvstat_hop(int hopcnt)              { }
+
+#endif /* CONFIG_QUEUED_LOCK_STAT */
+
+/*
  * Lock and MCS node addresses hash table for fast lookup
  *
  * Hashing is done on a per-cacheline basis to minimize the need to access
@@ -102,10 +249,13 @@ static struct qspinlock **pv_hash(struct qspinlock *lock, 
struct pv_node *node)
 {
        unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
        struct pv_hash_entry *he;
+       int hopcnt = 0;
 
        for_each_hash_entry(he, offset, hash) {
+               hopcnt++;
                if (!cmpxchg(&he->lock, NULL, lock)) {
                        WRITE_ONCE(he->node, node);
+                       pvstat_hop(hopcnt);
                        return &he->lock;
                }
        }
@@ -208,11 +358,13 @@ static int pv_pending_lock(struct qspinlock *lock, u32 
val)
         * Clear the pending bit and fall back to queuing
         */
        clear_pending(lock);
+       pvstat_inc(pvstat_pend_fail);
 
 queue:
        return 0;
 
 gotlock:
+       pvstat_inc(pvstat_pend_lock);
        return 1;
 }
 
@@ -243,14 +395,18 @@ static void pv_wait_node(struct mcs_spinlock *node)
                 */
                smp_store_mb(pn->state, vcpu_halted);
 
-               if (!READ_ONCE(node->locked))
+               if (!READ_ONCE(node->locked)) {
+                       pvstat_inc(pvstat_wait_node);
                        pv_wait(&pn->state, vcpu_halted);
+               }
 
                /*
                 * Reset the vCPU state to avoid unncessary CPU kicking
                 */
                WRITE_ONCE(pn->state, vcpu_running);
 
+               if (READ_ONCE(node->locked))
+                       break;
                /*
                 * If the locked flag is still not set after wakeup, it is a
                 * spurious wakeup and the vCPU should wait again. However,
@@ -258,6 +414,7 @@ static void pv_wait_node(struct mcs_spinlock *node)
                 * So it is better to spin for a while in the hope that the
                 * MCS lock will be released soon.
                 */
+               pvstat_inc(pvstat_spurious);
        }
        /*
         * By now our node->locked should be 1 and our caller will not actually
@@ -283,8 +440,10 @@ static void pv_kick_node(struct mcs_spinlock *node)
         *
         * See the comment in pv_wait_node().
         */
-       if (xchg(&pn->state, vcpu_running) == vcpu_halted)
+       if (xchg(&pn->state, vcpu_running) == vcpu_halted) {
+               pvstat_inc(pvstat_lock_kick);
                pv_kick(pn->cpu);
+       }
 }
 
 /*
@@ -326,6 +485,7 @@ static void pv_wait_head(struct qspinlock *lock, struct 
mcs_spinlock *node)
                                return;
                        }
                }
+               pvstat_inc(pvstat_wait_head);
                pv_wait(&l->locked, _Q_SLOW_VAL);
 
                /*
@@ -376,6 +536,7 @@ __visible void __pv_queued_spin_unlock(struct qspinlock 
*lock)
         * At this point the memory pointed at by lock can be freed/reused,
         * however we can still use the pv_node to kick the CPU.
         */
+       pvstat_inc(pvstat_unlock_kick);
        pv_kick(node->cpu);
 }
 /*
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to