[PATCH RFC 2/3] mutex: restrict mutex spinning to only one task per mutex

2013-04-04 Thread Waiman Long
The current mutex spinning code allows multiple tasks to spin on a
single mutex concurrently. There are two major problems with this
approach:

 1. It is not very energy efficient, as the spinning tasks are not
doing useful work. The spinning tasks may also block other, more
important or useful tasks from running because preemption is disabled
while spinning. Only one of the spinners will get the mutex at any
time; the other spinners will have to wait much longer to get it.

 2. The mutex data structure on x86-64 should be 32 bytes. The spinning
code spins on lock->owner which, in most cases, should be in the same
64-byte cache line as the lock->wait_lock spinlock. As a result,
the mutex spinners contend for the same cache line as other CPUs
trying to get the spinlock, increasing the time spent both on the
spinlock and on the mutex spinning.

These problems are worse on systems with a large number of CPUs. One
way to reduce the effect of these two problems is to allow only one
task to spin on a mutex at any time.

This patch adds a new spinner field to struct mutex to limit the
number of spinners to one task. That will increase the size of
the mutex by 8 bytes in a 64-bit environment (4 bytes in a 32-bit
environment).
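
Conceptually, the gate added by this patch can be sketched as follows.
The helper names are made up for illustration; the real change is
open-coded in __mutex_lock_common(), as the diff below shows.

/*
 * Illustrative sketch only -- not the literal patch.  Only the first
 * task that installs itself in lock->spinner is allowed to spin.
 */
static inline bool mutex_become_spinner(struct mutex *lock)
{
	if (lock->spinner || cmpxchg(&lock->spinner, NULL, current) != NULL)
		return false;		/* someone is already spinning */
	return true;
}

static inline void mutex_done_spinning(struct mutex *lock)
{
	/* Clear the slot whether or not the mutex was acquired. */
	lock->spinner = NULL;
}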

The AIM7 benchmarks were run on 3.7.10-derived kernels to show the
performance changes with this patch on an 8-socket 80-core system
with hyperthreading off.  The table below shows the mean % change
in performance over a range of users for some AIM7 workloads with
just the less-atomic-operations patch (patch 1) vs. the
less-atomic-operations patch plus this one (patches 1+2).

+--------------+-----------------+-----------------+-----------------+
|   Workload   | mean % change   | mean % change   | mean % change   |
|              | 10-100 users    | 200-1000 users  | 1100-2000 users |
+--------------+-----------------+-----------------+-----------------+
| alltests     |      -0.2%      |      -3.8%      |      -4.2%      |
| five_sec     |      -0.6%      |      -2.0%      |      -2.4%      |
| fserver      |      +2.2%      |     +16.2%      |      +2.2%      |
| high_systime |      -0.3%      |      -4.3%      |      -3.0%      |
| new_fserver  |      +3.9%      |     +16.0%      |      +9.5%      |
| shared       |      -1.7%      |      -5.0%      |      -4.0%      |
| short        |      -7.7%      |      +0.2%      |      +1.3%      |
+--------------+-----------------+-----------------+-----------------+

It can be seen that this patch improves performance for the fserver and
new_fserver workloads at the cost of a slight drop in performance for
the other workloads.

Signed-off-by: Waiman Long 
Reviewed-by: Davidlohr Bueso 
---
 include/linux/mutex.h |3 +++
 kernel/mutex.c|   12 
 2 files changed, 15 insertions(+), 0 deletions(-)

diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index 9121595..dd8fdb8 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -50,6 +50,9 @@ struct mutex {
atomic_tcount;
spinlock_t  wait_lock;
struct list_headwait_list;
+#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
+   struct task_struct  *spinner;
+#endif
 #if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_SMP)
struct task_struct  *owner;
 #endif
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 5e5ea03..965f59f 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -158,7 +158,12 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 *
 * We can't do this for DEBUG_MUTEXES because that relies on wait_lock
 * to serialize everything.
+*
+* Only the first task is allowed to spin on a given mutex, and that
+* task will put its task_struct pointer into the spinner field.
 */
+   if (lock->spinner || (cmpxchg(&lock->spinner, NULL, current) != NULL))
+   goto slowpath;
 
for (;;) {
struct task_struct *owner;
@@ -175,6 +180,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
(atomic_cmpxchg(&lock->count, 1, 0) == 1)) {
lock_acquired(&lock->dep_map, ip);
mutex_set_owner(lock);
+   lock->spinner = NULL;
preempt_enable();
return 0;
}
@@ -196,6 +202,12 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 */
arch_mutex_cpu_relax();
}
+
+   /*
+* Done with spinning
+*/
+   lock->spinner = NULL;
+slowpath:
 #endif
spin_lock_mutex(&lock->wait_lock, flags);
 
-- 
1.7.1


[PATCH RFC 1/3] mutex: Make more scalable by doing less atomic operations

2013-04-04 Thread Waiman Long

+--------------+---------------+----------------+-----------------+
|   Workload   | mean % change | mean % change  | mean % change   |
|              | 10-100 users  | 200-1000 users | 1100-2000 users |
+--------------+---------------+----------------+-----------------+
| alltests     |     +0.6%     |    +104.2%     |     +185.9%     |
| five_sec     |     +1.9%     |      +0.9%     |       +0.9%     |
| fserver      |     +1.4%     |      -7.7%     |       +5.1%     |
| new_fserver  |     -0.5%     |      +3.2%     |       +3.1%     |
| shared       |    +13.1%     |    +146.1%     |     +181.5%     |
| short        |     +7.4%     |      +5.0%     |       +4.2%     |
+--------------+---------------+----------------+-----------------+
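
Since the description of this patch is truncated in this archive, the
core idea can be summarized by the sketch below, pieced together from
the hunks that follow. The helper names are hypothetical; the real
changes are open-coded in __mutex_lock_common():

/*
 * Sketch only: issue the expensive locked instruction only when a plain
 * read of the counter says it can still succeed.
 */
static inline int mutex_try_fastpath(struct mutex *lock)
{
	/* 1 means unlocked; skip the cmpxchg when it cannot succeed */
	return atomic_read(&lock->count) == 1 &&
	       atomic_cmpxchg(&lock->count, 1, 0) == 1;
}

static inline int mutex_try_acquire_as_waiter(struct mutex *lock)
{
	/*
	 * MUTEX_SHOULD_XCHG_COUNT() lets an architecture (x86 below) skip
	 * the xchg when the counter is already negative, i.e. already
	 * marked as having waiters.
	 */
	return MUTEX_SHOULD_XCHG_COUNT(lock) &&
	       atomic_xchg(&lock->count, -1) == 1;
}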

Signed-off-by: Waiman Long 
Reviewed-by: Davidlohr Bueso 
---
 arch/x86/include/asm/mutex.h |   16 
 kernel/mutex.c   |9 ++---
 kernel/mutex.h   |8 
 3 files changed, 30 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/mutex.h b/arch/x86/include/asm/mutex.h
index 7d3a482..aa6a3ec 100644
--- a/arch/x86/include/asm/mutex.h
+++ b/arch/x86/include/asm/mutex.h
@@ -3,3 +3,19 @@
 #else
 # include 
 #endif
+
+#ifndef __ASM_MUTEX_H
+#define __ASM_MUTEX_H
+
+#ifdef MUTEX_SHOULD_XCHG_COUNT
+#undef MUTEX_SHOULD_XCHG_COUNT
+#endif
+/*
+ * For the x86 architecture, it allows any negative number (besides -1) in
+ * the mutex counter to indicate that some other threads are waiting on the
+ * mutex. So the atomic_xchg() function should not be called in
+ * __mutex_lock_common() if the value of the counter has already been set
+ * to a negative number.
+ */
+#define MUTEX_SHOULD_XCHG_COUNT(mutex) (atomic_read(&(mutex)->count) >= 0)
+#endif
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 52f2301..5e5ea03 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -171,7 +171,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
if (owner && !mutex_spin_on_owner(lock, owner))
break;
 
-   if (atomic_cmpxchg(&lock->count, 1, 0) == 1) {
+   if ((atomic_read(&lock->count) == 1) &&
+   (atomic_cmpxchg(&lock->count, 1, 0) == 1)) {
lock_acquired(&lock->dep_map, ip);
mutex_set_owner(lock);
preempt_enable();
@@ -205,7 +206,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
list_add_tail(&waiter.list, &lock->wait_list);
waiter.task = task;
 
-   if (atomic_xchg(&lock->count, -1) == 1)
+   if (MUTEX_SHOULD_XCHG_COUNT(lock) &&
+  (atomic_xchg(&lock->count, -1) == 1))
goto done;
 
lock_contended(&lock->dep_map, ip);
@@ -220,7 +222,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 * that when we release the lock, we properly wake up the
 * other waiters:
 */
-   if (atomic_xchg(&lock->count, -1) == 1)
+   if (MUTEX_SHOULD_XCHG_COUNT(lock) &&
+  (atomic_xchg(&lock->count, -1) == 1))
break;
 
/*
diff --git a/kernel/mutex.h b/kernel/mutex.h
index 4115fbf..b873f8e 100644
--- a/kernel/mutex.h
+++ b/kernel/mutex.h
@@ -46,3 +46,11 @@ static inline void
 debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter)
 {
 }
+
+/*
+ * The atomic_xchg() function should not be called in __mutex_lock_common()
+ * if the value of the counter has already been set to -1.
+ */
+#ifndef MUTEX_SHOULD_XCHG_COUNT
+#define MUTEX_SHOULD_XCHG_COUNT(mutex)	(atomic_read(&(mutex)->count) != -1)
+#endif
-- 
1.7.1



[PATCH RFC 0/3] mutex: Improve mutex performance by doing less atomic-ops & spinning

2013-04-04 Thread Waiman Long
This patch set is a collection of 3 different mutex-related patches
aimed at improving mutex performance, especially on systems with a
large number of CPUs. This is achieved by doing fewer atomic operations
and less mutex spinning (when CONFIG_MUTEX_SPIN_ON_OWNER is on).

The first patch reduces the number of atomic operations executed. It
can produce a dramatic performance improvement in the AIM7 benchmark
with a large number of CPUs. For example, there was a more than 3X
improvement in the high_systime workload with a 3.7.10 kernel on
an 8-socket x86-64 system with 80 cores. The 3.8 kernels, on the
other hand, are no longer mutex limited for that workload. So the
performance improvement is only about 1% for the high_systime workload.

Patches 2 and 3 represent different ways to reduce mutex spinning. Of
the two, the third one is better from both a performance perspective
and the fact that no mutex data structure change is needed. See the
individual patch descriptions for more information on those patches.

The table below shows the performance impact on the AIM7 benchmark with
a 3.8.5 kernel running on the same 8-socket system mentioned above:

+--------------+-----------------+-----------------+------------------+
|   Workload   |            Mean % Change 10-100 users                |
|              +-----------------+-----------------+------------------+
|              |   Patches 1+2   |   Patches 1+3   | Relative %Change |
+--------------+-----------------+-----------------+------------------+
| fserver      |      +1.7%      |       0.0%      |      -1.7%       |
| new_fserver  |      -0.2%      |      -1.5%      |      -1.2%       |
+--------------+-----------------+-----------------+------------------+
|   Workload   |           Mean % Change 100-1000 users               |
|              +-----------------+-----------------+------------------+
|              |   Patches 1+2   |   Patches 1+3   | Relative %Change |
+--------------+-----------------+-----------------+------------------+
| fserver      |     +18.6%      |     +43.4%      |     +21.0%       |
| new_fserver  |     +14.0%      |     +23.4%      |      +8.2%       |
+--------------+-----------------+-----------------+------------------+
|   Workload   |           Mean % Change 1100-2000 users              |
|              +-----------------+-----------------+------------------+
|              |   Patches 1+2   |   Patches 1+3   | Relative %Change |
+--------------+-----------------+-----------------+------------------+
| fserver      |     +11.6%      |      +5.1%      |      -5.8%       |
| new_fserver  |     +13.3%      |      +7.6%      |      -5.0%       |
+--------------+-----------------+-----------------+------------------+

So patch 2 is better at low and high load. Patch 3 is better at
intermediate load. For other AIM7 workloads, patch 3 is generally
better.

Waiman Long (3):
  mutex: Make more scalable by doing less atomic operations
  mutex: restrict mutex spinning to only one task per mutex
  mutex: dynamically disable mutex spinning at high load

 arch/x86/include/asm/mutex.h |   16 
 include/linux/mutex.h|3 +++
 kernel/mutex.c   |   21 ++---
 kernel/mutex.h   |8 
 kernel/sched/core.c  |   22 ++
 5 files changed, 67 insertions(+), 3 deletions(-)



[PATCH RFC 3/3] mutex: dynamically disable mutex spinning at high load

2013-04-04 Thread Waiman Long
The Linux mutex code has a MUTEX_SPIN_ON_OWNER configuration
option that was enabled by default in major distributions like Red
Hat. Allowing threads waiting on a mutex to spin while the mutex owner
is running will theoretically reduce the latency of mutex acquisition
at the expense of energy efficiency, as the spinning threads do no
useful work.

This is not a problem on a lightly loaded system where the CPU may
be idle anyway. On a highly loaded system, the spinning tasks may
block other tasks from running, even ones with higher priority,
because the spinning is done with preemption disabled.

This patch disables mutex spinning if the current load is high
enough. The load is considered high if there are 2 or more active tasks
waiting to run on the current CPU. If there is only one task waiting,
it also checks the average load over the past minute (calc_load_tasks);
if that is more than double the number of active CPUs, the load is
considered high as well.  This is a rather simple metric that does
not incur much additional overhead.
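
The heuristic can be summarized by the sketch below. The helper name
is made up for illustration; the actual change is open-coded in
mutex_spin_on_owner(), as shown in the diff further down.

/*
 * Sketch only: return true if the current CPU looks too busy for mutex
 * spinning to be worthwhile.
 */
static inline bool mutex_spin_load_too_high(void)
{
	unsigned int nrun = this_rq()->nr_running;

	if (nrun >= 3)
		return true;	/* 2 or more other tasks waiting to run here */
	if (nrun == 2) {
		/* one other task waiting: also check the global load */
		long active = atomic_long_read(&calc_load_tasks);

		if (active > 2 * num_online_cpus())
			return true;
	}
	return false;
}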

The AIM7 benchmarks were run on 3.7.10-derived kernels to show the
performance changes with this patch on an 8-socket 80-core system
with hyperthreading off.  The table below shows the mean % change
in performance over a range of users for some AIM7 workloads with
just the less-atomic-operations patch (patch 1) vs. the
less-atomic-operations patch plus this one (patches 1+3).

+--------------+-----------------+-----------------+-----------------+
|   Workload   | mean % change   | mean % change   | mean % change   |
|              | 10-100 users    | 200-1000 users  | 1100-2000 users |
+--------------+-----------------+-----------------+-----------------+
| alltests     |       0.0%      |      -0.1%      |      +5.0%      |
| five_sec     |      +1.5%      |      +1.3%      |      +1.3%      |
| fserver      |      +1.5%      |     +25.4%      |      +9.6%      |
| high_systime |      +0.1%      |       0.0%      |      +0.8%      |
| new_fserver  |      +0.2%      |     +11.9%      |     +14.1%      |
| shared       |      -1.2%      |      +0.3%      |      +1.8%      |
| short        |      +6.4%      |      +2.5%      |      +3.0%      |
+--------------+-----------------+-----------------+-----------------+

It can be seen that this patch provides a big performance improvement
for the fserver and new_fserver workloads while remaining generally
positive for the other AIM7 workloads.

Signed-off-by: Waiman Long 
Reviewed-by: Davidlohr Bueso 
---
 kernel/sched/core.c |   22 ++
 1 files changed, 22 insertions(+), 0 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7f12624..f667d63 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3021,9 +3021,31 @@ static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
  */
 int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
 {
+   unsigned int nrun;
+
if (!sched_feat(OWNER_SPIN))
return 0;
 
+   /*
+* Mutex spinning should be temporarily disabled if the load on
+* the current CPU is high. The load is considered high if there
+* are 2 or more active tasks waiting to run on this CPU. On the
+* other hand, if there is another task waiting and the global
+* load (calc_load_tasks - including uninterruptible tasks) is
+* bigger than 2X the # of CPUs available, it is also considered
+* to be high load.
+*/
+   nrun = this_rq()->nr_running;
+   if (nrun >= 3)
+   return 0;
+   else if (nrun == 2) {
+   long active = atomic_long_read(&calc_load_tasks);
+   int  ncpu   = num_online_cpus();
+
+   if (active > 2*ncpu)
+   return 0;
+   }
+
rcu_read_lock();
while (owner_running(lock, owner)) {
if (need_resched())
-- 
1.7.1



[PATCH v2 4/4] dcache: don't need to take d_lock in prepend_path()

2013-04-05 Thread Waiman Long
The d_lock was used in prepend_path() to protect dentry->d_name from
being changed under the hood. As the caller of prepend_path() has
to take the rename_lock before calling into it, there is no chance
that d_name will be changed. The d_lock is only needed when the
rename_lock is not held.

Signed-off-by: Waiman Long 
---
 fs/dcache.c |3 +--
 1 files changed, 1 insertions(+), 2 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index 9477d80..e3d6543 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2529,6 +2529,7 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name)
  * @buflen: pointer to buffer length
  *
  * Caller holds the rename_lock.
+ * There is no need to lock the dentry as its name cannot be changed.
  */
 static int prepend_path(const struct path *path,
const struct path *root,
@@ -2555,9 +2556,7 @@ static int prepend_path(const struct path *path,
}
parent = dentry->d_parent;
prefetch(parent);
-   spin_lock(&dentry->d_lock);
error = prepend_name(buffer, buflen, &dentry->d_name);
-   spin_unlock(&dentry->d_lock);
if (!error)
error = prepend(buffer, buflen, "/", 1);
if (error)
-- 
1.7.1



[PATCH v2 RFC 3/4] dcache: change rename_lock to a sequence read/write lock

2013-04-05 Thread Waiman Long
The d_path() and related kernel functions currently take a writer
lock on rename_lock because they need to follow pointers. By changing
rename_lock to be the new sequence read/write lock, a reader lock
can be taken and multiple d_path() threads can proceed concurrently
without blocking each other.

It is unlikely that the frequency of filesystem changes and d_path()
name lookups will be high enough to cause writer starvation, so the
current limitation of the read/write lock should be acceptable.

All the sites where rename_lock is referenced were modified to use the
sequence read/write lock declaration and access functions.

When applying this patch to 3.8 or earlier releases, the unused function
d_path_with_unreachable() in fs/dcache.c should be removed to avoid a
compilation warning.
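
The fs/dcache.c hunks that matter most are cut off in this archive:
d_path() and related functions switch from the exclusive writer side to
the new blocking reader side. Conceptually the change looks like the
sketch below (variable names are placeholders and the calls go through
helpers such as path_with_deleted(); this is a sketch of the intent,
not an excerpt of the patch):

/* Before: the seqlock writer side serializes every path lookup. */
write_seqlock(&rename_lock);
error = prepend_path(path, root, &buf, &buflen);
write_sequnlock(&rename_lock);

/*
 * After: the seqrwlock reader side lets multiple lookups run
 * concurrently while still blocking renames (the writer side).
 */
read_seqrwlock(&rename_lock);
error = prepend_path(path, root, &buf, &buflen);
read_seqrwunlock(&rename_lock);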

Signed-off-by: Waiman Long 
---
 fs/autofs4/waitq.c |6 ++--
 fs/ceph/mds_client.c   |4 +-
 fs/cifs/dir.c  |4 +-
 fs/dcache.c|   83 ---
 fs/nfs/namespace.c |6 ++--
 include/linux/dcache.h |4 +-
 kernel/auditsc.c   |4 +-
 7 files changed, 56 insertions(+), 55 deletions(-)

diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 3db70da..3afc4db 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -197,7 +197,7 @@ rename_retry:
buf = *name;
len = 0;
 
-   seq = read_seqbegin(&rename_lock);
+   seq = read_seqrwbegin(&rename_lock);
rcu_read_lock();
spin_lock(&sbi->fs_lock);
for (tmp = dentry ; tmp != root ; tmp = tmp->d_parent)
@@ -206,7 +206,7 @@ rename_retry:
if (!len || --len > NAME_MAX) {
spin_unlock(&sbi->fs_lock);
rcu_read_unlock();
-   if (read_seqretry(&rename_lock, seq))
+   if (read_seqrwretry(&rename_lock, seq))
goto rename_retry;
return 0;
}
@@ -222,7 +222,7 @@ rename_retry:
}
spin_unlock(&sbi->fs_lock);
rcu_read_unlock();
-   if (read_seqretry(&rename_lock, seq))
+   if (read_seqrwretry(&rename_lock, seq))
goto rename_retry;
 
return len;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 442880d..da565c4 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1486,7 +1486,7 @@ char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
 
 retry:
len = 0;
-   seq = read_seqbegin(&rename_lock);
+   seq = read_seqrwbegin(&rename_lock);
rcu_read_lock();
for (temp = dentry; !IS_ROOT(temp);) {
struct inode *inode = temp->d_inode;
@@ -1536,7 +1536,7 @@ retry:
temp = temp->d_parent;
}
rcu_read_unlock();
-   if (pos != 0 || read_seqretry(&rename_lock, seq)) {
+   if (pos != 0 || read_seqrwretry(&rename_lock, seq)) {
pr_err("build_path did not end path lookup where "
   "expected, namelen is %d, pos is %d\n", len, pos);
/* presumably this is only possible if racing with a
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 1cd0162..707d849 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -96,7 +96,7 @@ build_path_from_dentry(struct dentry *direntry)
dfsplen = 0;
 cifs_bp_rename_retry:
namelen = dfsplen;
-   seq = read_seqbegin(&rename_lock);
+   seq = read_seqrwbegin(&rename_lock);
rcu_read_lock();
for (temp = direntry; !IS_ROOT(temp);) {
namelen += (1 + temp->d_name.len);
@@ -136,7 +136,7 @@ cifs_bp_rename_retry:
}
}
rcu_read_unlock();
-   if (namelen != dfsplen || read_seqretry(&rename_lock, seq)) {
+   if (namelen != dfsplen || read_seqrwretry(&rename_lock, seq)) {
cFYI(1, "did not end path lookup where expected. namelen=%d "
"dfsplen=%d", namelen, dfsplen);
/* presumably this is only possible if racing with a rename
diff --git a/fs/dcache.c b/fs/dcache.c
index 48c0680..9477d80 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -29,6 +29,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -82,7 +83,7 @@ int sysctl_vfs_cache_pressure __read_mostly = 100;
 EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure);
 
 static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lru_lock);
-__cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock);
+__cacheline_aligned_in_smp DEFINE_SEQRWLOCK(rename_lock);
 
 EXPORT_SYMBOL(rename_lock);
 
@@ -1020,7 +1021,7 @@ static struct dentry *try_to_ascend(struct dentry *old, int locked, unsigned seq
 */
if (new != old->d_parent ||
 (old->d_flags & DCACHE_DENTRY_KILLED) ||
-(!locked && read_seqretry(&rename_lock, seq))) {
+(!locked && read_seqrwretry(&rename_lock, seq))) {

[PATCH v2 0/4] dcache: make dcache more scalable on large system

2013-04-05 Thread Waiman Long
Change log:

v1->v2
 - Include performance improvement in the AIM7 benchmark results because
   of this patch.
 - Modify dget_parent() to avoid taking the lock, if possible, to further
   improve AIM7 benchmark results.

During some perf-record sessions of the kernel running the high_systime
workload of the AIM7 benchmark, it was found that quite a large portion
of the spinlock contention was due to the perf_event_mmap_event()
function itself. This perf kernel function calls d_path() which,
in turn, calls path_get() and dput() indirectly. These 3 functions
were the hottest functions shown in the perf-report output for
the _raw_spin_lock() function on an 8-socket system with 80 cores
(hyperthreading off) running a 3.7.10 kernel with a mutex patch applied:

-  11.97%  reaim  [kernel.kallsyms] [k] _raw_spin_lock
   - _raw_spin_lock
  + 46.17% d_path
  + 20.31% path_get
  + 19.75% dput

In fact, the output of the "perf record -s -a" (without call-graph)
showed:

 11.73%  reaim  [kernel.kallsyms] [k] _raw_spin_lock
  8.85% ls  [kernel.kallsyms] [k] _raw_spin_lock
  3.97%   true  [kernel.kallsyms] [k] _raw_spin_lock

Without using the perf monitoring tool, the actual execution profile
will be quite different. In fact, with this patch set applied, the
output of the same "perf record -s -a" command became:

  2.05%  reaim  [kernel.kallsyms] [k] _raw_spin_lock
  0.30% ls  [kernel.kallsyms] [k] _raw_spin_lock
  0.25%   true  [kernel.kallsyms] [k] _raw_spin_lock

So the time spent in the _raw_spin_lock() function went down from 24.55%
to 2.60%. It can be seen that the performance data collected by the
perf-record command can be heavily skewed in some cases on a system
with a large number of CPUs. This set of patches enables the perf
command to give a more accurate and reliable picture of what is really
happening in the system. At the same time, they can also improve the
general performance of systems, especially those with a large number
of CPUs.

The d_path() function takes the following two locks:

1. dentry->d_lock [spinlock] from dget()/dget_parent()/dput()
2. rename_lock[seqlock]  from d_path()

This set of patches were designed to minimize the locking overhead
of these code paths.

The current kernel takes the dentry->d_lock lock whenever it wants to
increment or decrement the d_count reference count. However, nothing
big really happens until the reference count goes all the way to 1
or 0.  Actually, we don't need to take the lock when the reference
count is bigger than 1. Instead, the atomic cmpxchg() function can be
used to increment or decrement the count in these situations. For
safety, other reference count update operations have to be changed
to use atomic instructions as well.
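
Patch 1 implements this with small helpers such as dcount_inc_cmpxchg()
and dcount_dec_cmpxchg(), which are used in the diffs below but whose
definitions are not included in this excerpt. A plausible sketch of the
increment side, reconstructed from this description only and therefore
an approximation rather than the real helper:

/*
 * Sketch: try to bump d_count without taking d_lock.  Returns 1 on
 * success; 0 means the caller must fall back to the locked path
 * (d_count too low for a lock-free update, or the cmpxchg lost a race).
 */
static inline int dcount_inc_cmpxchg(struct dentry *dentry)
{
	unsigned int cnt = dentry->d_count;

	if (cnt > 1 && cmpxchg(&dentry->d_count, cnt, cnt + 1) == cnt)
		return 1;
	return 0;
}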

The rename_lock is a sequence lock. The d_path() function takes the
writer lock because it needs to traverse different dentries through
pointers to get the full path name. Hence it can't tolerate changes
in those pointers. But taking the writer lock also prevents multiple
d_path() calls from proceeding concurrently.

A solution is to introduce a new lock type where there will be a
second type of reader which can block the writers - the sequence
read/write lock (seqrwlock). The d_path() and related functions will
then be changed to take the reader lock instead of the writer lock.
This will allow multiple d_path() operations to proceed concurrently.

Additional performance testing was conducted using the AIM7
benchmark.  It is mainly the first patch that has an impact on the AIM7
benchmark. Please see the description of the first patch for
more information about the benchmark results.

Incidentally, these patches also have a favorable impact on Oracle
database performance when measured by the Oracle SLOB benchmark.

The following tests with multiple threads were also run on kernels
with and without the patch on an 8-socket 80-core system and a PC
with 4-core i5 processor:

1. find $HOME -size 0b
2. cat /proc/*/maps /proc/*/numa_maps
3. git diff

For both the find-size and cat-maps tests, the performance difference
with hot cache was within a few percentage points and hence within
the margin of error. Single-thread performance was slightly worse,
but multithread performance was generally a bit better. Apparently,
reference count updates aren't a significant factor in those tests. Their
perf traces indicate that there was less spinlock contention in
functions like dput(), but the function itself ran a little bit longer
on average.

The git-diff test showed no difference in performance. There is a
slight increase in system time compensated by a slight decrease in
user time.

Of the 4 patches, patch 3 is dependent on patch 2. The other 2 patches
are independent and can be applied individually.

Signed-off-by: Waiman Long 

Waiman Long (4):
  dcache: Don't take unnecessary lock in d_count update
  dcac

[PATCH RFC v2 2/4] dcache: introduce a new sequence read/write lock type

2013-04-05 Thread Waiman Long
The current sequence lock supports 2 types of lock users:

1. A reader who wants a consistent set of information and is willing
   to retry if the information changes. The information that the
   reader needs cannot contain pointers, because any writer could
   invalidate a pointer that a reader was following. This reader
   will never block but they may have to retry if a writer is in
   progress.
2. A writer who may need to modify content of a data structure. Writer
   blocks only if another writer is in progress.

This type of lock is suitable for cases where there are a large number
of readers and far fewer writers. However, it has the limitation that a
reader who wants to follow pointers, or cannot tolerate unexpected
changes in the protected data structure, must take the writer lock
even if it doesn't need to make any changes.

To support this kind of reader more efficiently, a new lock type is
introduced by this patch: the sequence read/write lock. Two types of
readers are supported by this new lock:

1. Reader who has the same behavior as a sequence lock reader.
2. Reader who may need to follow pointers. This reader will block if
   a writer is in progress. In turn, it blocks a writer if it is in
   progress. Multiple readers of this type can proceed concurrently.
   Taking this reader lock won't update the sequence number.

This new lock type is a combination of the sequence lock and read/write
lock. Hence it has the same limitation as a read/write lock: writers
may be starved if there is a lot of read contention.
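
As a concrete (hypothetical) illustration of the second reader type,
not taken from the patches: a pointer-following walk such as summing
name lengths up a dentry chain could hold the reader side so that
concurrent walks do not serialize each other while renames are still
blocked:

/* Hypothetical example: follow ->d_parent pointers safely while
 * blocking writers (renames) but not other readers doing the same walk.
 */
read_seqrwlock(&rename_lock);
for (d = dentry; !IS_ROOT(d); d = d->d_parent)
	len += d->d_name.len + 1;	/* name plus a '/' separator */
read_seqrwunlock(&rename_lock);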

Signed-off-by: Waiman Long 
---
 include/linux/seqrwlock.h |  137 +
 1 files changed, 137 insertions(+), 0 deletions(-)
 create mode 100644 include/linux/seqrwlock.h

diff --git a/include/linux/seqrwlock.h b/include/linux/seqrwlock.h
new file mode 100644
index 000..c6145ff
--- /dev/null
+++ b/include/linux/seqrwlock.h
@@ -0,0 +1,137 @@
+#ifndef __LINUX_SEQRWLOCK_H
+#define __LINUX_SEQRWLOCK_H
+/*
+ * Sequence Read/Write Lock
+ * 
+ * This new lock type is a combination of the sequence lock and read/write
+ * lock. Three types of lock users are supported:
+ * 1. A reader who wants a consistent set of information and is willing to
+ *    retry if the information changes. The information that the reader
+ *    needs cannot contain pointers, because any writer could invalidate
+ *    a pointer that a reader was following. This reader never blocks but
+ *    may have to retry if a writer is in progress.
+ * 2. A reader who may need to follow pointers. This reader will block if
+ *    a writer is in progress.
+ * 3. A writer who may need to modify the content of a data structure. A
+ *    writer blocks if another writer or the 2nd type of reader is in progress.
+ *
+ * The current implementation is layered on top of the regular read/write
+ * lock. There is a chance that the writers may be starved by the readers.
+ * So be careful when you decide to use this lock.
+ *
+ * Expected 1st type reader usage:
+ * do {
+ * seq = read_seqrwbegin(&foo);
+ * ...
+ *  } while (read_seqrwretry(&foo, seq));
+ *
+ * Expected 2nd type reader usage:
+ * read_seqrwlock(&foo)
+ * ...
+ * read_seqrwunlock(&foo)
+ *
+ * Expected writer usage:
+ * write_seqrwlock(&foo)
+ * ...
+ * write_seqrwunlock(&foo)
+ *
+ * Based on the seqlock.h file
+ * by Waiman Long
+ */
+
+#include 
+#include 
+#include 
+
+typedef struct {
+   unsigned sequence;
+   rwlock_t lock;
+} seqrwlock_t;
+
+#define __SEQRWLOCK_UNLOCKED(lockname) \
+{ 0, __RW_LOCK_UNLOCKED(lockname) }
+
+#define seqrwlock_init(x)  \
+   do {\
+   (x)->sequence = 0;  \
+   rwlock_init(&(x)->lock);\
+   } while (0)
+
+#define DEFINE_SEQRWLOCK(x) \
+   seqrwlock_t x = __SEQRWLOCK_UNLOCKED(x)
+
+/* For writer:
+ * Lock out other writers and 2nd type of readers and update the sequence
+ * number. Don't need preempt_disable() because that is in the read_lock and
+ * write_lock already.
+ */
+static inline void write_seqrwlock(seqrwlock_t *sl)
+{
+   write_lock(&sl->lock);
+   ++sl->sequence;
+   smp_wmb();
+}
+
+static inline void write_seqrwunlock(seqrwlock_t *sl)
+{
+   smp_wmb();
+   sl->sequence++;
+   write_unlock(&sl->lock);
+}
+
+static inline int write_tryseqrwlock(seqrwlock_t *sl)
+{
+   int ret = write_trylock(&sl->lock);
+
+   if (ret) {
+   ++sl->sequence;
+   smp_wmb();
+   }
+   return ret;
+}
+
+/* For 2nd type of reader:
+ * Lock out other writers, but don't update the sequence number
+ */
+static inline void read_seqrwlock(seqrwlock_t *sl)
+{
+   read_lock(&sl->lock);
+}
+
+static inline void read_seqrwunlock(se

[PATCH v2 1/4] dcache: Don't take unnecessary lock in d_count update

2013-04-05 Thread Waiman Long
 Almost all of which
can be attributed to the following 2 kernel functions:
 1. dget_parent (50.14%)
 2. dput (49.48%)

With this patch applied, the time spent on _raw_spin_lock() is only
1.31%, which is a huge improvement.

The impact of this patch on other AIM7 workloads was much more
modest.  The table below shows the mean % change due to this patch on
the same 8-socket system with a 3.7.10 kernel.

+--------------+---------------+----------------+-----------------+
|   Workload   | mean % change | mean % change  | mean % change   |
|              | 10-100 users  | 200-1000 users | 1100-2000 users |
+--------------+---------------+----------------+-----------------+
| alltests     |     +0.6%     |      +9.7%     |      +2.9%      |
| five_sec     |     +0.4%     |       0.0%     |      +0.3%      |
| fserver      |     +0.7%     |      +2.4%     |      +2.0%      |
| high_systime |     -1.5%     |     -15.2%     |     -38.0%      |
| new_fserver  |     -2.2%     |      +6.5%     |      +0.4%      |
| shared       |     +3.9%     |      +1.1%     |      +6.1%      |
+--------------+---------------+----------------+-----------------+

The regression in the high_systime workload was probably caused by
the decrease in spinlock contention leading to a larger increase in
mutex contention. In fact, after applying a mutex patch to reduce
mutex contention, the performance difference due to the addition of
this dcache patch changed to:

+--------------+---------------+----------------+-----------------+
|   Workload   | mean % change | mean % change  | mean % change   |
|              | 10-100 users  | 200-1000 users | 1100-2000 users |
+--------------+---------------+----------------+-----------------+
| high_systime |     -0.1%     |      -0.2%     |      +1.2%      |
+--------------+---------------+----------------+-----------------+

Signed-off-by: Waiman Long 
---
 fs/dcache.c|   39 --
 fs/namei.c |2 +-
 include/linux/dcache.h |  101 ++-
 3 files changed, 117 insertions(+), 25 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index fbfae00..48c0680 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -484,7 +484,7 @@ relock:
}
 
if (ref)
-   dentry->d_count--;
+   dcount_dec(dentry);
/*
 * if dentry was on the d_lru list delete it from there.
 * inform the fs via d_prune that this dentry is about to be
@@ -530,10 +530,13 @@ void dput(struct dentry *dentry)
 repeat:
if (dentry->d_count == 1)
might_sleep();
+   if (dcount_dec_cmpxchg(dentry))
+   return;
+
spin_lock(&dentry->d_lock);
BUG_ON(!dentry->d_count);
if (dentry->d_count > 1) {
-   dentry->d_count--;
+   dcount_dec(dentry);
spin_unlock(&dentry->d_lock);
return;
}
@@ -550,7 +553,7 @@ repeat:
dentry->d_flags |= DCACHE_REFERENCED;
dentry_lru_add(dentry);
 
-   dentry->d_count--;
+   dcount_dec(dentry);
spin_unlock(&dentry->d_lock);
return;
 
@@ -621,11 +624,13 @@ EXPORT_SYMBOL(d_invalidate);
 /* This must be called with d_lock held */
 static inline void __dget_dlock(struct dentry *dentry)
 {
-   dentry->d_count++;
+   dcount_inc(dentry);
 }
 
 static inline void __dget(struct dentry *dentry)
 {
+   if (dcount_inc_cmpxchg(dentry))
+   return;
spin_lock(&dentry->d_lock);
__dget_dlock(dentry);
spin_unlock(&dentry->d_lock);
@@ -635,22 +640,14 @@ struct dentry *dget_parent(struct dentry *dentry)
 {
struct dentry *ret;
 
-repeat:
-   /*
-* Don't need rcu_dereference because we re-check it was correct under
-* the lock.
-*/
rcu_read_lock();
-   ret = dentry->d_parent;
-   spin_lock(&ret->d_lock);
-   if (unlikely(ret != dentry->d_parent)) {
-   spin_unlock(&ret->d_lock);
-   rcu_read_unlock();
-   goto repeat;
-   }
+   ret = rcu_dereference(dentry->d_parent);
rcu_read_unlock();
+   if (dcount_inc_cmpxchg(ret))
+   return ret;
+   spin_lock(&ret->d_lock);
BUG_ON(!ret->d_count);
-   ret->d_count++;
+   dcount_inc(ret);
spin_unlock(&ret->d_lock);
return ret;
 }
@@ -780,7 +777,7 @@ static void try_prune_one_dentry(struct dentry *dentry)
while (dentry) {
spin_lock(&dentry->d_lock);
if (dentry->d_count > 1) {
-   dentry->d_count--;
+   dcount_dec(dentry);
spin_unlock(&dentry->d_lock);
return;
}
@@ -1981,7 +1978,7 @@ struct dentry *__d_lookup(const struct dentry *parent, 

Re: [PATCH v2 1/4] dcache: Don't take unnecessary lock in d_count update

2013-04-05 Thread Waiman Long

On 04/05/2013 01:12 PM, Al Viro wrote:

@@ -635,22 +640,14 @@ struct dentry *dget_parent(struct dentry *dentry)
  {
struct dentry *ret;

-repeat:
-   /*
-* Don't need rcu_dereference because we re-check it was correct under
-* the lock.
-*/
rcu_read_lock();
-   ret = dentry->d_parent;
-   spin_lock(&ret->d_lock);
-   if (unlikely(ret != dentry->d_parent)) {
-   spin_unlock(&ret->d_lock);
-   rcu_read_unlock();
-   goto repeat;
-   }
+   ret = rcu_dereference(dentry->d_parent);
rcu_read_unlock();
+   if (dcount_inc_cmpxchg(ret))
+   return ret;
+   spin_lock(&ret->d_lock);

And WTF is going to protect your "ret" from being freed just as you'd done
rcu_read_unlock()?


I think I made a mistake here. I should move the rcu_read_unlock()
down to just before the return statement as well as after the
spin_lock(). Thanks for pointing this out. I will fix that in the next
version. Is there anything else that needs to be fixed?


Regards,
Longman


Re: [PATCH 0/4] dcache: make Oracle more scalable on large systems

2013-02-28 Thread Waiman Long

On 02/22/2013 07:13 PM, Andi Kleen wrote:

That seems to me like an application problem - poking at what the
kernel is doing via diagnostic interfaces so often that it gets in
the way of the kernel actually doing stuff is not a problem the
kernel can solve.

I agree with you that the application shouldn't be doing that, but
if there is a cheap way to lower the d_path overhead that is also
attractive.  There will be always applications doing broken things.
Any scaling problem less in the kernel is good.

But the real fix in this case is to fix the application.

-Andi

Further investigation into the d_path() bottleneck revealed some
interesting facts about Oracle. First of all, the invocation of the
d_path() kernel function is concentrated in only a few processes
rather than distributed across many of them.  On a 1-minute test run,
the following three standard long-running Oracle processes called
into d_path():

1. MMNL - Memory monitor light (gathers and stores AWR statistics) [272]
2. CKPT - Checkpoint process [17]
3. DBRM - DB resource manager (new in 11g) [16]

The numbers within [] are the number of times d_path() was
called, which is not much for a 1-minute interval. Beyond those
standard processes, Oracle also seems to spawn transient processes
(lasting a few seconds) periodically to issue a bunch of d_path() calls
(about 1000) within a short time before they die. I am not sure what
the purpose of those processes is.  In a one-minute interval, 2-7
of those transient processes may be spawned, depending probably on the
activity level. Most of the d_path() calls last for about 1ms. There
are a couple that last for more than 10ms.

Other system daemons that call into d_path() include irqbalance and
automount. irqbalance issues about 2000 d_path() calls in a minute in
a bursty fashion. The contribution of automount is only about 50 in
the same time period, which is not really significant. Regular commands
like cp and ps may also issue a couple of d_path() calls per invocation.

As I was using "perf record --call-graph" command to profile the Oracle
application, I found out that another major user of the d_path()
function happens to be perf_event_mmap_event() of the perf-event
subsystem. It took about 10% of the total d_path() calls. So the
metrics that I collected were skewed a bit because of that.

I am thinking that the impact of my patch on Oracle write performance
is probably due to its impact on the open() system call, which has
to update the reference counts on dentries. In the collected perf
traces, a certain portion of the spinlock time was consumed by
dput() and path_get().  The 2 major consumers of those calls are
d_path() and the open() system call. On a test run with no writers,
I saw significantly fewer open() calls in the perf trace and hence much
less impact on Oracle performance.

I do agree that Oracle should probably fix the application to issue
fewer calls to the d_path() function. However, I would argue that my
patch will still be useful for the following reasons:

1. Changing how the reference counting works (patch 1 of 4) will certainly
   help in situations where processes are issuing intensive batches of file
   system operations, as is the case here.
2. Changing the rename_lock to use a sequence r/w lock (patches 2-4 of 4)
   will help to minimize the overhead of the perf-event subsystem when it
   is activated with the call-graph feature, which is pretty common.

Regards,
Longman



Re: [PATCH 0/4] dcache: make Oracle more scalable on large systems

2013-02-28 Thread Waiman Long

On 02/28/2013 03:39 PM, Waiman Long wrote:


activity level. Most of the d_path() call last for about 1ms. There
are a couple of those that last for more than 10ms.



A correction. The time unit here should be us, not ms. Sorry for the 
mistake.


-Longman


[PATCH 0/4] dcache: make Oracle more scalable on large systems

2013-02-19 Thread Waiman Long
It was found that the Oracle database software issues a lot of calls
to the seq_path() kernel function, which translates a (dentry, mnt)
pair to an absolute path. The seq_path() function will eventually
take the following two locks:

1. dentry->d_lock (spinlock) from dget()/dput()
2. rename_lock(seqlock)  from d_path()

With a lot of database activity, the spinning on these 2 locks takes
a major portion of the kernel time and slows down the database software.

This set of patches were designed to minimize the locking overhead of
this code path and improve Oracle performance on systems with a large
number of CPUs.

The current kernel takes the dentry->d_lock lock whenever it wants to
increment or decrement the d_count reference count. However, nothing
big really happens until the reference count goes all the way to 1
or 0.  Actually, we don't need to take the lock when the reference
count is bigger than 1. Instead, the atomic cmpxchg() function can be
used to increment or decrement the count in these situations. For
safety, other reference count update operations have to be changed
to use atomic instructions as well.

The rename_lock is a sequence lock. The d_path() function takes the
writer lock because it needs to traverse different dentries through
pointers to get the full path name. Hence it can't tolerate changes
in those pointers. But taking the writer lock also prevents multiple
d_path() calls from proceeding concurrently.

A solution is to introduce a new lock type where there will be a
second type of reader which can block the writers - the sequence
read/write lock (seqrwlock). The d_path() and related functions will
then be changed to take the reader lock instead of the writer lock.
This will allow multiple d_path() operations to proceed concurrently.

Performance testing was done using the Oracle SLOB benchmark with the
latest 11.2.0.3 release of Oracle on a 3.8-rc3 kernel. Database files
were put in a tmpfs partition to minimize physical I/O overhead. Huge
pages were used with 30GB of SGA. The test machine was an 8-socket,
80-core HP Proliant DL980 with 1TB of memory and hyperthreading off.
The tests were run 5 times and the averages were taken.

The patch has only a slight positive impact on logical read
performance. The impact on write (redo size) performance, however,
is much greater. The redo size is a proxy for how much database writing
has happened, so a larger value means a higher transaction rate.

+---------+---------+-------------+------------+----------+
| Readers | Writers | Redo Size   | Redo Size  | % Change |
|         |         | w/o patch   | with patch |          |
|         |         |   (MB/s)    |   (MB/s)   |          |
+---------+---------+-------------+------------+----------+
|    8    |   64    |     802     |    903     |  12.6%   |
|   32    |   64    |     798     |    892     |  11.8%   |
|   80    |   64    |     658     |    714     |   8.5%   |
|  128    |   64    |     748     |    907     |  21.3%   |
+---------+---------+-------------+------------+----------+

The table below shows the %system and %user times reported by Oracle's
AWR tool as well as the %time spent in the spinlocking code in kernel
with (inside parenthesis) and without (outside parenthesis) the patch.

+---------+---------+------------+------------+------------+
| Readers | Writers |  % System  |   % User   | % spinlock |
+---------+---------+------------+------------+------------+
|   32    |    0    |  0.3(0.3)  | 39.0(39.0) |  6.3(17.4) |
|   80    |    0    |  0.7(0.7)  | 97.4(94.2) |  2.9(31.7) |
|  128    |    0    |  1.4(1.4)  | 34.4(32.2) | 43.5(62.2) |
|   32    |   64    |  3.8(3.5)  | 55.4(53.6) |  9.1(35.0) |
|   80    |   64    |  3.0(2.9)  | 94.4(93.9) |  4.5(38.8) |
|  128    |   64    |  4.7(4.3)  | 38.2(40.3) | 34.8(58.7) |
+---------+---------+------------+------------+------------+

The following tests with multiple threads were also run on kernels with
and without the patch on both DL980 and a PC with 4-core i5 processor:

1. find $HOME -size 0b
2. cat /proc/*/maps /proc/*/numa_maps
3. git diff

For both the find-size and cat-maps tests, the performance difference
with hot cache was within a few percentage points and hence within
the margin of error. Single-thread performance was slightly worse,
but multithread performance was generally a bit better. Apparently,
reference count updates aren't a significant factor in those tests. Their
perf traces indicate that there was less spinlock contention in
functions like dput(), but the function itself ran a little bit longer
on average.

The git-diff test showed no difference in performance. There is a
slight increase in system time compensated by a slight decrease in
user time.

Signed-off-by: Waiman Long 

Waiman Long (4):
  dcache: Don't take unnecessary lock in d_count update
  dcache: introduce a new sequence read/write lock type
  dcache: change rename_lock to a sequence read/write lock
  dcache: don't

[PATCH 1/4] dcache: Don't take unnecessary lock in d_count update

2013-02-19 Thread Waiman Long
The current code takes the dentry's d_lock lock whenever the d_count
reference count is being updated. In reality, nothing big really
happens until d_count goes to 0 in dput(). So it is not necessary to
take the lock if the reference count won't go to 0.

Without using a lock, multiple threads may update d_count
simultaneously.  Therefore, atomic instructions must be used to
ensure consistency except in shrink_dcache_for_umount*() where the
whole superblock is being dismounted and locking is not needed.

The worst case scenarios are:

1. d_lock taken in dput with d_count = 2 in one thread and another
   thread comes in to atomically decrement d_count without taking
   the lock. This may result in a d_count of 0 with no deleting
   action taken.

2. d_lock taken in dput with d_count = 1 in one thread and another
   thread comes in to atomically increment d_count without taking
   the lock. This may result in the dentry in the deleted state while
   having a d_count of 1.

Without taking the lock, we need to make sure that the decrement or
increment is not performed while other threads are updating d_count
simultaneously. This can be done by using the atomic cmpxchg
instruction, which will fail if the underlying value has changed.  If
the lock is taken, it is safe to use a simpler atomic increment or
decrement instruction.

To make sure that the above worst-case scenarios will not happen,
the dget() function must take the lock if d_count <= 1. Similarly,
the dput() function must take the lock if d_count <= 2. The cmpxchg()
call to update d_count will be tried twice before falling back to
using the lock, as there is a fairly good chance that the cmpxchg()
may fail in a busy situation.

Finally, the CPU must have an instruction-level cmpxchg instruction,
or the emulated cmpxchg() function may be too expensive to
use. Therefore, the above mentioned changes will only be applied if
the __HAVE_ARCH_CMPXCHG flag is set. Most of the major architectures
supported by Linux have this flag set, with the notable exception
of ARM.
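
Putting the two paragraphs above together, the decrement-side helper
used by dput() in the diff below could plausibly look like the sketch
here. This is reconstructed from the description (thresholds, two
cmpxchg attempts, __HAVE_ARCH_CMPXCHG guard); the actual
include/linux/dcache.h hunk is not shown in this excerpt:

#ifdef __HAVE_ARCH_CMPXCHG
/*
 * Sketch: lock-free decrement of d_count.  Returns 1 on success, 0 if
 * the caller must fall back to taking d_lock (d_count <= 2, or the
 * cmpxchg lost the race twice).
 */
static inline int dcount_dec_cmpxchg(struct dentry *dentry)
{
	unsigned int cnt;
	int i;

	for (i = 0; i < 2; i++) {
		cnt = dentry->d_count;
		if (cnt <= 2)
			break;		/* too close to 1/0, take the lock */
		if (cmpxchg(&dentry->d_count, cnt, cnt - 1) == cnt)
			return 1;
	}
	return 0;
}
#else
static inline int dcount_dec_cmpxchg(struct dentry *dentry)
{
	return 0;			/* always use the locked path */
}
#endif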

As for the performance of the updated reference counting code, it
all depends on whether the cmpxchg instruction is used or not. The
original code has 2 atomic instructions to lock and unlock the
spinlock. The new code path has either 1 atomic cmpxchg instruction
or 3 atomic instructions if the lock has to be taken. Depending on
how frequent the cmpxchg instruction is used (d_count > 1 or 2),
the new code can be faster or slower than the original one.

Signed-off-by: Waiman Long 
---
 fs/dcache.c|   23 ++
 fs/namei.c |2 +-
 include/linux/dcache.h |  105 ++-
 3 files changed, 117 insertions(+), 13 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index 19153a0..20cc789 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -484,7 +484,7 @@ relock:
}
 
if (ref)
-   dentry->d_count--;
+   dcount_dec(dentry);
/*
 * if dentry was on the d_lru list delete it from there.
 * inform the fs via d_prune that this dentry is about to be
@@ -530,10 +530,13 @@ void dput(struct dentry *dentry)
 repeat:
if (dentry->d_count == 1)
might_sleep();
+   if (dcount_dec_cmpxchg(dentry))
+   return;
+
spin_lock(&dentry->d_lock);
BUG_ON(!dentry->d_count);
if (dentry->d_count > 1) {
-   dentry->d_count--;
+   dcount_dec(dentry);
spin_unlock(&dentry->d_lock);
return;
}
@@ -550,7 +553,7 @@ repeat:
dentry->d_flags |= DCACHE_REFERENCED;
dentry_lru_add(dentry);
 
-   dentry->d_count--;
+   dcount_dec(dentry);
spin_unlock(&dentry->d_lock);
return;
 
@@ -621,11 +624,13 @@ EXPORT_SYMBOL(d_invalidate);
 /* This must be called with d_lock held */
 static inline void __dget_dlock(struct dentry *dentry)
 {
-   dentry->d_count++;
+   dcount_inc(dentry);
 }
 
 static inline void __dget(struct dentry *dentry)
 {
+   if (dcount_inc_cmpxchg(dentry))
+   return;
spin_lock(&dentry->d_lock);
__dget_dlock(dentry);
spin_unlock(&dentry->d_lock);
@@ -650,7 +655,7 @@ repeat:
}
rcu_read_unlock();
BUG_ON(!ret->d_count);
-   ret->d_count++;
+   dcount_inc(ret);
spin_unlock(&ret->d_lock);
return ret;
 }
@@ -782,7 +787,7 @@ static void try_prune_one_dentry(struct dentry *dentry)
while (dentry) {
spin_lock(&dentry->d_lock);
if (dentry->d_count > 1) {
-   dentry->d_count--;
+   dcount_dec(dentry);
spin_unlock(&dentry->d_lock);
return;
}
@@ -1980,7 +198

[PATCH 4/4] dcache: don't need to take d_lock in prepend_path()

2013-02-19 Thread Waiman Long
The d_lock was used in prepend_path() to protect dentry->d_name from
being changed under the hood. As the caller of prepend_path() has
to take the rename_lock before calling into it, there is no chance
that d_name will be changed. The d_lock is only needed when the
rename_lock is not held.

Signed-off-by: Waiman Long 
---
 fs/dcache.c |3 +--
 1 files changed, 1 insertions(+), 2 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index b1487e2..0e911fc 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2547,6 +2547,7 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name)
  * @buflen: pointer to buffer length
  *
  * Caller holds the rename_lock.
+ * There is no need to lock the dentry as its name cannot be changed.
  */
 static int prepend_path(const struct path *path,
const struct path *root,
@@ -2573,9 +2574,7 @@ static int prepend_path(const struct path *path,
}
parent = dentry->d_parent;
prefetch(parent);
-   spin_lock(&dentry->d_lock);
error = prepend_name(buffer, buflen, &dentry->d_name);
-   spin_unlock(&dentry->d_lock);
if (!error)
error = prepend(buffer, buflen, "/", 1);
if (error)
-- 
1.7.1



[PATCH 3/4] dcache: change rename_lock to a sequence read/write lock

2013-02-19 Thread Waiman Long
The d_path() and related kernel functions currently take a writer
lock on rename_lock because they need to follow pointers. By changing
rename_lock to be the new sequence read/write lock, a reader lock
can be taken and multiple d_path() threads can proceed concurrently
without blocking each other.

It is unlikely that the frequency of filesystem changes and d_path()
name lookups will be high enough to cause writer starvation, so the
current limitation of the read/write lock should be acceptable.

All the sites where rename_lock is referenced were modified to use the
sequence read/write lock declaration and access functions.

Signed-off-by: Waiman Long 
---
 fs/autofs4/waitq.c |6 ++--
 fs/ceph/mds_client.c   |4 +-
 fs/cifs/dir.c  |4 +-
 fs/dcache.c|   87 ---
 fs/nfs/namespace.c |6 ++--
 include/linux/dcache.h |4 +-
 kernel/auditsc.c   |5 ++-
 7 files changed, 59 insertions(+), 57 deletions(-)

diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 03bc1d3..95eee02 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -199,7 +199,7 @@ rename_retry:
buf = *name;
len = 0;
 
-   seq = read_seqbegin(&rename_lock);
+   seq = read_seqrwbegin(&rename_lock);
rcu_read_lock();
spin_lock(&sbi->fs_lock);
for (tmp = dentry ; tmp != root ; tmp = tmp->d_parent)
@@ -208,7 +208,7 @@ rename_retry:
if (!len || --len > NAME_MAX) {
spin_unlock(&sbi->fs_lock);
rcu_read_unlock();
-   if (read_seqretry(&rename_lock, seq))
+   if (read_seqrwretry(&rename_lock, seq))
goto rename_retry;
return 0;
}
@@ -224,7 +224,7 @@ rename_retry:
}
spin_unlock(&sbi->fs_lock);
rcu_read_unlock();
-   if (read_seqretry(&rename_lock, seq))
+   if (read_seqrwretry(&rename_lock, seq))
goto rename_retry;
 
return len;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 9165eb8..da6bd2c 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1458,7 +1458,7 @@ char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
 
 retry:
len = 0;
-   seq = read_seqbegin(&rename_lock);
+   seq = read_seqrwbegin(&rename_lock);
rcu_read_lock();
for (temp = dentry; !IS_ROOT(temp);) {
struct inode *inode = temp->d_inode;
@@ -1508,7 +1508,7 @@ retry:
temp = temp->d_parent;
}
rcu_read_unlock();
-   if (pos != 0 || read_seqretry(&rename_lock, seq)) {
+   if (pos != 0 || read_seqrwretry(&rename_lock, seq)) {
pr_err("build_path did not end path lookup where "
   "expected, namelen is %d, pos is %d\n", len, pos);
/* presumably this is only possible if racing with a
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 8719bbe..4842523 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -96,7 +96,7 @@ build_path_from_dentry(struct dentry *direntry)
dfsplen = 0;
 cifs_bp_rename_retry:
namelen = dfsplen;
-   seq = read_seqbegin(&rename_lock);
+   seq = read_seqrwbegin(&rename_lock);
rcu_read_lock();
for (temp = direntry; !IS_ROOT(temp);) {
namelen += (1 + temp->d_name.len);
@@ -136,7 +136,7 @@ cifs_bp_rename_retry:
}
}
rcu_read_unlock();
-   if (namelen != dfsplen || read_seqretry(&rename_lock, seq)) {
+   if (namelen != dfsplen || read_seqrwretry(&rename_lock, seq)) {
cFYI(1, "did not end path lookup where expected. namelen=%d "
"dfsplen=%d", namelen, dfsplen);
/* presumably this is only possible if racing with a rename
diff --git a/fs/dcache.c b/fs/dcache.c
index 20cc789..b1487e2 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -29,6 +29,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -82,7 +83,7 @@ int sysctl_vfs_cache_pressure __read_mostly = 100;
 EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure);
 
 static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lru_lock);
-__cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock);
+__cacheline_aligned_in_smp DEFINE_SEQRWLOCK(rename_lock);
 
 EXPORT_SYMBOL(rename_lock);
 
@@ -1030,7 +1031,7 @@ static struct dentry *try_to_ascend(struct dentry *old, int locked, unsigned seq
 */
if (new != old->d_parent ||
 (old->d_flags & DCACHE_DENTRY_KILLED) ||
-(!locked && read_seqretry(&rename_lock, seq))) {
+(!locked && read_seqrwretry(&rename_lock, seq))) {
spin_unlock(&new->d_lock);
new = NULL;
   

[PATCH 2/4] dcache: introduce a new sequence read/write lock type

2013-02-19 Thread Waiman Long
The current sequence lock supports 2 types of lock users:

1. A reader who wants a consistent set of information and is willing
   to retry if the information changes. The information that the
   reader needs cannot contain pointers, because any writer could
   invalidate a pointer that a reader was following. This reader
   will never block but they may have to retry if a writer is in
   progress.
2. A writer who may need to modify content of a data structure. Writer
   blocks only if another writer is in progress.

This type of lock is suitable for cases where there are a large number
of readers and far fewer writers. However, it has the limitation that a
reader who wants to follow pointers, or cannot tolerate unexpected
changes in the protected data structure, must take the writer lock
even if it doesn't need to make any changes.

To support this kind of reader more efficiently, a new lock type is
introduced by this patch: the sequence read/write lock. Two types of
readers are supported by this new lock:

1. Reader who has the same behavior as a sequence lock reader.
2. Reader who may need to follow pointers. This reader will block if
   a writer is in progress. In turn, it blocks a writer if it is in
   progress. Multiple readers of this type can proceed concurrently.
   Taking this reader lock won't update the sequence number.

This new lock type is a combination of the sequence lock and read/write
lock. Hence it has the same limitation as a read/write lock: writers
may be starved if there is a lot of read contention.

Signed-off-by: Waiman Long 
---
 include/linux/seqrwlock.h |  138 +
 1 files changed, 138 insertions(+), 0 deletions(-)
 create mode 100644 include/linux/seqrwlock.h

diff --git a/include/linux/seqrwlock.h b/include/linux/seqrwlock.h
new file mode 100644
index 000..3ff5119
--- /dev/null
+++ b/include/linux/seqrwlock.h
@@ -0,0 +1,138 @@
+#ifndef __LINUX_SEQRWLOCK_H
+#define __LINUX_SEQRWLOCK_H
+/*
+ * Sequence Read/Write Lock
+ * 
+ * This new lock type is a combination of the sequence lock and read/write
+ * lock. Three types of lock users are supported:
+ * 1. A reader who wants a consistent set of information and is willing to
+ *    retry if the information changes. The information that the reader
+ *    needs cannot contain pointers, because any writer could invalidate
+ *    a pointer that a reader was following. This reader never blocks but
+ *    may have to retry if a writer is in progress.
+ * 2. A reader who may need to follow pointers. This reader will block if
+ *    a writer is in progress.
+ * 3. A writer who may need to modify the content of a data structure. A
+ *    writer blocks if another writer or the 2nd type of reader is in progress.
+ *
+ * The current implementation is layered on top of the regular read/write
+ * lock. There is a chance that the writers may be starved by the readers.
+ * So be careful when you decide to use this lock.
+ *
+ * Expected 1st type reader usage:
+ * do {
+ * seq = read_seqrwbegin(&foo);
+ * ...
+ * } while (read_seqrwretry(&foo, seq));
+ *
+ * Expected 2nd type reader usage:
+ * read_seqrwlock(&foo)
+ * ...
+ * read_seqrwunlock(&foo)
+ *
+ * Expected writer usage:
+ * write_seqrwlock(&foo)
+ * ...
+ * write_seqrwunlock(&foo)
+ *
+ * Based on the seqlock.h file
+ * by Waiman Long
+ */
+
+#include 
+#include 
+#include 
+
+typedef struct {
+   unsigned sequence;
+   rwlock_t lock;
+} seqrwlock_t;
+
+#define __SEQRWLOCK_UNLOCKED(lockname) \
+{ 0, __RW_LOCK_UNLOCKED(lockname) }
+
+#define seqrwlock_init(x)  \
+   do {\
+   (x)->sequence = 0;  \
+   rwlock_init(&(x)->lock);\
+   } while (0)
+
+#define DEFINE_SEQRWLOCK(x) \
+   seqrwlock_t x = __SEQRWLOCK_UNLOCKED(x)
+
+/* For writer:
+ * Lock out other writers and 2nd type of readers and update the sequence
+ * number. Don't need preempt_disable() because that is in the read_lock and
+ * write_lock already.
+ */
+static inline void write_seqrwlock(seqrwlock_t *sl)
+{
+   write_lock(&sl->lock);
+   ++sl->sequence;
+   smp_wmb();
+}
+
+static inline void write_seqrwunlock(seqrwlock_t *sl)
+{
+   smp_wmb();
+   sl->sequence++;
+   write_unlock(&sl->lock);
+}
+
+static inline int write_tryseqrwlock(seqrwlock_t *sl)
+{
+   int ret = write_trylock(&sl->lock);
+
+   if (ret) {
+   ++sl->sequence;
+   smp_wmb();
+   }
+   return ret;
+}
+
+/* For 2nd type of reader:
+ * Lock out writers, but don't update the sequence number
+ */
+static inline void read_seqrwlock(seqrwlock_t *sl)
+{
+   read_lock(&sl->lock);
+}
+
+static inline void read_seqrwunlock(se

Re: [PATCH 0/4] dcache: make Oracle more scalable on large systems

2013-02-21 Thread Waiman Long

On 02/21/2013 07:13 PM, Andi Kleen wrote:

Dave Chinner  writes:


On Tue, Feb 19, 2013 at 01:50:55PM -0500, Waiman Long wrote:

It was found that the Oracle database software issues a lot of call
to the seq_path() kernel function which translates a (dentry, mnt)
pair to an absolute path. The seq_path() function will eventually
take the following two locks:

Nobody should be doing reverse dentry-to-name lookups in a quantity
sufficient for it to become a performance limiting factor. What is
the Oracle DB actually using this path for?

Yes calling d_path frequently is usually a bug elsewhere.
Is that through /proc ?

-Andi


A sample strace of Oracle indicates that it opens a lot of /proc
filesystem files, such as stat, maps, etc., many times while running.
Oracle has a very detailed system performance reporting infrastructure
in place to report almost all aspects of system performance through its
AWR reporting tool or the browser-based enterprise manager. Maybe that is
the reason why it is hitting this performance bottleneck.


Regards,
Longman


Re: [PATCH RFC v2 1/2] qspinlock: Introducing a 4-byte queue spinlock implementation

2013-08-26 Thread Waiman Long

On 08/22/2013 09:28 AM, Alexander Fyodorov wrote:

22.08.2013, 05:04, "Waiman Long":

On 08/21/2013 11:51 AM, Alexander Fyodorov wrote:
In this case, we should have smp_wmb() before freeing the lock. The
question is whether we need to do a full mb() instead. The x86 ticket
spinlock unlock code is just a regular add instruction except for some
exotic processors. So it is a compiler barrier but not really a memory
fence. However, we may need to do a full memory fence for some other
processors.

The thing is that the x86 ticket spinlock code does have full memory barriers both in the lock() and
unlock() code: the "add" instruction there has a "lock" prefix which implies a full
memory barrier. So it is better to use smp_mb() and let each architecture define it.


I also thought that the x86 spinlock unlock path was an atomic add. It
only came to my realization recently that this is not the case. The
UNLOCK_LOCK_PREFIX will be mapped to "" except for some old 32-bit x86
processors.



At this point, I am inclined to have either a smp_wmb() or smp_mb() at
the beginning of the unlock function and a barrier() at the end.

As the lock/unlock functions can be inlined, it is possible that a
memory variable can be accessed earlier in the calling function and the
stale copy may be used in the inlined lock/unlock function instead of
fetching a new copy. That is why I prefer a more liberal use of
ACCESS_ONCE() for safety purpose.

That is impossible: both lock() and unlock() must have either a full memory
barrier or an atomic operation which returns a value. Both of them prohibit
optimizations and the compiler cannot reuse any global variable. So this usage of
ACCESS_ONCE() is unneeded.

You can read more on this in Documentation/volatile-considered-harmful.txt

And although I already suggested that, have you read 
Documentation/memory-barriers.txt? There is a lot of valuable information there.


I did read Documentation/memory-barriers.txt. I will read 
volatile-considered-harmful.txt.


Regards,
Longman


Re: [PATCH v7 1/4] spinlock: A new lockref structure for lockless update of refcount

2013-08-29 Thread Waiman Long

On 08/28/2013 09:40 PM, Linus Torvalds wrote:

Just FYI: I've merged two preparatory patches in my tree for the whole
lockref thing. Instead of applying your four patches as-is during the
merge window, I ended up writing two patches that introduce the
concept and use it in the dentry code *without* introducing any of the
new semantics yet.

Waiman, I attributed the patches to you, even if they don't actually
look much like any of the patches you sent out. And because I was
trying very hard to make sure that no actual semantics changed, my
version doesn't have the dget_parent() lockless update code, for
example. I literally just did a search-and-replace of "->d_count" with
"->d_lockref.count" and then I fixed up a few things by hand (undid
one replacement in a comment, and used the helper functions where they
were semantically identical).

  You don't have to rewrite your patches if you don't want to, I'm
planning on cherry-picking the actual code changes during the merge
window.

   Linus


Thanks for merging the 2 preparatory patches for me. I will rebase my 
patches with the latest linux git tree. A new v8 patch set will be sent 
out sometime next week. I am looking forward to the v3.12 merge window 
which I think is coming soon.


Cheers,
Longman


Re: [PATCH RFC v2 1/2] qspinlock: Introducing a 4-byte queue spinlock implementation

2013-08-29 Thread Waiman Long

On 08/27/2013 08:09 AM, Alexander Fyodorov wrote:

I also thought that the x86 spinlock unlock path was an atomic add. It
just comes to my realization recently that this is not the case. The
UNLOCK_LOCK_PREFIX will be mapped to "" except for some old 32-bit x86
processors.

Hmm, I didn't know that. Looking through Google found these rules for x86 
memory ordering:
   * Loads are not reordered with other loads.
   * Stores are not reordered with other stores.
   * Stores are not reordered with older loads.
So x86 memory model is rather strict and memory barrier is really not needed in the unlock path - 
xadd is a store and thus behaves like a memory barrier, and since only lock's owner modifies 
"ticket.head" the "add" instruction need not be atomic.

But this is true only for x86, other architectures have more relaxed memory 
ordering. Maybe we should allow arch code to redefine queue_spin_unlock()? And 
define a version without smp_mb() for x86?


What I have been thinking is to set a flag in an architecture-specific
header file to tell if the architecture needs a memory barrier. The
generic code will then do either a smp_mb() or a barrier() depending on
the presence or absence of the flag. I would prefer to do more in the
generic code, if possible.
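
A rough sketch of that idea (the flag name, the lock field and the
function body below are only illustrative, not from any posted patch):

/* asm/qspinlock.h on a strongly ordered architecture such as x86 */
#define ARCH_QSPINLOCK_STRONG_ORDER

/* generic unlock path */
static inline void queue_spin_unlock(struct qspinlock *lock)
{
#ifdef ARCH_QSPINLOCK_STRONG_ORDER
        barrier();      /* compiler barrier is sufficient */
#else
        smp_mb();       /* full memory barrier for weakly ordered CPUs */
#endif
        ACCESS_ONCE(lock->locked) = 0;  /* assumed name of the lock word */
}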


Regards,
Longman


Re: [PATCH v6 5/6] MCS Lock: Restructure the MCS lock defines and locking code into its own file

2013-10-01 Thread Waiman Long

On 10/01/2013 12:48 PM, Tim Chen wrote:

On Mon, 2013-09-30 at 12:36 -0400, Waiman Long wrote:

On 09/30/2013 12:10 PM, Jason Low wrote:

On Mon, 2013-09-30 at 11:51 -0400, Waiman Long wrote:

On 09/28/2013 12:34 AM, Jason Low wrote:

Also, below is what the mcs_spin_lock() and mcs_spin_unlock()
functions would look like after applying the proposed changes.

static noinline
void mcs_spin_lock(struct mcs_spin_node **lock, struct mcs_spin_node *node)
{
   struct mcs_spin_node *prev;

   /* Init node */
   node->locked = 0;
   node->next   = NULL;

   prev = xchg(lock, node);
   if (likely(prev == NULL)) {
   /* Lock acquired. No need to set node->locked since it
won't be used */
   return;
   }
   ACCESS_ONCE(prev->next) = node;
   /* Wait until the lock holder passes the lock down */
   while (!ACCESS_ONCE(node->locked))
   arch_mutex_cpu_relax();
   smp_mb();

I wonder if a memory barrier is really needed here.

If the compiler can reorder the while (!ACCESS_ONCE(node->locked)) check
so that the check occurs after an instruction in the critical section,
then the barrier may be necessary.


In that case, just a barrier() call should be enough.

The cpu could still be executing out of order load instruction from the
critical section before checking node->locked?  Probably smp_mb() is
still needed.

Tim


But this is the lock function, a barrier() call should be enough to 
prevent the critical section from creeping up there. We certainly need 
some kind of memory barrier at the end of the unlock function.


-Longman


Re: [PATCH] rwsem: reduce spinlock contention in wakeup code path

2013-10-01 Thread Waiman Long

On 10/01/2013 03:33 AM, Ingo Molnar wrote:

* Waiman Long  wrote:


I think Waiman's patches (even the later ones) made the queued rwlocks
be a side-by-side implementation with the old rwlocks, and I think
that was just being unnecessarily careful. It might be useful for
testing to have a config option to switch between the two, but we
might as well go all the way.

It is not actually a side-by-side implementation. A user can choose
either the regular rwlock or the queued one, but never both, by setting a
configuration parameter. However, I now think that maybe we should do it
the lockref way by pre-determining it at the per-architecture level
without a user-visible configuration option.

Well, as I pointed it out to you during review, such a Kconfig driven
locking API choice is a no-go!

What I suggested instead: there's absolutely no problem with providing a
better rwlock_t implementation, backed with numbers and all that.

Thanks,

Ingo


Yes, this is what I am planning to do. The next version of my qrwlock
patch will force the switch to the queue rwlock for the x86 architecture.
The other architectures will have to be done separately.


-Longman



Re: [PATCH v6 5/6] MCS Lock: Restructure the MCS lock defines and locking code into its own file

2013-10-01 Thread Waiman Long

On 10/01/2013 05:16 PM, Tim Chen wrote:

On Tue, 2013-10-01 at 16:01 -0400, Waiman Long wrote:


The cpu could still be executing out of order load instruction from the
critical section before checking node->locked?  Probably smp_mb() is
still needed.

Tim

But this is the lock function, a barrier() call should be enough to
prevent the critical section from creeping up there. We certainly need
some kind of memory barrier at the end of the unlock function.

I may be missing something.  My understanding is that barrier() only
prevents the compiler from rearranging instructions, but does not prevent
cpu out-of-order execution (as smp_mb does). So the cpu could read memory
in the next critical section before node->locked is true (i.e. before the
unlock has completed).  If we only have a simple barrier at the end of
mcs_lock, then say the code on CPU1 is

mcs_lock
x = 1;
...
x = 2;
mcs_unlock

and CPU 2 is

mcs_lock
y = x;
...
mcs_unlock

We expect y to be 2 after the "y = x" assignment.  But we
may execute the code as

CPU1                                CPU2

x = 1;
...                                 y = x;  (y=1, out of order load)
x = 2
mcs_unlock
                                    Check node->locked==true
                                    continue executing critical section
                                    (y=1 when we expect y=2)

So we get y to be 1 when we expect that it should be 2.  Adding smp_mb
after the node->locked check in lock code

ACCESS_ONCE(prev->next) = node;
/* Wait until the lock holder passes the lock down */
while (!ACCESS_ONCE(node->locked))
 arch_mutex_cpu_relax();
smp_mb();

should prevent this scenario.

Thanks.
Tim


If the lock and unlock functions are done right, there should be no
overlap of critical sections. So it is the job of the lock/unlock functions
to make sure that critical section code won't leak out. There should be
some kind of memory barrier at the beginning of the lock function and at
the end of the unlock function.

The critical section is also likely to have branches. The CPU may
speculatively execute code on the 2 branches, but one of them will be
discarded once the branch condition is known. Also,
arch_mutex_cpu_relax() is a compiler barrier by itself. So we may not
need a barrier() after all. The while statement is a branch instruction;
any code after it can only be speculatively executed and cannot be
committed until the branch is done.


On x86, the smp_mb() function translates to an mfence instruction which
costs time. That is why I try to get rid of it when it is not necessary.


Regards,
Longman


[PATCH v4 2/3] qrwlock x86: Enable x86 to use queue read/write lock

2013-10-02 Thread Waiman Long
This patch makes the necessary changes at the x86 architecture specific
layer to enable the presence of the CONFIG_QUEUE_RWLOCK kernel option
to replace the read/write lock by the queue read/write lock.

It also enables CONFIG_ARCH_QUEUE_RWLOCK which will force the use
of the queue read/write lock for x86, which tends to have the largest
NUMA machines compared with the other architectures. This patch will
improve the scalability of those large machines.

Signed-off-by: Waiman Long 
---
 arch/x86/Kconfig  |1 +
 arch/x86/include/asm/spinlock.h   |2 ++
 arch/x86/include/asm/spinlock_types.h |4 
 3 files changed, 7 insertions(+), 0 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index ee2fb9d..14b4dca 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -123,6 +123,7 @@ config X86
select COMPAT_OLD_SIGACTION if IA32_EMULATION
select RTC_LIB
select HAVE_DEBUG_STACKOVERFLOW
+   select QUEUE_RWLOCK
 
 config INSTRUCTION_DECODER
def_bool y
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index bf156de..8fb88c5 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -188,6 +188,7 @@ static inline void arch_spin_unlock_wait(arch_spinlock_t 
*lock)
cpu_relax();
 }
 
+#ifndef CONFIG_QUEUE_RWLOCK
 /*
  * Read-write spinlocks, allowing multiple readers
  * but only one writer.
@@ -270,6 +271,7 @@ static inline void arch_write_unlock(arch_rwlock_t *rw)
asm volatile(LOCK_PREFIX WRITE_LOCK_ADD(%1) "%0"
 : "+m" (rw->write) : "i" (RW_LOCK_BIAS) : "memory");
 }
+#endif /* CONFIG_QUEUE_RWLOCK */
 
 #define arch_read_lock_flags(lock, flags) arch_read_lock(lock)
 #define arch_write_lock_flags(lock, flags) arch_write_lock(lock)
diff --git a/arch/x86/include/asm/spinlock_types.h 
b/arch/x86/include/asm/spinlock_types.h
index 4f1bea1..a585635 100644
--- a/arch/x86/include/asm/spinlock_types.h
+++ b/arch/x86/include/asm/spinlock_types.h
@@ -34,6 +34,10 @@ typedef struct arch_spinlock {
 
 #define __ARCH_SPIN_LOCK_UNLOCKED  { { 0 } }
 
+#ifdef CONFIG_QUEUE_RWLOCK
+#include 
+#else
 #include 
+#endif
 
 #endif /* _ASM_X86_SPINLOCK_TYPES_H */
-- 
1.7.1



[PATCH v4 0/3] qrwlock: Introducing a queue read/write lock implementation

2013-10-02 Thread Waiman Long
v3->v4:
 - Optimize the fast path with better cold cache behavior and
   performance.
 - Removing some testing code.
 - Make x86 use queue rwlock with no user configuration.

v2->v3:
 - Make read lock stealing the default and fair rwlock an option with
   a different initializer.
 - In queue_read_lock_slowpath(), check irq_count() and force spinning
   and lock stealing in interrupt context.
 - Unify the fair and classic read-side code path, and make write-side
   to use cmpxchg with 2 different writer states. This slows down the
   write lock fastpath to make the read side more efficient, but is
   still slightly faster than a spinlock.

v1->v2:
 - Improve lock fastpath performance.
 - Optionally provide classic read/write lock behavior for backward
   compatibility.
 - Use xadd instead of cmpxchg for fair reader code path to make it
   immune to reader contention.
 - Run more performance testing.

As mentioned in the LWN article http://lwn.net/Articles/364583/,
the read/write lock suffers from an unfairness problem: it is
possible for a stream of incoming readers to block a waiting writer
from getting the lock for a long time. Also, a waiting reader/writer
contending for a rwlock in local memory will have a higher chance of
acquiring the lock than a reader/writer on a remote node.

This patch set introduces a queue-based read/write lock implementation
that can largely solve this unfairness problem if the lock owners
choose to use the fair variant of the lock.

The queue rwlock has two variants selected at initialization time
- unfair (with read lock stealing) and fair (to both readers and
writers). The unfair rwlock is the default.
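
For example, with the initializers added in patch 3 of this series, a
lock owner picks the variant at definition time (the lock names below
are just placeholders):

static DEFINE_RWLOCK(my_unfair_lock);       /* default: readers may steal */
static DEFINE_RWLOCK_FAIR(my_fair_lock);    /* fair to readers and writers */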

The read lock slowpath will check if the reader is in an interrupt
context. If so, it will force lock spinning and stealing without
waiting in a queue. This is to ensure the read lock will be granted
as soon as possible.

Even the unfair rwlock is fairer than the current version, as there
is a higher chance for writers to get the lock, and it is fair among
the writers.

The queue write lock can also be used as a replacement for ticket
spinlocks that are highly contended if lock size increase is not
an issue.

This patch set currently provides queue read/write lock support on
x86 architecture only. Support for other architectures can be added
later on once architecture specific support infrastructure is added
and proper testing is done.

Signed-off-by: Waiman Long 

Waiman Long (3):
  qrwlock: A queue read/write lock implementation
  qrwlock x86: Enable x86 to use queue read/write lock
  qrwlock: Enable fair queue read/write lock

 arch/x86/Kconfig  |1 +
 arch/x86/include/asm/spinlock.h   |2 +
 arch/x86/include/asm/spinlock_types.h |4 +
 include/asm-generic/qrwlock.h |  256 +
 include/linux/rwlock.h|   15 ++
 include/linux/rwlock_types.h  |   13 ++
 kernel/Kconfig.locks  |7 +
 lib/Makefile  |1 +
 lib/qrwlock.c |  247 +++
 lib/spinlock_debug.c  |   19 +++
 10 files changed, 565 insertions(+), 0 deletions(-)
 create mode 100644 include/asm-generic/qrwlock.h
 create mode 100644 lib/qrwlock.c



[PATCH v4 3/3] qrwlock: Enable fair queue read/write lock

2013-10-02 Thread Waiman Long
By default, the queue rwlock is fair among writers and gives preference
to readers, allowing them to steal the lock even if a writer is
waiting. However, there is a desire to have a fair variant of
rwlock that is more deterministic. To enable this, fair variants
of lock initializers are added by this patch to allow lock owners
to choose to use fair rwlock. These newly added initializers all
have the _fair or _FAIR suffix to indicate the desire to use a fair
rwlock. If the QUEUE_RWLOCK config option is not selected, the fair
rwlock initializers will be the same as the regular ones.

Signed-off-by: Waiman Long 
---
 include/linux/rwlock.h   |   15 +++
 include/linux/rwlock_types.h |   13 +
 lib/spinlock_debug.c |   19 +++
 3 files changed, 47 insertions(+), 0 deletions(-)

diff --git a/include/linux/rwlock.h b/include/linux/rwlock.h
index bc2994e..5f2628b 100644
--- a/include/linux/rwlock.h
+++ b/include/linux/rwlock.h
@@ -23,9 +23,24 @@ do { 
\
\
__rwlock_init((lock), #lock, &__key);   \
 } while (0)
+
+# ifdef CONFIG_QUEUE_RWLOCK
+extern void __rwlock_init_fair(rwlock_t *lock, const char *name,
+  struct lock_class_key *key);
+#  define rwlock_init_fair(lock)   \
+do {   \
+   static struct lock_class_key __key; \
+   \
+   __rwlock_init_fair((lock), #lock, &__key);  \
+} while (0)
+# else
+#  define __rwlock_init_fair(l, n, k)  __rwlock_init(l, n, k)
+# endif /* CONFIG_QUEUE_RWLOCK */
 #else
 # define rwlock_init(lock) \
do { *(lock) = __RW_LOCK_UNLOCKED(lock); } while (0)
+# define rwlock_init_fair(lock)\
+   do { *(lock) = __RW_LOCK_UNLOCKED_FAIR(lock); } while (0)
 #endif
 
 #ifdef CONFIG_DEBUG_SPINLOCK
diff --git a/include/linux/rwlock_types.h b/include/linux/rwlock_types.h
index cc0072e..d27c812 100644
--- a/include/linux/rwlock_types.h
+++ b/include/linux/rwlock_types.h
@@ -37,12 +37,25 @@ typedef struct {
.owner = SPINLOCK_OWNER_INIT,   \
.owner_cpu = -1,\
RW_DEP_MAP_INIT(lockname) }
+#define __RW_LOCK_UNLOCKED_FAIR(lockname)  \
+   (rwlock_t)  {   .raw_lock = __ARCH_RW_LOCK_UNLOCKED_FAIR,\
+   .magic = RWLOCK_MAGIC,  \
+   .owner = SPINLOCK_OWNER_INIT,   \
+   .owner_cpu = -1,\
+   RW_DEP_MAP_INIT(lockname) }
 #else
 #define __RW_LOCK_UNLOCKED(lockname) \
(rwlock_t)  {   .raw_lock = __ARCH_RW_LOCK_UNLOCKED,\
RW_DEP_MAP_INIT(lockname) }
+#define __RW_LOCK_UNLOCKED_FAIR(lockname) \
+   (rwlock_t)  {   .raw_lock = __ARCH_RW_LOCK_UNLOCKED_FAIR,\
+   RW_DEP_MAP_INIT(lockname) }
 #endif
 
 #define DEFINE_RWLOCK(x)   rwlock_t x = __RW_LOCK_UNLOCKED(x)
+#define DEFINE_RWLOCK_FAIR(x)  rwlock_t x = __RW_LOCK_UNLOCKED_FAIR(x)
 
+#ifndef __ARCH_RW_LOCK_UNLOCKED_FAIR
+#define __ARCH_RW_LOCK_UNLOCKED_FAIR    __ARCH_RW_LOCK_UNLOCKED
+#endif
 #endif /* __LINUX_RWLOCK_TYPES_H */
diff --git a/lib/spinlock_debug.c b/lib/spinlock_debug.c
index 0374a59..d6ef7ce 100644
--- a/lib/spinlock_debug.c
+++ b/lib/spinlock_debug.c
@@ -49,6 +49,25 @@ void __rwlock_init(rwlock_t *lock, const char *name,
 
 EXPORT_SYMBOL(__rwlock_init);
 
+#ifdef CONFIG_QUEUE_RWLOCK
+void __rwlock_init_fair(rwlock_t *lock, const char *name,
+   struct lock_class_key *key)
+{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+   /*
+* Make sure we are not reinitializing a held lock:
+*/
+   debug_check_no_locks_freed((void *)lock, sizeof(*lock));
+   lockdep_init_map(&lock->dep_map, name, key, 0);
+#endif
+   lock->raw_lock = (arch_rwlock_t) __ARCH_RW_LOCK_UNLOCKED_FAIR;
+   lock->magic = RWLOCK_MAGIC;
+   lock->owner = SPINLOCK_OWNER_INIT;
+   lock->owner_cpu = -1;
+}
+EXPORT_SYMBOL(__rwlock_init_fair);
+#endif /* CONFIG_QUEUE_RWLOCK */
+
 static void spin_dump(raw_spinlock_t *lock, const char *msg)
 {
struct task_struct *owner = NULL;
-- 
1.7.1



[PATCH v4 1/3] qrwlock: A queue read/write lock implementation

2013-10-02 Thread Waiman Long
pin_lock
   1.04%   reaim  [kernel.kallsyms]  [k] mutex_spin_on_owner

Perf profile of kernel (3):

  10.57%   reaim  [kernel.kallsyms]  [k] _raw_spin_lock_irqsave
   7.98%   reaim  [kernel.kallsyms]  [k] queue_write_lock_slowpath
   5.83%   reaim  [kernel.kallsyms]  [k] mspin_lock
   2.86%  ls  [kernel.kallsyms]  [k] _raw_spin_lock_irqsave
   2.71%   reaim  [kernel.kallsyms]  [k] anon_vma_interval_tree_insert
   1.52%true  [kernel.kallsyms]  [k] _raw_spin_lock_irqsave
   1.51%   reaim  [kernel.kallsyms]  [k] queue_read_lock_slowpath
   1.35%   reaim  [kernel.kallsyms]  [k] mutex_spin_on_owner
   1.12%   reaim  [kernel.kallsyms]  [k] zap_pte_range
   1.06%   reaim  [kernel.kallsyms]  [k] perf_event_aux_ctx
   1.01%   reaim  [kernel.kallsyms]  [k] perf_event_aux

Tim Chen also tested the qrwlock with Ingo's patch on a 4-socket
machine.  It was found that the performance improvement of 11% was the
same with either the regular rwlock or the queue rwlock.

Signed-off-by: Waiman Long 
---
 include/asm-generic/qrwlock.h |  256 +
 kernel/Kconfig.locks  |7 +
 lib/Makefile  |1 +
 lib/qrwlock.c |  247 +++
 4 files changed, 511 insertions(+), 0 deletions(-)
 create mode 100644 include/asm-generic/qrwlock.h
 create mode 100644 lib/qrwlock.c

diff --git a/include/asm-generic/qrwlock.h b/include/asm-generic/qrwlock.h
new file mode 100644
index 000..e94c69c
--- /dev/null
+++ b/include/asm-generic/qrwlock.h
@@ -0,0 +1,256 @@
+/*
+ * Queue read/write lock
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * (C) Copyright 2013 Hewlett-Packard Development Company, L.P.
+ *
+ * Authors: Waiman Long 
+ */
+#ifndef __ASM_GENERIC_QRWLOCK_H
+#define __ASM_GENERIC_QRWLOCK_H
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#if !defined(__LITTLE_ENDIAN) && !defined(__BIG_ENDIAN)
+#error "Missing either LITTLE_ENDIAN or BIG_ENDIAN definition."
+#endif
+
+#if (CONFIG_NR_CPUS < 65536)
+typedef u16 __nrcpu_t;
+typedef u32 __nrcpupair_t;
+#define QRW_READER_BIAS (1U << 16)
+#else
+typedef u32 __nrcpu_t;
+typedef u64 __nrcpupair_t;
+#define QRW_READER_BIAS (1UL << 32)
+#endif
+
+/*
+ * The queue read/write lock data structure
+ *
+ * Read lock stealing can only happen when there is at least one reader
+ * holding the read lock. When the fair flag is not set, it mimics the
+ * behavior of the regular rwlock at the expense that a perpetual stream
+ * of readers could starve a writer for a long period of time. That
+ * behavior, however, may be beneficial to a workload that is reader heavy
+ * with slow writers, and the writers can wait without undesirable consequence.
+ * This fair flag should only be set at initialization time.
+ *
+ * The layout of the structure is endian-sensitive to make sure that adding
+ * QRW_READER_BIAS to the rw field to increment the reader count won't
+ * disturb the writer and the fair fields.
+ */
+struct qrwnode {
+   struct qrwnode *next;
+   boolwait;   /* Waiting flag */
+};
+
+typedef struct qrwlock {
+   union qrwcnts {
+   struct {
+#ifdef __LITTLE_ENDIAN
+   u8writer;   /* Writer state */
+   u8fair; /* Fair rwlock flag */
+   __nrcpu_t readers;  /* # of active readers  */
+#else
+   __nrcpu_t readers;  /* # of active readers  */
+   u8fair; /* Fair rwlock flag */
+   u8writer;   /* Writer state */
+#endif
+   };
+   __nrcpupair_t rw;   /* Reader/writer number pair */
+   } cnts;
+   struct qrwnode *waitq;  /* Tail of waiting queue */
+} arch_rwlock_t;
+
+/*
+ * Writer state values & mask
+ */
+#define QW_WAITING      1                   /* A writer is waiting      */
+#define QW_LOCKED       0xff                /* A writer holds the lock  */
+#define QW_MASK_FAIR    ((u8)~QW_WAITING)   /* Mask for fair reader     */
+#define QW_MASK_UNFAIR  ((u8)~0)            /* Mask for unfair reader   */
+
+/*
+ * External function declarations
+ */
+extern void queue_read_lock_slowpath(struct qrwlock *lock);
+extern void queue_write_lock_slowpath(struct qrwlock *lock);
+
+/**
+ * queue_read_can_lock- would read_trylock() succe

Re: [PATCH v6 5/6] MCS Lock: Restructure the MCS lock defines and locking code into its own file

2013-10-02 Thread Waiman Long

On 09/26/2013 06:42 PM, Jason Low wrote:

On Thu, 2013-09-26 at 14:41 -0700, Tim Chen wrote:

Okay, that would make sense for consistency because we always
first set node->lock = 0 at the top of the function.

If we prefer to optimize this a bit though, perhaps we can
first move the node->lock = 0 so that it gets executed after the
"if (likely(prev == NULL)) {}" code block and then delete
"node->lock = 1" inside the code block.

static noinline
void mcs_spin_lock(struct mcs_spin_node **lock, struct mcs_spin_node *node)
{
struct mcs_spin_node *prev;

/* Init node */
node->next   = NULL;

prev = xchg(lock, node);
if (likely(prev == NULL)) {
/* Lock acquired */
return;
}
node->locked = 0;


You can remove the locked flag setting statement inside if (prev == 
NULL), but you can't clear the locked flag after xchg(). In the interval 
between xchg() and locked=0, the previous lock owner may come in and set 
the flag. Now if you clear it, the thread will loop forever. You have
to clear it before xchg().


-Longman


Re: [PATCH v6 5/6] MCS Lock: Restructure the MCS lock defines and locking code into its own file

2013-10-02 Thread Waiman Long

On 10/02/2013 02:43 PM, Tim Chen wrote:

On Tue, 2013-10-01 at 21:25 -0400, Waiman Long wrote:


If the lock and unlock functions are done right, there should be no
overlap of critical sections. So it is the job of the lock/unlock functions
to make sure that critical section code won't leak out. There should be
some kind of memory barrier at the beginning of the lock function and at
the end of the unlock function.

The critical section is also likely to have branches. The CPU may
speculatively execute code on the 2 branches, but one of them will be
discarded once the branch condition is known. Also,
arch_mutex_cpu_relax() is a compiler barrier by itself. So we may not
need a barrier() after all. The while statement is a branch instruction;
any code after it can only be speculatively executed and cannot be
committed until the branch is done.

But the condition code may be checked after speculative execution?
The condition may not be true during speculative execution and only
turns true when we check the condition, and take that branch?

The thing that bothers me is that without a memory barrier after the while
statement, we could speculatively execute before affirming the lock is
in the acquired state. Then when we check the lock, the lock has been set
to the acquired state in the meantime.
We could be loading some memory entry *before*
node->locked has been set true.  I think a smp_rmb (if not a
smp_mb) should be added after the while statement.


Yes, I think a smp_rmb() makes sense here to correspond to the smp_wmb()
in the unlock path.


BTW, you need to move the node->locked = 0; statement before xchg() if 
you haven't done so.
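
Putting those two points together, the lock function would then look
roughly like this (an illustrative sketch only, based on the code quoted
earlier in this thread):

static noinline
void mcs_spin_lock(struct mcs_spin_node **lock, struct mcs_spin_node *node)
{
        struct mcs_spin_node *prev;

        /* Init node: locked must be cleared before the node is published */
        node->locked = 0;
        node->next   = NULL;

        prev = xchg(lock, node);        /* also acts as a full barrier */
        if (likely(prev == NULL)) {
                /* Lock acquired */
                return;
        }
        ACCESS_ONCE(prev->next) = node;
        /* Wait until the lock holder passes the lock down */
        while (!ACCESS_ONCE(node->locked))
                arch_mutex_cpu_relax();
        smp_rmb();      /* pairs with the smp_wmb() in the unlock path */
}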


-Longman




Re: [PATCH v6 5/6] MCS Lock: Restructure the MCS lock defines and locking code into its own file

2013-10-02 Thread Waiman Long

On 10/02/2013 03:30 PM, Jason Low wrote:

On Wed, Oct 2, 2013 at 12:19 PM, Waiman Long  wrote:

On 09/26/2013 06:42 PM, Jason Low wrote:

On Thu, 2013-09-26 at 14:41 -0700, Tim Chen wrote:

Okay, that would make sense for consistency because we always
first set node->lock = 0 at the top of the function.

If we prefer to optimize this a bit though, perhaps we can
first move the node->lock = 0 so that it gets executed after the
"if (likely(prev == NULL)) {}" code block and then delete
"node->lock = 1" inside the code block.

static noinline
void mcs_spin_lock(struct mcs_spin_node **lock, struct mcs_spin_node
*node)
{
 struct mcs_spin_node *prev;

 /* Init node */
 node->next   = NULL;

 prev = xchg(lock, node);
 if (likely(prev == NULL)) {
 /* Lock acquired */
 return;
 }
 node->locked = 0;


You can remove the locked flag setting statement inside if (prev == NULL),
but you can't clear the locked flag after xchg(). In the interval between
xchg() and locked=0, the previous lock owner may come in and set the flag.
Now if you clear it, the thread will loop forever. You have to clear it
before xchg().

Yes, in my most recent version, I left locked = 0 in its original
place so that the xchg() can act as a barrier for it.

The other option would have been to put another barrier after locked =
0. I went with leaving locked = 0 in its original place so that we
don't need that extra barrier.


I don't think putting another barrier after locked=0 will work. 
Chronologically, the flag must be cleared before the node address is 
saved in the lock field. There is no way to guarantee that except by 
putting the locked=0 before xchg().


-Longman


Re: [PATCH v5 01/12] spinlock: A new lockref structure for lockless update of refcount

2013-07-08 Thread Waiman Long

On 07/05/2013 02:59 PM, Thomas Gleixner wrote:

On Fri, 5 Jul 2013, Waiman Long wrote:

+ * If the spinlock & reference count optimization feature is disabled,
+ * the spinlock and reference count are accessed separately on their own.
+ */
+struct lockref {
+   unsigned int refcnt;/* Reference count */
+   spinlock_t   lock;
+};
+
+/*
+ * Struct lockref helper functions
+ */
+/*

Function documentation starts with /**


I will fix that.


+ * lockref_get - increments reference count while not locked

This should be: Increments reference count unconditionally.


Will change that.


+ * @lockcnt: pointer to lockref structure
+ */
+static __always_inline void
+lockref_get(struct lockref *lockcnt)

Please avoid these line breaks if the line fits in 80 chars.


Will make the change.


+{
+   spin_lock(&lockcnt->lock);
+   lockcnt->refcnt++;
+   spin_unlock(&lockcnt->lock);
+}
+/*
+ * lockref_put_or_locked - decrements count unless count <= 1 before decrement
+ *                         otherwise the lock will be taken

   lockref_put_or_lock please

Docbook cannot work with multiline comments for the function name.

So make that shorter and add a longer explanation below the @argument
docs.


Will fix that.


+ * @lockcnt: pointer to lockref structure
+ * Return: 1 if count updated successfully or 0 if count <= 1 and lock taken
+ */
+static __always_inline int
+lockref_put_or_locked(struct lockref *lockcnt)
+{
+   spin_lock(&lockcnt->lock);
+   if (likely(lockcnt->refcnt > 1)) {
+   lockcnt->refcnt--;
+   spin_unlock(&lockcnt->lock);
+   return 1;
+   }
+   return 0;   /* Count is 1 & lock taken */

Please no tail comments. They are horrible to parse and it's obvious
from the code.


Will remove the tail comments.


+}
+
+#endif /* CONFIG_SPINLOCK_REFCOUNT */
+#endif /* __LINUX_SPINLOCK_REFCOUNT_H */
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 44511d1..d1f8670 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -223,3 +223,8 @@ endif
  config MUTEX_SPIN_ON_OWNER
def_bool y
depends on SMP && !DEBUG_MUTEXES
+
+config SPINLOCK_REFCOUNT
+   def_bool y
+   depends on ARCH_SPINLOCK_REFCOUNT && SMP

This looks wrong. We want three options:

1) Always take the lock
2) Use the generic implementation
3) Use an architecture specific implementation

So we want

config GENERIC_SPINLOCK_REFCOUNT
  bool

config ARCH_SPINLOCK_REFCOUNT
  bool

config SPINLOCK_REFCOUNT
  def_bool y
  depends on GENERIC_SPINLOCK_REFCOUNT || ARCH_SPINLOCK_REFCOUNT
  depends on .

So an architectire can select GENERIC_SPINLOCK_REFCOUNT to get the
generic implementation or ARCH_SPINLOCK_REFCOUNT if it provides its
own special version.

And lib/spinlock_refcount.o depends on GENERIC_SPINLOCK_REFCOUNT


I think it is OK to add the GENERIC option, but I would like to make 
available a slightly different set of options:

1) Always take the lock
2) Use the generic implementation with the default parameters
3) Use the generic implementation with a customized set of parameters
4) Use an architecture specific implementation.

2) set only GENERIC_SPINLOCK_REFCOUNT
3) set both GENERIC_SPINLOCK_REFCOUNT and ARCH_SPINLOCK_REFCOUNT
4) set only ARCH_SPINLOCK_REFCOUNT

The customized parameters will be set in the "asm/spinlock_refcount.h"
file. Currently, there are 2 parameters that can be customized for each
architecture:

1) How much time will the function wait until the lock is free
2) How many attempts to do a lockless cmpxchg to update reference count
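
In other words, an architecture that wants customized behavior would just
provide something like the following in its asm/spinlock_refcount.h (the
macro names are the ones used by the generic implementation; the values
and comments here are only illustrative):

/* how long to wait for the lock to become free before falling back */
#define LOCKREF_WAIT_SHIFT      12
/* how many lockless cmpxchg attempts before taking the lock */
#define LOCKREF_RETRY_COUNT     2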


+/*
+ * The maximum number of waiting spins when the lock was acquiring before
+ * trying to attempt a lockless update. The purpose of the timeout is to
+ * limit the amount of unfairness to the thread that is doing the reference
+ * count update. Otherwise, it is theoretically possible for that thread to
+ * be starved for a really long time causing all kind of problems. If it
+ * times out and the lock is still not free, the code will fall back to the
+ * traditional way of queuing up to acquire the lock before updating the count.
+ *
+ * The actual time spent in the wait loop very much depends on the CPU being
+ * used. On a 2.4GHz Westmere CPU, the execution time of a PAUSE instruction
+ * (cpu_relax) in a 4k loop is about 16us. The lock checking and looping
+ * overhead is about 8us. With 4 cpu_relax's in the loop, it will wait
+ * about 72us before the count reaches 0. With cacheline contention, the
+ * wait time can go up to 3x as much (about 210us). Having multiple
+ * cpu_relax's in the wait loop does seem to reduce cacheline contention
+ * somewhat and give slightly better performance.
+ *
+ * The preset timeout value is rather arbitrary and really depends on the CPU
+ * being used. If customization i

Re: [PATCH v5 00/12] Lockless update of reference count protected by spinlock

2013-07-08 Thread Waiman Long

On 07/05/2013 04:33 PM, Thomas Gleixner wrote:

On Fri, 5 Jul 2013, Waiman Long wrote:

patch 1:Introduce the new lockref data structure
patch 2:Enable x86 architecture to use the feature
patch 3:Rename all d_count references to d_refcount

And after that the mail threading does not work anymore, because you
managed to lose the

In-Reply-To:<1373035656-40600-1-git-send-email-waiman.l...@hp.com>
References:<1373035656-40600-1-git-send-email-waiman.l...@hp.com>

tags in patches 4-12


Thanks for pointing this out. I will change the way I send the patches
to fix this issue in the next revision of the patch set.


Regards,
Longman


Re: [PATCH 1/2 v5] SELinux: Reduce overhead of mls_level_isvalid() function call

2013-07-08 Thread Waiman Long

On 07/08/2013 12:30 PM, Paul Moore wrote:

On Friday, July 05, 2013 01:10:32 PM Waiman Long wrote:

On 06/11/2013 07:49 AM, Stephen Smalley wrote:

On 06/10/2013 01:55 PM, Waiman Long wrote:

...


Signed-off-by: Waiman Long

Acked-by:  Stephen Smalley

Thanks for the Ack. Will that patch go into v3.11?

[NOTE: I added the SELinux list to the CC line; for future reference, be sure to
send your SELinux patches there.]

Your patch looked reasonable to me and Stephen ACK'd it so I went ahead and
pulled the 1/2 patch into my lblnet-next tree.  It is probably an abuse of the
system, but as you noted in the description, it does have an impact on
socket creation so it isn't completely unrelated ;)

If you don't want me to include your patch let me know and I'll drop it.


Sure. I would like to have my patch included. Thanks for letting me know.

Regards,
Longman


[PATCH v6 11/14] nilfs2: replace direct access of d_count with the d_count() helper

2013-07-08 Thread Waiman Long
All readonly references to d_count outside of the core dcache code
should be changed to use the new d_count() helper as they shouldn't
access its value directly.  There is no change in logic and everything
should just work.

Signed-off-by: Waiman Long 
Acked-by: Ryusuke Konishi 
---
 fs/nilfs2/super.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 1427de5..af3ba04 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -996,7 +996,7 @@ static int nilfs_attach_snapshot(struct super_block *s, 
__u64 cno,
 
 static int nilfs_tree_was_touched(struct dentry *root_dentry)
 {
-   return root_dentry->d_count > 1;
+   return d_count(root_dentry) > 1;
 }
 
 /**
-- 
1.7.1



[PATCH v6 02/14] spinlock: Enable x86 architecture to do lockless refcount update

2013-07-08 Thread Waiman Long
This patch enables the x86 architecture to do lockless reference
count update using the generic lockref implementation with default
parameters. Only the x86/Kconfig file needs to be changed.

Signed-off-by: Waiman Long 
---
 arch/x86/Kconfig |3 +++
 1 files changed, 3 insertions(+), 0 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 265c672..6a86061 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -261,6 +261,9 @@ config ARCH_CPU_PROBE_RELEASE
 config ARCH_SUPPORTS_UPROBES
def_bool y
 
+config GENERIC_SPINLOCK_REFCOUNT
+   def_bool y
+
 source "init/Kconfig"
 source "kernel/Kconfig.freezer"
 
-- 
1.7.1



[PATCH v6 01/14] spinlock: A new lockref structure for lockless update of refcount

2013-07-08 Thread Waiman Long
This patch introduces a new set of spinlock_refcount.h header files to
be included by kernel codes that want to do a faster lockless update
of reference count protected by a spinlock.

The new lockref structure consists of just the spinlock and the
reference count data. Helper functions are defined in the new
 header file to access the content of
the new structure. There is a generic structure defined for all
architecture, but each architecture can also optionally define its
own structure and use its own helper functions.

Three new config parameters are introduced:
1. SPINLOCK_REFCOUNT
2. GENERIC_SPINLOCK_REFCOUNT
3. ARCH_SPINLOCK_REFCOUNT

The first one is defined in the kernel/Kconfig.locks which is used
to enable or disable the faster lockless reference count update
optimization. The second and third one have to be defined in each of
the architecture's Kconfig file to enable the optimization for that
architecture. Therefore, each architecture has to opt-in for this
optimization or it won't get it. This allows each architecture plenty
of time to test it out before deciding to use it or replace it with
a better architecture specific solution. The architecture should set
only GENERIC_SPINLOCK_REFCOUNT to use the generic implementation
without customization. By setting only ARCH_SPINLOCK_REFCOUNT,
the architecture will have to provide its own implementation. By
setting both, an architecture uses the generic implementation with
customized parameters.

This optimization won't work on non-SMP systems or when spinlock
debugging is turned on. As a result, it is turned off when either of
them is true. It also won't work with full preempt-RT and so should
be turned off in that case.

To maximize the chance of doing lockless update in the generic version,
the inlined __lockref_add_unless() function will wait for a certain
amount of time if the lock is not free before trying to do the update.
The amount of time is controlled by the LOCKREF_WAIT_SHIFT macro.

The new code also attempts to do a lockless atomic update a few
times before falling back to the old code path of acquiring the lock
before doing the update. Similarly, this is controlled by the
LOCKREF_RETRY_COUNT macro.
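
A condensed sketch of that flow is shown below. This is not the actual
lib/spinlock_refcount.c code: the waiting loop is omitted, the function
signature is assumed, and a 64-bit cmpxchg is assumed to be available.

static __always_inline int
__lockref_add_unless(struct lockref *lc, int add, int unless)
{
        struct lockref old, new;
        int retry;

        for (retry = 0; retry < LOCKREF_RETRY_COUNT; retry++) {
                old.lock_count = ACCESS_ONCE(lc->lock_count);
                if ((int)old.refcnt == unless)
                        break;
                if (spin_is_locked(&old.lock))
                        continue;       /* the real code waits here */
                new.lock_count = old.lock_count;
                new.refcnt += add;
                if (cmpxchg64(&lc->lock_count, old.lock_count,
                              new.lock_count) == old.lock_count)
                        return 1;       /* lockless update done */
        }
        return 0;       /* fall back to spin_lock() + update */
}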

Signed-off-by: Waiman Long 
---
 include/asm-generic/spinlock_refcount.h |   46 +++
 include/linux/spinlock_refcount.h   |  142 
 kernel/Kconfig.locks|   15 ++
 lib/Makefile|2 +
 lib/spinlock_refcount.c |  218 +++
 5 files changed, 423 insertions(+), 0 deletions(-)
 create mode 100644 include/asm-generic/spinlock_refcount.h
 create mode 100644 include/linux/spinlock_refcount.h
 create mode 100644 lib/spinlock_refcount.c

diff --git a/include/asm-generic/spinlock_refcount.h 
b/include/asm-generic/spinlock_refcount.h
new file mode 100644
index 000..d3a4119
--- /dev/null
+++ b/include/asm-generic/spinlock_refcount.h
@@ -0,0 +1,46 @@
+/*
+ * Spinlock with reference count combo
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * (c) Copyright 2013 Hewlett-Packard Development Company, L.P.
+ *
+ * Authors: Waiman Long 
+ */
+#ifndef __ASM_GENERIC_SPINLOCK_REFCOUNT_H
+#define __ASM_GENERIC_SPINLOCK_REFCOUNT_H
+
+/*
+ * The lockref structure defines a combined spinlock with reference count
+ * data structure to be embedded in a larger structure. The combined data
+ * structure is always 8-byte aligned. So proper placement of this structure
+ * in the larger embedding data structure is needed to ensure that there is
+ * no hole in it.
+ */
+struct __aligned(sizeof(u64)) lockref {
+   union {
+   u64 lock_count;
+   struct {
+   unsigned intrefcnt; /* Reference count */
+   spinlock_t  lock;
+   };
+   };
+};
+
+/*
+ * Struct lockref helper functions
+ */
+extern void lockref_get(struct lockref *lockcnt);
+extern int  lockref_put(struct lockref *lockcnt);
+extern int  lockref_get_not_zero(struct lockref *lockcnt);
+extern int  lockref_put_or_lock(struct lockref *lockcnt);
+
+#endif /* __ASM_GENERIC_SPINLOCK_REFCOUNT_H */
diff --git a/include/linux/spinlock_refcount.h 
b/include/linux/spinlock_refcount.h
new file mode 100644
index 000..32389a9
--- /dev/null
+++ b/include/linux/spinlock_refcount.h
@@ -0,0 +1,142 @@
+/*
+ * Spinlock with reference count combo data structure
+ *
+ * This program is free software; you can redistribute it and/or mod

[PATCH v6 14/14] dcache: Enable lockless update of refcount in dentry structure

2013-07-08 Thread Waiman Long
e are some abnormalities in the original 3.10 2-node data. Ignoring
that, the performance difference for the other node counts, if any,
is insignificant.

A perf call-graph report of the short workload at 1500 users
without the patch on the same 8-node machine indicates that about
78% of the workload's total time were spent in the _raw_spin_lock()
function. Almost all of which can be attributed to the following 2
kernel functions:
 1. dget_parent (49.91%)
 2. dput (49.89%)

The relevant perf report lines are:
+  78.37%reaim  [kernel.kallsyms] [k] _raw_spin_lock
+   0.09%reaim  [kernel.kallsyms] [k] dput
+   0.05%reaim  [kernel.kallsyms] [k] _raw_spin_lock_irq
+   0.00%reaim  [kernel.kallsyms] [k] dget_parent

With this patch installed, the new perf report lines are:
+  19.65%reaim  [kernel.kallsyms] [k] _raw_spin_lock_irqsave
+   3.94%reaim  [kernel.kallsyms] [k] _raw_spin_lock
+   2.47%reaim  [kernel.kallsyms] [k] lockref_get_not_zero
+   0.62%reaim  [kernel.kallsyms] [k] lockref_put_or_locked
+   0.36%reaim  [kernel.kallsyms] [k] dput
+   0.31%reaim  [kernel.kallsyms] [k] lockref_get
+   0.02%reaim  [kernel.kallsyms] [k] dget_parent

-   3.94%reaim  [kernel.kallsyms] [k] _raw_spin_lock
   - _raw_spin_lock
  + 32.86% SyS_getcwd
  + 31.99% d_path
  + 4.81% prepend_path
  + 4.14% __rcu_process_callbacks
  + 3.73% complete_walk
  + 2.31% dget_parent
  + 1.99% unlazy_walk
  + 1.44% do_anonymous_page
  + 1.22% lockref_put_or_locked
  + 1.16% sem_lock
  + 0.95% task_rq_lock
  + 0.89% selinux_inode_free_security
  + 0.89% process_backlog
  + 0.79% enqueue_to_backlog
  + 0.72% unix_dgram_sendmsg
  + 0.69% unix_stream_sendmsg

The lockref_put_or_locked used up only 1.22% of the _raw_spin_lock
time while dget_parent used only 2.31%.

The impact of this patch on other AIM7 workloads was much more
modest.  The table below shows the mean % change due to this patch on
the same 8-socket system with a 3.10 kernel.

+--+---++-+
|   Workload   | mean % change | mean % change  | mean % change   |
|  | 10-100 users  | 200-1000 users | 1100-2000 users |
+--+---++-+
| alltests | -0.2% | +0.5%  | -0.3%   |
| five_sec | +2.5% | -4.2%  | -4.7%   |
| fserver  | +1.7% | +1.6%  | +0.3%   |
| high_systime | +0.1% | +1.4%  | +5.5%   |
| new_fserver  | +0.4% | +1.2%  | +0.3%   |
| shared   | +0.8% | -0.3%  |  0.0%   |
+--+---++-+

There are slight drops in performance for the five_sec workload,
but slight increase in the high_systime workload.

The checkpatch.pl script reported errors in the d_lock and d_refcount
macros as it wanted parentheses around the actual names. That won't
work for those macros, so the errors should be ignored.

Signed-off-by: Waiman Long 
---
 fs/dcache.c|   18 --
 include/linux/dcache.h |   17 ++---
 2 files changed, 22 insertions(+), 13 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index 20def64..095ee18 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -515,7 +515,9 @@ void dput(struct dentry *dentry)
 repeat:
if (dentry->d_refcount == 1)
might_sleep();
-   spin_lock(&dentry->d_lock);
+   if (lockref_put_or_lock(&dentry->d_lockcnt))
+   return;
+   /* dentry's lock taken */
BUG_ON(!dentry->d_refcount);
if (dentry->d_refcount > 1) {
dentry->d_refcount--;
@@ -611,26 +613,30 @@ static inline void __dget_dlock(struct dentry *dentry)
 
 static inline void __dget(struct dentry *dentry)
 {
-   spin_lock(&dentry->d_lock);
-   __dget_dlock(dentry);
-   spin_unlock(&dentry->d_lock);
+   lockref_get(&dentry->d_lockcnt);
 }
 
 struct dentry *dget_parent(struct dentry *dentry)
 {
struct dentry *ret;
 
+   rcu_read_lock();
+   ret = rcu_dereference(dentry->d_parent);
+   if (lockref_get_not_zero(&ret->d_lockcnt)) {
+   rcu_read_unlock();
+   return ret;
+   }
 repeat:
/*
 * Don't need rcu_dereference because we re-check it was correct under
 * the lock.
 */
-   rcu_read_lock();
-   ret = dentry->d_parent;
+   ret = ACCESS_ONCE(dentry->d_parent);
spin_lock(&ret->d_lock);
if (unlikely(ret != dentry->d_parent)) {
spin_unlock(&ret->d_lock);
rcu_read_unlock();
+   rcu_read_lock();
goto repeat;
}
rcu_read_unlock(

[PATCH v6 12/14] lustre-fs: Use the standard d_count() helper to access refcount

2013-07-08 Thread Waiman Long
The Lustre FS should use the newly defined d_count() helper function
to access the dentry's reference count instead of defining its own
d_refcount() macro for the same purpose. Since the current lustre
code is marked as broken, no build test was attempted for this change.

Signed-off-by: Waiman Long 
---
 .../lustre/include/linux/lustre_patchless_compat.h |2 --
 drivers/staging/lustre/lustre/include/linux/lvfs.h |2 +-
 drivers/staging/lustre/lustre/llite/dcache.c   |8 
 .../staging/lustre/lustre/llite/llite_internal.h   |4 ++--
 drivers/staging/lustre/lustre/llite/llite_lib.c|2 +-
 drivers/staging/lustre/lustre/llite/namei.c|4 ++--
 drivers/staging/lustre/lustre/lvfs/lvfs_linux.c|4 ++--
 7 files changed, 12 insertions(+), 14 deletions(-)

diff --git 
a/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h 
b/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h
index f050808..a8e9c0c 100644
--- a/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h
+++ b/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h
@@ -60,8 +60,6 @@ truncate_complete_page(struct address_space *mapping, struct 
page *page)
ll_delete_from_page_cache(page);
 }
 
-#  define d_refcount(d) ((d)->d_count)
-
 #ifdef ATTR_OPEN
 # define ATTR_FROM_OPEN ATTR_OPEN
 #else
diff --git a/drivers/staging/lustre/lustre/include/linux/lvfs.h 
b/drivers/staging/lustre/lustre/include/linux/lvfs.h
index b4db6cb..eb59ac7 100644
--- a/drivers/staging/lustre/lustre/include/linux/lvfs.h
+++ b/drivers/staging/lustre/lustre/include/linux/lvfs.h
@@ -99,7 +99,7 @@ static inline void l_dput(struct dentry *de)
if (!de || IS_ERR(de))
return;
//shrink_dcache_parent(de);
-   LASSERT(d_refcount(de) > 0);
+   LASSERT(d_count(de) > 0);
dput(de);
 }
 
diff --git a/drivers/staging/lustre/lustre/llite/dcache.c 
b/drivers/staging/lustre/lustre/llite/dcache.c
index 7d6abff..ff0d085 100644
--- a/drivers/staging/lustre/lustre/llite/dcache.c
+++ b/drivers/staging/lustre/lustre/llite/dcache.c
@@ -98,7 +98,7 @@ int ll_dcompare(const struct dentry *parent, const struct 
inode *pinode,
 
CDEBUG(D_DENTRY, "found name %.*s(%p) flags %#x refc %d\n",
   name->len, name->name, dentry, dentry->d_flags,
-  d_refcount(dentry));
+  d_count(dentry));
 
/* mountpoint is always valid */
if (d_mountpoint((struct dentry *)dentry))
@@ -165,7 +165,7 @@ static int ll_ddelete(const struct dentry *de)
   list_empty(&de->d_subdirs) ? "" : "subdirs");
 
/* kernel >= 2.6.38 last refcount is decreased after this function. */
-   LASSERT(d_refcount(de) == 1);
+   LASSERT(d_count(de) == 1);
 
/* Disable this piece of code temproarily because this is called
 * inside dcache_lock so it's not appropriate to do lots of work
@@ -190,7 +190,7 @@ static int ll_set_dd(struct dentry *de)
 
CDEBUG(D_DENTRY, "ldd on dentry %.*s (%p) parent %p inode %p refc %d\n",
de->d_name.len, de->d_name.name, de, de->d_parent, de->d_inode,
-   d_refcount(de));
+   d_count(de));
 
if (de->d_fsdata == NULL) {
struct ll_dentry_data *lld;
@@ -540,7 +540,7 @@ out:
CDEBUG(D_DENTRY, "revalidated dentry %.*s (%p) parent %p "
   "inode %p refc %d\n", de->d_name.len,
   de->d_name.name, de, de->d_parent, de->d_inode,
-  d_refcount(de));
+  d_count(de));
 
ll_set_lock_data(exp, de->d_inode, it, &bits);
 
diff --git a/drivers/staging/lustre/lustre/llite/llite_internal.h 
b/drivers/staging/lustre/lustre/llite/llite_internal.h
index 992cd20..5227c5c 100644
--- a/drivers/staging/lustre/lustre/llite/llite_internal.h
+++ b/drivers/staging/lustre/lustre/llite/llite_internal.h
@@ -1529,12 +1529,12 @@ static inline void d_lustre_invalidate(struct dentry 
*dentry, int nested)
 {
CDEBUG(D_DENTRY, "invalidate dentry %.*s (%p) parent %p inode %p "
   "refc %d\n", dentry->d_name.len, dentry->d_name.name, dentry,
-  dentry->d_parent, dentry->d_inode, d_refcount(dentry));
+  dentry->d_parent, dentry->d_inode, d_count(dentry));
 
spin_lock_nested(&dentry->d_lock,
 nested ? DENTRY_D_LOCK_NESTED : DENTRY_D_LOCK_NORMAL);
__d_lustre_invalidate(dentry);
-   if (d_refcount(dentry) == 0)
+   if (d_count(dentry) == 0)
__d_drop(dentry);
spin_unlock(&dentry->d_lock);
 }
diff --git a/drivers/staging/lustre/lustre/llite/llite_lib.c 
b/drivers/staging/lustre/lustre/llite/llite_lib.c
index 2311b20..afae

[PATCH v6 13/14] dcache: rename d_count field of dentry to d_refcount

2013-07-08 Thread Waiman Long
Before converting the d_lock and d_count fields of the dentry data
structure to the new lockref structure, we need to consider the
implications of such a change. All current references to d_count and
d_lock have to be updated accordingly.

One way to minimize the churn is to redefine the original field
names as macros for the new names.  For d_lock this works and saves a
lot of changes, as the name is not used anywhere else in the kernel.
For d_count it does not, because the name is already used elsewhere
in the kernel for other purposes.

The dcache.c and namei.c files modify the reference count value
directly.  They are therefore changed to use a new reference count
name, "d_refcount", which is unique in the kernel source code.

Signed-off-by: Waiman Long 
---
 fs/dcache.c|   54 
 fs/namei.c |6 ++--
 include/linux/dcache.h |8 +++---
 3 files changed, 34 insertions(+), 34 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index 87bdb53..20def64 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -54,7 +54,7 @@
  *   - d_flags
  *   - d_name
  *   - d_lru
- *   - d_count
+ *   - d_refcount
  *   - d_unhashed()
  *   - d_parent and d_subdirs
  *   - childrens' d_child and d_parent
@@ -229,7 +229,7 @@ static void __d_free(struct rcu_head *head)
  */
 static void d_free(struct dentry *dentry)
 {
-   BUG_ON(dentry->d_count);
+   BUG_ON(dentry->d_refcount);
this_cpu_dec(nr_dentry);
if (dentry->d_op && dentry->d_op->d_release)
dentry->d_op->d_release(dentry);
@@ -467,7 +467,7 @@ relock:
}
 
if (ref)
-   dentry->d_count--;
+   dentry->d_refcount--;
/*
 * inform the fs via d_prune that this dentry is about to be
 * unhashed and destroyed.
@@ -513,12 +513,12 @@ void dput(struct dentry *dentry)
return;
 
 repeat:
-   if (dentry->d_count == 1)
+   if (dentry->d_refcount == 1)
might_sleep();
spin_lock(&dentry->d_lock);
-   BUG_ON(!dentry->d_count);
-   if (dentry->d_count > 1) {
-   dentry->d_count--;
+   BUG_ON(!dentry->d_refcount);
+   if (dentry->d_refcount > 1) {
+   dentry->d_refcount--;
spin_unlock(&dentry->d_lock);
return;
}
@@ -535,7 +535,7 @@ repeat:
dentry->d_flags |= DCACHE_REFERENCED;
dentry_lru_add(dentry);
 
-   dentry->d_count--;
+   dentry->d_refcount--;
spin_unlock(&dentry->d_lock);
return;
 
@@ -590,7 +590,7 @@ int d_invalidate(struct dentry * dentry)
 * We also need to leave mountpoints alone,
 * directory or not.
 */
-   if (dentry->d_count > 1 && dentry->d_inode) {
+   if (dentry->d_refcount > 1 && dentry->d_inode) {
if (S_ISDIR(dentry->d_inode->i_mode) || d_mountpoint(dentry)) {
spin_unlock(&dentry->d_lock);
return -EBUSY;
@@ -606,7 +606,7 @@ EXPORT_SYMBOL(d_invalidate);
 /* This must be called with d_lock held */
 static inline void __dget_dlock(struct dentry *dentry)
 {
-   dentry->d_count++;
+   dentry->d_refcount++;
 }
 
 static inline void __dget(struct dentry *dentry)
@@ -634,8 +634,8 @@ repeat:
goto repeat;
}
rcu_read_unlock();
-   BUG_ON(!ret->d_count);
-   ret->d_count++;
+   BUG_ON(!ret->d_refcount);
+   ret->d_refcount++;
spin_unlock(&ret->d_lock);
return ret;
 }
@@ -718,7 +718,7 @@ restart:
spin_lock(&inode->i_lock);
hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) {
spin_lock(&dentry->d_lock);
-   if (!dentry->d_count) {
+   if (!dentry->d_refcount) {
__dget_dlock(dentry);
__d_drop(dentry);
spin_unlock(&dentry->d_lock);
@@ -734,7 +734,7 @@ EXPORT_SYMBOL(d_prune_aliases);
 
 /*
  * Try to throw away a dentry - free the inode, dput the parent.
- * Requires dentry->d_lock is held, and dentry->d_count == 0.
+ * Requires dentry->d_lock is held, and dentry->d_refcount == 0.
  * Releases dentry->d_lock.
  *
  * This may fail if locks cannot be acquired no problem, just try again.
@@ -764,8 +764,8 @@ static void try_prune_one_dentry(struct dentry *dentry)
dentry = parent;
while (dentry) {
spin_lock(&dentry->d_lock);
-   if (dentry->d_count > 1) {
-   dentry->d_count--;
+   if (dentry->d_refcount > 1) {
+   dentry->d_refcount--;
spin_unlock(&dentry->

[PATCH v6 10/14] nfs: replace direct access of d_count with the d_count() helper

2013-07-08 Thread Waiman Long
All readonly references to d_count outside of the core dcache code
should be changed to use the new d_count() helper as they shouldn't
access its value directly.  There is no change in logic and everything
should just work.

Signed-off-by: Waiman Long 
---
 fs/nfs/dir.c|6 +++---
 fs/nfs/unlink.c |2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index d7ed697..c933bdf 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1721,7 +1721,7 @@ int nfs_unlink(struct inode *dir, struct dentry *dentry)
dir->i_ino, dentry->d_name.name);
 
spin_lock(&dentry->d_lock);
-   if (dentry->d_count > 1) {
+   if (d_count(dentry) > 1) {
spin_unlock(&dentry->d_lock);
/* Start asynchronous writeout of the inode */
write_inode_now(dentry->d_inode, 0);
@@ -1866,7 +1866,7 @@ int nfs_rename(struct inode *old_dir, struct dentry 
*old_dentry,
dfprintk(VFS, "NFS: rename(%s/%s -> %s/%s, ct=%d)\n",
 old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
 new_dentry->d_parent->d_name.name, new_dentry->d_name.name,
-new_dentry->d_count);
+d_count(new_dentry));
 
/*
 * For non-directories, check whether the target is busy and if so,
@@ -1884,7 +1884,7 @@ int nfs_rename(struct inode *old_dir, struct dentry 
*old_dentry,
rehash = new_dentry;
}
 
-   if (new_dentry->d_count > 2) {
+   if (d_count(new_dentry) > 2) {
int err;
 
/* copy the target dentry's name */
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 1f1f38f..60395ad 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -479,7 +479,7 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry)
 
dfprintk(VFS, "NFS: silly-rename(%s/%s, ct=%d)\n",
dentry->d_parent->d_name.name, dentry->d_name.name,
-   dentry->d_count);
+   d_count(dentry));
nfs_inc_stats(dir, NFSIOS_SILLYRENAME);
 
/*
-- 
1.7.1



[PATCH v6 09/14] file locking: replace direct access of d_count with the d_count() helper

2013-07-08 Thread Waiman Long
All readonly references to d_count outside of the core dcache code
should be changed to use the new d_count() helper as they shouldn't
access its value directly.  There is no change in logic and everything
should just work.

Signed-off-by: Waiman Long 
---
 fs/locks.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/fs/locks.c b/fs/locks.c
index 04e2c1f..c98e1a1 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1454,7 +1454,7 @@ static int generic_add_lease(struct file *filp, long arg, 
struct file_lock **flp
if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0))
goto out;
if ((arg == F_WRLCK)
-   && ((dentry->d_count > 1)
+   && ((d_count(dentry) > 1)
|| (atomic_read(&inode->i_count) > 1)))
goto out;
 
-- 
1.7.1



[PATCH v6 03/14] dcache: Add a new helper function d_count() to return refcount

2013-07-08 Thread Waiman Long
This patch adds a new helper function d_count() in dcache.h for
returning the current reference count of the dentry object. It
should be used by all the files outside of the core dcache.c and
namei.c files.

Signed-off-by: Waiman Long 
---
 include/linux/dcache.h |   10 ++
 1 files changed, 10 insertions(+), 0 deletions(-)

diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index f42dbe1..7c6bbf0 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -258,6 +258,16 @@ extern int have_submounts(struct dentry *);
 extern void d_rehash(struct dentry *);
 
 /**
+ * d_count - return the reference count in dentry
+ * @entry: dentry pointer
+ * Returns: current value of reference count
+ */
+static inline unsigned int d_count(struct dentry *entry)
+{
+   return entry->d_count;
+}
+
+/**
  * d_add - add dentry to hash queues
  * @entry: dentry to add
  * @inode: The inode to attach to this dentry
-- 
1.7.1



[PATCH v6 08/14] ecrypt-fs: replace direct access of d_count with the d_count() helper

2013-07-08 Thread Waiman Long
All readonly references to d_count outside of the core dcache code
should be changed to use the new d_count() helper as they shouldn't
access its value directly.  There is no change in logic and everything
should just work.

Signed-off-by: Waiman Long 
---
 fs/ecryptfs/inode.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index a2f2bb2..67e9b63 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -358,7 +358,7 @@ static int ecryptfs_lookup_interpose(struct dentry *dentry,
 
lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent));
fsstack_copy_attr_atime(dir_inode, lower_dentry->d_parent->d_inode);
-   BUG_ON(!lower_dentry->d_count);
+   BUG_ON(!d_count(lower_dentry));
 
ecryptfs_set_dentry_private(dentry, dentry_info);
ecryptfs_set_dentry_lower(dentry, lower_dentry);
-- 
1.7.1



[PATCH v6 06/14] coda-fs: replace direct access of d_count with the d_count() helper

2013-07-08 Thread Waiman Long
All readonly references to d_count outside of the core dcache code
should be changed to use the new d_count() helper as they shouldn't
access its value directly.  There is no change in logic and everything
should just work.

Signed-off-by: Waiman Long 
---
 fs/coda/dir.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 14a1480..190effc 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -526,7 +526,7 @@ static int coda_dentry_revalidate(struct dentry *de, 
unsigned int flags)
if (cii->c_flags & C_FLUSH) 
coda_flag_inode_children(inode, C_FLUSH);
 
-   if (de->d_count > 1)
+   if (d_count(de) > 1)
/* pretend it's valid, but don't change the flags */
goto out;
 
-- 
1.7.1



[PATCH v6 05/14] ceph-fs: replace direct access of d_count with the d_count() helper

2013-07-08 Thread Waiman Long
All readonly references to d_count outside of the core dcache code
should be changed to use the new d_count() helper as they shouldn't
access its value directly.  There is no change in logic and everything
should just work.

Signed-off-by: Waiman Long 
---
 fs/ceph/inode.c  |4 ++--
 fs/ceph/mds_client.c |2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index be0f7e2..bd2289a 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -903,8 +903,8 @@ static struct dentry *splice_dentry(struct dentry *dn, 
struct inode *in,
} else if (realdn) {
dout("dn %p (%d) spliced with %p (%d) "
 "inode %p ino %llx.%llx\n",
-dn, dn->d_count,
-realdn, realdn->d_count,
+dn, d_count(dn),
+realdn, d_count(realdn),
 realdn->d_inode, ceph_vinop(realdn->d_inode));
dput(dn);
dn = realdn;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 74fd289..99890b0 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1553,7 +1553,7 @@ retry:
*base = ceph_ino(temp->d_inode);
*plen = len;
dout("build_path on %p %d built %llx '%.*s'\n",
-dentry, dentry->d_count, *base, len, path);
+dentry, d_count(dentry), *base, len, path);
return path;
 }
 
-- 
1.7.1



[PATCH v6 00/14] Lockless update of reference count protected by spinlock

2013-07-08 Thread Waiman Long
v5->v6:
 - Add a new GENERIC_SPINLOCK_REFCOUNT config parameter for using the
   generic implementation.
 - Add two parameters LOCKREF_WAIT_SHIFT and LOCKREF_RETRY_COUNT which
   can be specified differently for each architecture.
 - Update various spinlock_refcount.* files to incorporate review
   comments.
 - Replace reference of d_refcount() macro in Lustre filesystem code in
   the staging tree to use the new d_count() helper function.

v4->v5:
 - Add a d_count() helper for readonly access of reference count and
   change all references to d_count outside of dcache.c, dcache.h
   and namei.c to use d_count().

v3->v4:
 - Replace helper function access to d_lock and d_count by using
   macros to redefine the old d_lock name to the spinlock and new
   d_refcount name to the reference count. This greatly reduces the
   size of this patchset from 25 to 12 and make it easier to review.

v2->v3:
 - Completely revamp the packaging by adding a new lockref data
   structure that combines the spinlock with the reference
   count. Helper functions are also added to manipulate the new data
   structure. That results in modifying over 50 files, but the changes
   were trivial in most of them.
 - Change initial spinlock wait to use a timeout.
 - Force 64-bit alignment of the spinlock & reference count structure.
 - Add a new way to use the combo by using a new union and helper
   functions.

v1->v2:
 - Add one more layer of indirection to LOCK_WITH_REFCOUNT macro.
 - Add __LINUX_SPINLOCK_REFCOUNT_H protection to spinlock_refcount.h.
 - Add some generic get/put macros into spinlock_refcount.h.

This patchset supports a generic mechanism to atomically update
a reference count that is protected by a spinlock without actually
acquiring the lock itself. If the update doesn't succeed, the caller
will have to acquire the lock and update the reference count in
the old way.  This will help in situations where there is a lot of
spinlock contention because of frequent reference count updates.
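
Roughly, the idea is the following (a simplified sketch with made-up
type and member names, not the actual code in patch 1):

/*
 * The spinlock and the reference count share one 64-bit word, so the
 * count can be bumped with a single cmpxchg() as long as the lock is
 * observed to be free.  If that fails, fall back to the old way.
 */
struct lockref_sketch {
        union {
                u64 lock_count;                 /* lock + count as one word */
                struct {
                        spinlock_t      lock;
                        unsigned int    count;
                };
        };
};

static void lockref_get_sketch(struct lockref_sketch *lr)
{
        struct lockref_sketch old, new;

        old.lock_count = ACCESS_ONCE(lr->lock_count);
        while (arch_spin_value_unlocked(old.lock.rlock.raw_lock)) {
                new = old;
                new.count++;
                if (cmpxchg(&lr->lock_count, old.lock_count,
                            new.lock_count) == old.lock_count)
                        return;         /* lockless update succeeded */
                old.lock_count = ACCESS_ONCE(lr->lock_count);
        }

        spin_lock(&lr->lock);           /* slow path: someone holds the lock */
        lr->count++;
        spin_unlock(&lr->lock);
}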

The d_lock and d_count fields of the struct dentry in dcache.h were
modified to use the new lockref data structure and the d_lock name
is now a macro for the actual spinlock. The d_count name, however,
cannot be reused as it collides with other uses elsewhere in the
kernel. So a new d_refcount macro is now used for the reference count,
and files outside of dcache.c, dcache.h and namei.c are modified to
use the d_count() helper function.

The various patches were applied and built one-by-one to make sure
that none of them broke the build.

This patch set gives a significant performance improvement in the
short workload of the AIM7 benchmark on an 8-socket x86-64 machine
with 80 cores.

patch 1:        Introduce the new lockref data structure
patch 2:        Enable x86 architecture to use the feature
patch 3:        Introduce the new d_count() helper function
patches 4-11:   Rename all d_count references to the d_count() helper
patch 12:       Replace the d_refcount() macro with the d_count() helper
patch 13:       Rename the d_count field to d_refcount
patch 14:       Change the dentry structure to use the lockref
                structure to improve performance for high contention
                cases

Thanks to Thomas Gleixner, Andi Kleen and Linus for their valuable
input in shaping this patchset.

Signed-off-by: Waiman Long 

Waiman Long (14):
  spinlock: A new lockref structure for lockless update of refcount
  spinlock: Enable x86 architecture to do lockless refcount update
  dcache: Add a new helper function d_count() to return refcount
  auto-fs: replace direct access of d_count with the d_count() helper
  ceph-fs: replace direct access of d_count with the d_count() helper
  coda-fs: replace direct access of d_count with the d_count() helper
  config-fs: replace direct access of d_count with the d_count() helper
  ecrypt-fs: replace direct access of d_count with the d_count() helper
  file locking: replace direct access of d_count with the d_count()
helper
  nfs: replace direct access of d_count with the d_count() helper
  nilfs2: replace direct access of d_count with the d_count() helper
  lustre-fs: Use the standard d_count() helper to access refcount
  dcache: rename d_count field of dentry to d_refcount
  dcache: Enable lockless update of refcount in dentry structure

 arch/x86/Kconfig   |3 +
 .../lustre/include/linux/lustre_patchless_compat.h |2 -
 drivers/staging/lustre/lustre/include/linux/lvfs.h |2 +-
 drivers/staging/lustre/lustre/llite/dcache.c   |8 +-
 .../staging/lustre/lustre/llite/llite_internal.h   |4 +-
 drivers/staging/lustre/lustre/llite/llite_lib.c|2 +-
 drivers/staging/lustre/lustre/llite/namei.c|4 +-
 drivers/staging/lustre/lustre/lvfs/lvfs_linux.c|4 +-
 fs/autofs4/expire.c|8 +-
 fs/autofs4/root.c  |2 +-
 fs/ceph/inode.c  

[PATCH v6 07/14] config-fs: replace direct access of d_count with the d_count() helper

2013-07-08 Thread Waiman Long
All readonly references to d_count outside of the core dcache code
should be changed to use the new d_count() helper as they shouldn't
access its value directly.  There is no change in logic and everything
should just work.

Signed-off-by: Waiman Long 
---
 fs/configfs/dir.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 64e5323..1d75ec5 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -387,7 +387,7 @@ static void remove_dir(struct dentry * d)
if (d->d_inode)
simple_rmdir(parent->d_inode,d);
 
-   pr_debug(" o %s removing done (%d)\n",d->d_name.name, d->d_count);
+   pr_debug(" o %s removing done (%d)\n", d->d_name.name, d_count(d));
 
dput(parent);
 }
-- 
1.7.1



[PATCH v6 04/14] auto-fs: replace direct access of d_count with the d_count() helper

2013-07-08 Thread Waiman Long
All readonly references to d_count outside of the core dcache code
should be changed to use the new d_count() helper as they shouldn't
access its value directly.  There is no change in logic and everything
should just work.

Signed-off-by: Waiman Long 
---
 fs/autofs4/expire.c |8 
 fs/autofs4/root.c   |2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 13ddec9..aac0006 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -109,7 +109,7 @@ cont:
 
spin_lock_nested(&q->d_lock, DENTRY_D_LOCK_NESTED);
/* Already gone or negative dentry (under construction) - try next */
-   if (q->d_count == 0 || !simple_positive(q)) {
+   if (d_count(q) == 0 || !simple_positive(q)) {
spin_unlock(&q->d_lock);
next = q->d_u.d_child.next;
goto cont;
@@ -267,7 +267,7 @@ static int autofs4_tree_busy(struct vfsmount *mnt,
else
ino_count++;
 
-   if (p->d_count > ino_count) {
+   if (d_count(p) > ino_count) {
top_ino->last_used = jiffies;
dput(p);
return 1;
@@ -409,7 +409,7 @@ struct dentry *autofs4_expire_indirect(struct super_block 
*sb,
if (!exp_leaves) {
/* Path walk currently on this dentry? */
ino_count = atomic_read(&ino->count) + 1;
-   if (dentry->d_count > ino_count)
+   if (d_count(dentry) > ino_count)
goto next;
 
if (!autofs4_tree_busy(mnt, dentry, timeout, do_now)) {
@@ -423,7 +423,7 @@ struct dentry *autofs4_expire_indirect(struct super_block 
*sb,
} else {
/* Path walk currently on this dentry? */
ino_count = atomic_read(&ino->count) + 1;
-   if (dentry->d_count > ino_count)
+   if (d_count(dentry) > ino_count)
goto next;
 
expired = autofs4_check_leaves(mnt, dentry, timeout, 
do_now);
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index ca8e555..78f9b0a 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -179,7 +179,7 @@ static struct dentry *autofs4_lookup_active(struct dentry 
*dentry)
spin_lock(&active->d_lock);
 
/* Already gone? */
-   if (active->d_count == 0)
+   if (d_count(active) == 0)
goto next;
 
qstr = &active->d_name;
-- 
1.7.1



Re: [PATCH v6 03/14] dcache: Add a new helper function d_count() to return refcount

2013-07-11 Thread Waiman Long

On 07/08/2013 09:09 PM, Waiman Long wrote:

This patch adds a new helper function d_count() in dcache.h for
returning the current reference count of the dentry object. It
should be used by all the files outside of the core dcache.c and
namei.c files.


I want to know people's thoughts on spinning out patches 3-11 of this 
patch series, as making the d_count() helper the standard way of 
accessing the reference count in dentry outside of dcache.c and namei.c 
should be non-controversial. Merging them first will make revising 
this patch series easier and involve fewer people.


Regards,
Longman


[PATCH RFC 2/2] x86 qrwlock: Enable x86 to use queue read/write lock

2013-07-12 Thread Waiman Long
This patch makes the necessary changes at the x86 architecture specific
layer to enable the presence of the CONFIG_QUEUE_RWLOCK kernel option
to replace the plain read/write lock by the queue read/write lock.

Signed-off-by: Waiman Long 
---
 arch/x86/Kconfig  |3 +++
 arch/x86/include/asm/spinlock.h   |2 ++
 arch/x86/include/asm/spinlock_types.h |4 
 3 files changed, 9 insertions(+), 0 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index b32ebf9..638dbaa 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2344,6 +2344,9 @@ config X86_DMA_REMAP
bool
depends on STA2X11
 
+config ARCH_QUEUE_RWLOCK
+   def_bool y
+
 source "net/Kconfig"
 
 source "drivers/Kconfig"
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index 33692ea..613a4ff 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -137,6 +137,7 @@ static inline void arch_spin_unlock_wait(arch_spinlock_t 
*lock)
cpu_relax();
 }
 
+#ifndef CONFIG_QUEUE_RWLOCK
 /*
  * Read-write spinlocks, allowing multiple readers
  * but only one writer.
@@ -219,6 +220,7 @@ static inline void arch_write_unlock(arch_rwlock_t *rw)
asm volatile(LOCK_PREFIX WRITE_LOCK_ADD(%1) "%0"
 : "+m" (rw->write) : "i" (RW_LOCK_BIAS) : "memory");
 }
+#endif /* CONFIG_QUEUE_RWLOCK */
 
 #define arch_read_lock_flags(lock, flags) arch_read_lock(lock)
 #define arch_write_lock_flags(lock, flags) arch_write_lock(lock)
diff --git a/arch/x86/include/asm/spinlock_types.h 
b/arch/x86/include/asm/spinlock_types.h
index ad0ad07..afacd36 100644
--- a/arch/x86/include/asm/spinlock_types.h
+++ b/arch/x86/include/asm/spinlock_types.h
@@ -28,6 +28,10 @@ typedef struct arch_spinlock {
 
 #define __ARCH_SPIN_LOCK_UNLOCKED  { { 0 } }
 
+#ifdef CONFIG_QUEUE_RWLOCK
+#include 
+#else
 #include 
+#endif
 
 #endif /* _ASM_X86_SPINLOCK_TYPES_H */
-- 
1.7.1



[PATCH RFC 0/2] qrwlock: Introducing a queue read/write lock implementation

2013-07-12 Thread Waiman Long
This patch set introduces a queue-based read/write lock implementation
that is both faster and fairer than the current read/write lock. It can
also be used as a replacement for highly contended ticket spinlocks if
the lock size increase is not an issue.

There is no change in the interface. By just replacing the current
read/write lock with the queue read/write lock, we can have a faster
and more deterministic system.

Signed-off-by: Waiman Long 

Waiman Long (2):
  qrwlock: A queue read/write lock implementation
  x86 qrwlock: Enable x86 to use queue read/write lock

 arch/x86/Kconfig  |3 +
 arch/x86/include/asm/spinlock.h   |2 +
 arch/x86/include/asm/spinlock_types.h |4 +
 include/asm-generic/qrwlock.h |  124 +
 lib/Kconfig   |   11 ++
 lib/Makefile  |1 +
 lib/qrwlock.c |  246 +
 7 files changed, 391 insertions(+), 0 deletions(-)
 create mode 100644 include/asm-generic/qrwlock.h
 create mode 100644 lib/qrwlock.c



[PATCH RFC 1/2] qrwlock: A queue read/write lock implementation

2013-07-12 Thread Waiman Long
This patch introduces a read/write lock implementation that puts waiting
readers and writers into a queue instead of actively contending for the
lock like the regular read/write lock. This improves performance in
highly contended situations by reducing the cache line bouncing effect.

In addition, the queue read/write lock is more deterministic, even
though there is still a small chance of lock stealing if a reader
or writer arrives at just the right moment. Other than that, lock granting
is done in a FIFO manner. The only downside is the size increase in
the lock structure by 4 bytes for 32-bit systems and by 12 bytes for
64-bit systems.
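
A much-simplified sketch of the queuing idea (the real implementation
packs the reader count and writer state more carefully and has separate
fast and slow paths):

/* Waiters serialize on an internal queue lock so that lock grants
 * become roughly FIFO; readers and the writer state share one atomic. */
struct qrwlock_sketch {
        atomic_t        cnts;           /* reader count + writer-present bit */
        arch_spinlock_t wait_lock;      /* queues the waiters */
};

#define _QW_LOCKED      0x01            /* low bit: a writer holds the lock */
#define _QR_BIAS        0x100           /* each reader adds this much */

static void queue_read_lock_sketch(struct qrwlock_sketch *lock)
{
        if (likely(!(atomic_add_return(_QR_BIAS, &lock->cnts) & _QW_LOCKED)))
                return;                         /* no writer: fast path */

        atomic_sub(_QR_BIAS, &lock->cnts);
        arch_spin_lock(&lock->wait_lock);       /* wait in FIFO order */
        atomic_add(_QR_BIAS, &lock->cnts);
        while (atomic_read(&lock->cnts) & _QW_LOCKED)
                cpu_relax();
        arch_spin_unlock(&lock->wait_lock);     /* let the next waiter in */
}

static void queue_read_unlock_sketch(struct qrwlock_sketch *lock)
{
        atomic_sub(_QR_BIAS, &lock->cnts);
}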

This patch allows the replacement of architecture-specific
implementations of the read/write lock by this generic version of the
queue read/write lock. Two new config parameters are introduced:

1. QUEUE_RWLOCK
   A selectable option that enables the building and replacement of
   the architecture-specific read/write lock.
2. ARCH_QUEUE_RWLOCK
   Has to be defined in arch/$(arch)/Kconfig to enable QUEUE_RWLOCK

In terms of single-thread performance (no contention), a 256K
lock/unlock loop was run on a 2.4GHz Westmere x86-64 CPU. The following
table shows the average time for a single lock/unlock sequence:

Lock Type   Time (ns)
-   -
Ticket spinlock   15.7
Read lock 17.0
Write lock17.2
Queue read lock   31.1
Queue write lock  13.6

While the queue read lock is almost double the time of a read lock
or spinlock, the queue write lock is the fastest of them all. The
execution time can probably be reduced a bit by allowing inlining of
the lock fast paths like the other locks.

To see how the queue write lock can be used as a replacement for the
ticket spinlock (just like rwsem can be used as a replacement for mutex),
the mb_cache_spinlock in fs/mbcache.c, which is a bottleneck in the disk
workload (ext4 FS) of the AIM7 benchmark, was converted to both a queue
write lock and a regular write lock. When running on an 8-socket 80-core
DL980 system, the performance improvement is shown in the table below.

+-----------------+------------+------------+-----------+---------+
|  Configuration  |  Mean JPM  |  Mean JPM  | Mean JPM  | qrwlock |
|                 |Vanilla 3.10|3.10-qrwlock|3.10-rwlock| %Change |
+-----------------+------------+------------+-----------+---------+
|                 |              User Range 10 - 100              |
+-----------------+------------+------------+-----------+---------+
| 8 nodes, HT off |   441374   |   532774   |  637205   | +20.7%  |
| 8 nodes, HT on  |   449373   |   584387   |  641965   | +30.1%  |
+-----------------+------------+------------+-----------+---------+
|                 |             User Range 200 - 1000             |
+-----------------+------------+------------+-----------+---------+
| 8 nodes, HT off |   226870   |   354490   |  371593   | +56.3%  |
| 8 nodes, HT on  |   205997   |   314891   |  306378   | +52.9%  |
+-----------------+------------+------------+-----------+---------+
|                 |            User Range 1100 - 2000             |
+-----------------+------------+------------+-----------+---------+
| 8 nodes, HT off |   208234   |   321420   |  343694   | +54.4%  |
| 8 nodes, HT on  |   199612   |   297853   |  252569   | +49.2%  |
+-----------------+------------+------------+-----------+---------+

Apparently, the regular read/write lock performs even better than
the queue read/write lock in some cases.  This is probably due to the
fact that mb_cache_spinlock is in a separate cache line from the data
being manipulated.
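
For reference, the mb_cache_spinlock conversion mentioned above is
mechanical; a rough sketch (the names only loosely mirror fs/mbcache.c
and the surrounding code is omitted):

/* A spinlock that only ever provides mutual exclusion can be replaced
 * by a (queue) rwlock taken in write mode at every call site. */
static DEFINE_RWLOCK(mb_cache_rwlock);  /* was: DEFINE_SPINLOCK(mb_cache_spinlock) */

static void mb_cache_lru_add_sketch(struct list_head *entry,
                                    struct list_head *lru_list)
{
        write_lock(&mb_cache_rwlock);   /* was: spin_lock(&mb_cache_spinlock) */
        list_add_tail(entry, lru_list);
        write_unlock(&mb_cache_rwlock); /* was: spin_unlock(&mb_cache_spinlock) */
}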

Looking at the fserver and new_fserver workloads (ext4 FS), where the
journal->j_state_lock (a read/write lock) is a bottleneck especially
when HT is on, we see a slightly different story. The j_state_lock is
an embedded read/write lock which is in the same cacheline as some
of the data being manipulated. The replacement by a queue read/write
lock gives the following improvement.

+--------------------+-------------+--------------+---------------+
|      Workload      |mean % change|mean % change |mean % change  |
|                    |10-100 users |200-1000 users|1100-2000 users|
+--------------------+-------------+--------------+---------------+
|fserver (HT off)    |    +0.3%    |     -0.1%    |     +1.9%     |
|fserver (HT on)     |    -0.1%    |    +32.2%    |    +34.7%     |
|new_fserver (HT on) |    +0.8%    |     +0.9%    |     +0.9%     |
|new_fserver (HT off)|    -1.2%    |    +29.8%    |    +40.5%     |
+--------------------+-------------+--------------+---------------+

Signed-off-by: Waiman Long 
---
 include/asm-generic/qrwlock.h |  124 +
 lib/Kconfig   |   11 ++
 lib/Makefile  |1 +
 lib/qrwlock.c |  246 +
 4 files changed, 382 insertions(+), 0 deletions(-)
 create mode 100644 include/asm-gene

Re: [PATCH] lockref: use cmpxchg64 explicitly for lockless updates

2013-09-19 Thread Waiman Long

On 09/19/2013 02:11 PM, Linus Torvalds wrote:

On Thu, Sep 19, 2013 at 1:06 PM, Will Deacon  wrote:

The cmpxchg() function tends not to support 64-bit arguments on 32-bit
architectures. This could be either due to use of unsigned long arguments
(like on ARM) or lack of instruction support (cmpxchgq on x86). However,
these architectures may implement a specific cmpxchg64() function to
provide 64-bit cmpxchg support instead

I'm certainly ok with this, but I wonder how much point there is to
use the cmpxchg alternatives for 32-bit architectures at all...

 From a performance standpoint, lockref really is expected to mainly
help with big machines. Only insane people would do big machines with
32-bit kernels these days.
Of course, it may be that cmpxchg is actually faster on some
architectures, but at least on x86-32, cmpxchg8b is traditionally
quite slow.

In other words, I'd actually like to see some numbers if there are
loads where this actually helps and matters...

Linus


I agree that 32-bit machines are not likely to be big enough to 
benefit from this feature. However, I do see a minor problem with the 
current code. If a user tries to turn on CMPXCHG_LOCKREF on a 32-bit 
build, he/she will get a compilation error because a 64-bit data type is 
not supported by cmpxchg() in 32-bit mode. Because of this, my original 
patch also used cmpxchg64(), which is equivalent to cmpxchg() on a 
64-bit machine for a 64-bit data type.


I would suggest either changing the code to use cmpxchg64() or adding 
the dependency line "depends on 64BIT" to CMPXCHG_LOCKREF.


-Longman


Re: [PATCH v7 1/4] spinlock: A new lockref structure for lockless update of refcount

2013-09-04 Thread Waiman Long

On 09/03/2013 03:09 PM, Linus Torvalds wrote:

On Tue, Sep 3, 2013 at 8:34 AM, Linus Torvalds
  wrote:

I suspect the tty_ldisc_lock() could be made to go away if we care.

Heh. I just pulled the tty patches from Greg, and the locking has
changed completely.

It may actually fix your AIM7 test-case, because while the global
spinlock remains (it got renamed to "tty_ldiscs_lock" - there's an
added "s"), the common operations now take the per-tty lock to get the
ldisc for that tty, rather than that global spinlock (which just
protects the actual ldisk array now).

That said, I don't know what AIM7 really ends up doing, but your
profile seems to have every access through tty_ldisc_[de]ref() that
now uses only the per-tty lock. Of course, how much that helps ends up
depending on whether AIM7 uses lots of tty's or just one shared one.

Anyway, it might be worth testing my current -git tree.

   Linus


The latest tty patches did work. The tty related spinlock contention is 
now completely gone. The short workload can now reach over 8M JPM which 
is the highest I have ever seen.


The perf profile was:

5.85% reaim  reaim [.] mul_short
4.87% reaim  [kernel.kallsyms] [k] ebitmap_get_bit
4.72% reaim  reaim [.] mul_int
4.71% reaim  reaim [.] mul_long
2.67% reaim  libc-2.12.so  [.] __random_r
2.64% reaim  [kernel.kallsyms] [k] lockref_get_not_zero
1.58% reaim  [kernel.kallsyms] [k] copy_user_generic_string
1.48% reaim  [kernel.kallsyms] [k] mls_level_isvalid
1.35% reaim  [kernel.kallsyms] [k] find_next_bit
1.23% reaim  [kernel.kallsyms] [k] system_call
1.21% reaim  libc-2.12.so  [.] memcpy
1.19% reaim  [kernel.kallsyms] [k] _raw_spin_lock
1.06% reaim  [kernel.kallsyms] [k] avc_has_perm_flags
1.04% reaim  libc-2.12.so  [.] __srandom_r
1.02% reaim  reaim [.] newton_raphson
1.01% reaim  [kernel.kallsyms] [k] update_cfs_rq_blocked_load
0.98% reaim  [kernel.kallsyms] [k] fsnotify
0.94% reaim  [kernel.kallsyms] [k] avtab_search_node
0.91% reaim  libm-2.12.so  [.] __sincos

I have a patch in linux-next that should eliminate ebitmap_get_bit, 
mls_level_isvalid and find_next_bit from the top of the list once it is 
merged.


Regards,
Longman


[PATCH] dcache: Translating dentry into pathname without taking rename_lock

2013-09-04 Thread Waiman Long
When running the AIM7's short workload, Linus' lockref patch eliminated
most of the spinlock contention. However, there were still some left:

 8.46% reaim  [kernel.kallsyms] [k] _raw_spin_lock
 |--42.21%-- d_path
 |  proc_pid_readlink
 |  SyS_readlinkat
 |  SyS_readlink
 |  system_call
 |  __GI___readlink
 |
 |--40.97%-- sys_getcwd
 |  system_call
 |  __getcwd

The big one here is the rename_lock (seqlock) contention in d_path()
and the getcwd system call. This patch will eliminate the need to take
the rename_lock while translating dentries into the full pathnames.

The rename_lock is taken to make sure that no rename operation
can be ongoing while the translation is in progress. However,
only one thread can take the rename_lock, thus blocking all the other
threads that need it even though the translation process won't make
any change to the dentries.

This patch replaces the writer's write_seqlock/write_sequnlock
sequence on the rename_lock in the callers of the prepend_path() and
__dentry_path() functions with the reader's read_seqbegin/read_seqretry
sequence within these two functions. As a result, the code will have to
retry if one or more rename operations have been performed. In addition,
the RCU read lock will be taken during the translation process to make
sure that no dentries will go away.

The dentry's d_lock is also not taken, to further reduce spinlock
contention. However, this means that the name pointer and
other dentry fields may not be valid. As a result, the code was
enhanced to handle the case where the parent pointer or the name
pointer is NULL.

With this patch, the _raw_spin_lock will now account for only 1.2%
of the total CPU cycles for the short workload. This patch also
reduces the effect that running perf has on its own profile,
since the perf command itself can be a heavy user of the d_path()
function depending on the complexity of the workload.

When taking the perf profile of the high-systime workload, the amount
of spinlock contention contributed by running perf without this patch
was about 16%. With this patch, the spinlock contention caused by
running perf goes away and we will have a more accurate
perf profile.

Signed-off-by: Waiman Long 
---
 fs/dcache.c |  118 +--
 1 files changed, 82 insertions(+), 36 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index 96655f4..12d07d7 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2512,7 +2512,19 @@ static int prepend(char **buffer, int *buflen, const 
char *str, int namelen)
 
 static int prepend_name(char **buffer, int *buflen, struct qstr *name)
 {
-   return prepend(buffer, buflen, name->name, name->len);
+   /*
+* With RCU path tracing, it may race with rename. Use
+* ACCESS_ONCE() to make sure that it is either the old or
+* the new name pointer. The length does not really matter as
+* the sequence number check will eventually catch any ongoing
+* rename operation.
+*/
+   const char *dname = ACCESS_ONCE(name->name);
+   int   dlen = name->len;
+
+   if (unlikely(!dname || !dlen))
+   return -EINVAL;
+   return prepend(buffer, buflen, dname, dlen);
 }
 
 /**
@@ -2522,7 +2534,12 @@ static int prepend_name(char **buffer, int *buflen, 
struct qstr *name)
  * @buffer: pointer to the end of the buffer
  * @buflen: pointer to buffer length
  *
- * Caller holds the rename_lock.
+ * The function tries to write out the pathname without taking any lock other
+ * than the RCU read lock to make sure that dentries won't go away. It only
+ * checks the sequence number of the global rename_lock as any change in the
+ * dentry's d_seq will be preceded by changes in the rename_lock sequence
+ * number. If the sequence number had been change, it will restart the whole
+ * pathname back-tracing sequence again
  */
 static int prepend_path(const struct path *path,
const struct path *root,
@@ -2533,7 +2550,15 @@ static int prepend_path(const struct path *path,
struct mount *mnt = real_mount(vfsmnt);
bool slash = false;
int error = 0;
+   unsigned seq;
+   char *bptr;
+   int blen;
 
+restart:
+   bptr = *buffer;
+   blen = *buflen;
+   seq = read_seqbegin(&rename_lock);
+   rcu_read_lock();
while (dentry != root->dentry || vfsmnt != root->mnt) {
struct dentry * parent;
 
@@ -2547,22 +2572,38 @@ static int prepend_path(const struct path *path,
continue;
}
parent = dentry->d_parent;
+   if (un

Re: [PATCH v7 1/4] spinlock: A new lockref structure for lockless update of refcount

2013-09-04 Thread Waiman Long

On 09/04/2013 11:14 AM, Linus Torvalds wrote:

On Wed, Sep 4, 2013 at 7:52 AM, Waiman Long  wrote:

The latest tty patches did work. The tty related spinlock contention is now
completely gone. The short workload can now reach over 8M JPM which is the
highest I have ever seen.

Good. And this was with the 80-core machine, so there aren't any
scalability issues hiding?

  Linus


Yes, the perf profile was taken on an 80-core machine. There isn't 
any scalability issue hiding for the short workload on an 80-core machine.


However, I am certain that more may pop up when running on an even 
larger machine like the prototype 240-core machine that our team has 
been testing on.


-Longman


Re: [PATCH] dcache: Translating dentry into pathname without taking rename_lock

2013-09-04 Thread Waiman Long

On 09/04/2013 03:05 PM, Waiman Long wrote:

When running the AIM7's short workload, Linus' lockref patch eliminated
most of the spinlock contention. However, there were still some left:

  8.46% reaim  [kernel.kallsyms] [k] _raw_spin_lock
  |--42.21%-- d_path
  |  proc_pid_readlink
  |  SyS_readlinkat
  |  SyS_readlink
  |  system_call
  |  __GI___readlink
  |
  |--40.97%-- sys_getcwd
  |  system_call
  |  __getcwd

The big one here is the rename_lock (seqlock) contention in d_path()
and the getcwd system call. This patch will eliminate the need to take
the rename_lock while translating dentries into the full pathnames.

The need to take the rename_lock is to make sure that no rename
operation can be ongoing while the translation is in progress. However,
only one thread can take the rename_lock thus blocking all the other
threads that need it even though the translation process won't make
any change to the dentries.

This patch will replace the writer's write_seqlock/write_sequnlock
sequence of the rename_lock of the callers of the prepend_path() and
__dentry_path() functions with the reader's read_seqbegin/read_seqretry
sequence within these 2 functions. As a result, the code will have to
retry if one or more rename operations had been performed. In addition,
RCU read lock will be taken during the translation process to make
sure that no dentries will go away.

In addition, the dentry's d_lock is also not taken to further reduce
spinlock contention. However, this means that the name pointer and
other dentry fields may not be valid. As a result, the code was
enhanced to handle the case that the parent pointer or the name
pointer can be NULL.

With this patch, the _raw_spin_lock will now account for only 1.2%
of the total CPU cycles for the short workload. This patch also has
the effect of reducing the effect of running perf on its profile
since the perf command itself can be a heavy user of the d_path()
function depending on the complexity of the workload.

When taking the perf profile of the high-systime workload, the amount
of spinlock contention contributed by running perf without this patch
was about 16%. With this patch, the spinlock contention caused by
the running of perf will go away and we will have a more accurate
perf profile.

Signed-off-by: Waiman Long

In terms of AIM7 performance, this patch gives a performance boost of
about 6-7% on top of Linus' lockref patch on an 8-socket 80-core DL980.

User Range          |   10-100  | 200-1000  | 1100-2000 |
Mean JPM w/o patch  | 4,365,114 | 7,211,115 | 6,964,439 |
Mean JPM with patch | 3,872,850 | 7,655,958 | 7,422,598 |
% Change            |  -11.3%   |   +6.2%   |   +6.6%   |

I am not sure if it is too aggressive to not take the d_lock before
prepend_name(). I can add those locking instructions back if necessary.

Regards,
Longman




Re: [PATCH] dcache: Translating dentry into pathname without taking rename_lock

2013-09-04 Thread Waiman Long

On 09/04/2013 03:11 PM, Al Viro wrote:

On Wed, Sep 04, 2013 at 03:05:23PM -0400, Waiman Long wrote:


  static int prepend_name(char **buffer, int *buflen, struct qstr *name)
  {
-   return prepend(buffer, buflen, name->name, name->len);
+   /*
+* With RCU path tracing, it may race with rename. Use
+* ACCESS_ONCE() to make sure that it is either the old or
+* the new name pointer. The length does not really matter as
+* the sequence number check will eventually catch any ongoing
+* rename operation.
+*/
+   const char *dname = ACCESS_ONCE(name->name);
+   int   dlen = name->len;
+
+   if (unlikely(!dname || !dlen))
+   return -EINVAL;
+   return prepend(buffer, buflen, dname, dlen);

NAK.  A race with d_move() can very well leave you with dname pointing into
an object of length smaller than dlen.  You *can* copy it byte-by-byte
and rely on NUL-termination, but you can't rely on length being accurate -
not without having excluded d_move().


I have thought about that. But if a d_move() is going on, the string in 
the buffer will be discarded as the sequence number will change. So 
whether or not it has an embedded null byte shouldn't matter. That is why 
I didn't add code to do a byte-by-byte copy in this first patch. I can add 
code to do that if you think it is safer to do so.


Regards,
Longman


Re: [PATCH] dcache: Translating dentry into pathname without taking rename_lock

2013-09-04 Thread Waiman Long

On 09/04/2013 03:43 PM, Al Viro wrote:

On Wed, Sep 04, 2013 at 03:33:00PM -0400, Waiman Long wrote:


I have thought about that. But if a d_move() is going on, the string
in the buffer will be discarded as the sequence number will change.
So whether or not it have embedded null byte shouldn't matter. That
is why I didn't add code to do byte-by-byte copy at this first
patch. I can add code to do that if you think it is safer to do so.

Sigh...  Junk in the output is not an issue; reading from invalid address
is, since you might not survive to the sequence number check.  Again,
if p is an address returned by kmalloc(size, ...), dereferencing p + offset
is not safe unless offset is less than size.


Yeah, I understand that. As said in my reply to Linus, I will use 
memchr() to see if there is a null byte within the specified length. If 
one is found, I will assume the string is not valid and return an error 
to the caller.
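
A sketch of that check (name_looks_valid() is a hypothetical helper,
shown only to illustrate the idea):

/* Reject a name that raced with a rename: NULL pointer, bogus length,
 * or an embedded NUL inside the reported length. */
static bool name_looks_valid(const char *dname, int dlen)
{
        return dname && dlen > 0 && !memchr(dname, 0, dlen);
}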


Longman


Re: [PATCH] dcache: Translating dentry into pathname without taking rename_lock

2013-09-04 Thread Waiman Long

On 09/04/2013 04:40 PM, John Stoffel wrote:

"Waiman" == Waiman Long  writes:

Waiman>  In term of AIM7 performance, this patch has a performance boost of
Waiman>  about 6-7% on top of Linus' lockref patch on a 8-socket 80-core DL980.

Waiman>  User Range  |   10-100  | 200-1 | 1100-2000 |
Waiman>  Mean JPM w/o patch  | 4,365,114 | 7,211,115 | 6,964,439 |
Waiman>  Mean JPM with patch | 3,872,850 | 7,655,958 | 7,422,598 |
Waiman>  % Change|  -11.3%   |   +6.2%   |   +6.6%   |

This -11% impact is worisome to me, because at smaller numbers of
users, I would still expect the performance to go up.  So why the big
drop?

Also, how is the impact of these changes on smaller 1 socket, 4 core
systems?  Just because it helps a couple of big boxes, doesn't mean it
won't hurt the more common small case.

John


I don't believe the patch will make it slower with fewer users. It is more 
a result of run-to-run variation. The short workload typically completes 
in a very short time. In the 10-100 user range, the completion times 
range from 0.02-0.11s. With a higher user count, it needs several 
seconds to run and hence the results are more reliable.


Longman


Re: [PATCH] dcache: Translating dentry into pathname without taking rename_lock

2013-09-04 Thread Waiman Long

On 09/04/2013 05:31 PM, Linus Torvalds wrote:

On Wed, Sep 4, 2013 at 12:05 PM, Waiman Long  wrote:

+   rcu_read_unlock();
+   if (read_seqretry(&rename_lock, seq))
+   goto restart;

Btw, you have this pattern twice, and while it's not necessarily
incorrect, it's a bit worrisome for performance.


The rcu_read_unlock sequence in the middle of prepend_path() is not 
likely to be executed. So it shouldn't affect performance except for the 
conditional check.



The rcu_read_unlock() is very possibly going to trigger an immediate
scheduling event, so checking the sequence lock after dropping the
read-lock sounds like it would make it much easier to hit the race
with some rename.


I can put read_seqbegin/read_seqretry within the 
rcu_read_lock/rcu_read_unlock block. However, read_seqbegin() can spin 
for a while if a rename is in progress. So it depends on which is more 
important, a shorter RCU critical section at the expense of more retries 
or vice versa.



I'm also a tiny bit worried about livelocking on the sequence lock in
the presence of lots of renames, so I'm wondering if the locking
should try to approximate what we do for the RCU lookup path: start
off optimistically using just the RCU lock and a sequence point, but
if that fails, fall back to taking the real lock. Maybe using a
counter ("try twice, then get the rename lock for writing")

Hmm?


Yes, I can implement a counter that switches to taking the rename_lock if 
the count reaches 0. It shouldn't be too hard. That should avoid the 
possibility of a livelock.
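
A sketch of that fallback scheme (__prepend_path() is a hypothetical
helper standing in for the lockless body of prepend_path(); this is not
the actual v2 patch):

static int prepend_path_sketch(const struct path *path,
                               const struct path *root,
                               char **buffer, int *buflen)
{
        int error, tries = 2;   /* lockless attempts before blocking renames */
        unsigned seq;

        do {
                seq = read_seqbegin(&rename_lock);
                rcu_read_lock();
                error = __prepend_path(path, root, buffer, buflen);
                rcu_read_unlock();
        } while (read_seqretry(&rename_lock, seq) && --tries > 0);

        if (read_seqretry(&rename_lock, seq)) {
                /* Renames keep racing with us: block them and redo once. */
                write_seqlock(&rename_lock);
                rcu_read_lock();
                error = __prepend_path(path, root, buffer, buflen);
                rcu_read_unlock();
                write_sequnlock(&rename_lock);
        }
        return error;
}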


Longman


Re: [PATCH v7 1/4] spinlock: A new lockref structure for lockless update of refcount

2013-09-04 Thread Waiman Long

On 09/04/2013 05:34 PM, Linus Torvalds wrote:

On Wed, Sep 4, 2013 at 12:25 PM, Waiman Long  wrote:

Yes, the perf profile was taking from an 80-core machine. There isn't any
scalability issue hiding for the short workload on an 80-core machine.

However, I am certain that more may pop up when running in an even larger
machine like the prototype 240-core machine that our team has been testing
on.

Sure. Please let us know, I think it's going to be interesting to see
what that shows.

SGI certainly did much larger machines, but their primary target
tended to be all user space, so they had things like "tons of
concurrent page faults in the same process" rather than filename
lookup or the tty layer.

 Linus


I think SGI is more focused on compute-intensive workloads. HP is more 
focused on high-end commercial workloads like SAP HANA. Below is a sample 
perf profile of the high-systime workload on a 240-core prototype 
machine (HT off) with a 3.10-rc1 kernel plus my lockref and seqlock patches:


 9.61%3382925  swapper  [kernel.kallsyms] [k] 
_raw_spin_lock

   |--59.90%-- rcu_process_callbacks
   |--19.41%-- load_balance
   |--9.58%-- rcu_accelerate_cbs
   |--6.70%-- tick_do_update_jiffies64
   |--1.46%-- scheduler_tick
   |--1.17%-- sched_rt_period_timer
   |--0.56%-- perf_adjust_freq_unthr_context
--1.21%-- [...]

 6.34% 99reaim  [kernel.kallsyms] [k] 
_raw_spin_lock

 |--73.96%-- load_balance
 |--11.98%-- rcu_process_callbacks
 |--2.21%-- __mutex_lock_slowpath
 |--2.02%-- rcu_accelerate_cbs
 |--1.95%-- wake_up_new_task
 |--1.70%-- scheduler_tick
 |--1.67%-- xfs_alloc_log_agf
 |--1.24%-- task_rq_lock
 |--1.15%-- try_to_wake_up
  --2.12%-- [...]

 5.39%  2reaim  [kernel.kallsyms] [k] 
_raw_spin_lock_irqsave

 |--95.08%-- rwsem_wake
 |--1.80%-- rcu_process_callbacks
 |--1.03%-- prepare_to_wait
 |--0.59%-- __wake_up
  --1.50%-- [...]

 2.28%  1reaim  [kernel.kallsyms] [k] 
_raw_spin_lock_irq

 |--90.56%-- rwsem_down_write_failed
 |--9.25%-- __schedule
  --0.19%-- [...]

Longman


Re: [PATCH] dcache: Translating dentry into pathname without taking rename_lock

2013-09-05 Thread Waiman Long

On 09/05/2013 12:30 AM, George Spelvin wrote:

As long as you're removing locks from prepend_name and complicating its
innards, I notice that each and every call site follows it by prepending
"/".  How about moving that into prepend_name as well?

Also, if you happen to feel like it, you can delete the slash flag
and replace it with "bptr != *buffer".

Another small tweak would be to the global_root part of the code.
You could move the is_mounted(vfsmnt) test up, and combine the tail of
that code path with the regular exit.  All you have to do is change
the !slash test to:

	if (error >= 0 && bptr == *buffer) {	/* Root directory */
		if (--blen < 0)
			error = -ENAMETOOLONG;
		else
			*--bptr = '/';
	}

This modified form is no more code than an inlined copy of prepend(),
so we haven't actually slowed the fast path, but it avoids corrupting
the return value of 0/1/2 if possible.


Thanks for the suggestions. I will implement them in my v2 patch.

-Longman


Re: [PATCH v7 1/4] spinlock: A new lockref structure for lockless update of refcount

2013-09-05 Thread Waiman Long

On 09/05/2013 09:31 AM, Ingo Molnar wrote:

* Waiman Long  wrote:



The latest tty patches did work. The tty related spinlock contention
is now completely gone. The short workload can now reach over 8M JPM
which is the highest I have ever seen.

The perf profile was:

5.85% reaim  reaim [.] mul_short
4.87% reaim  [kernel.kallsyms] [k] ebitmap_get_bit
4.72% reaim  reaim [.] mul_int
4.71% reaim  reaim [.] mul_long
2.67% reaim  libc-2.12.so  [.] __random_r
2.64% reaim  [kernel.kallsyms] [k] lockref_get_not_zero
1.58% reaim  [kernel.kallsyms] [k] copy_user_generic_string
1.48% reaim  [kernel.kallsyms] [k] mls_level_isvalid
1.35% reaim  [kernel.kallsyms] [k] find_next_bit

6%+ spent in ebitmap_get_bit() and mls_level_isvalid() looks like
something worth optimizing.

Is that called very often, or is it perhaps cache-bouncing for some
reason?


The high cycle count is due more to an inefficient algorithm in the 
mls_level_isvalid() function than to cacheline contention in the code. The 
attached patch should address this problem. It is in linux-next and 
hopefully will be merged in 3.12.


-Longman
From 232a29fd04d5345d6af3330f48710cd48a345c10 Mon Sep 17 00:00:00 2001
From: Waiman Long 
Date: Mon, 10 Jun 2013 13:52:36 -0400
Subject: [PATCH 1/2 v5] SELinux: Reduce overhead of mls_level_isvalid() 
function call
To: Stephen Smalley ,
James Morris ,
Eric Paris 
Cc: linux-security-mod...@vger.kernel.org,
linux-kernel@vger.kernel.org,
"Chandramouleeswaran, Aswin" ,
"Norton, Scott J" 

v4->v5:
  - Fix scripts/checkpatch.pl warning.

v3->v4:
  - Merge the 2 separate while loops in ebitmap_contains() into
a single one.

v2->v3:
  - Remove unused local variables i, node from mls_level_isvalid().

v1->v2:
 - Move the new ebitmap comparison logic from mls_level_isvalid()
   into the ebitmap_contains() helper function.
 - Rerun perf and performance tests on the latest v3.10-rc4 kernel.

While running the high_systime workload of the AIM7 benchmark on
a 2-socket 12-core Westmere x86-64 machine running a 3.10-rc4 kernel
(with HT on), it was found that a pretty sizable amount of time was
spent in the SELinux code. Below is the perf trace of a "perf
record -a -s" session for a test run at 1500 users:

  5.04%ls  [kernel.kallsyms] [k] ebitmap_get_bit
  1.96%ls  [kernel.kallsyms] [k] mls_level_isvalid
  1.95%ls  [kernel.kallsyms] [k] find_next_bit

The ebitmap_get_bit() was the hottest function in the perf-report
output.  Both the ebitmap_get_bit() and find_next_bit() functions
were, in fact, called by mls_level_isvalid(). As a result, the
mls_level_isvalid() call consumed 8.95% of the total CPU time of
all the 24 virtual CPUs which is quite a lot. The majority of the
mls_level_isvalid() function invocations come from the socket creation
system call.

Looking at the mls_level_isvalid() function, it checks whether all
the bits set in one ebitmap structure are also set in another one, and
whether the highest set bit is no bigger than the one specified by the
given policydb data structure. It does this in a bit-by-bit manner, so
if the ebitmap structure has many bits set, the iteration loop will be
executed many times.

The current code can be rewritten to use an algorithm similar to that
of the ebitmap_contains() function, with an additional check for the
highest set bit. The ebitmap_contains() function was therefore extended
to cover an optional check for the highest set bit, and the
mls_level_isvalid() function was modified to call ebitmap_contains().
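
The idea, shown on plain bitmaps rather than the node-list based
ebitmap structure (a simplified sketch, not the actual
ebitmap_contains() code):

/*
 * Check that every bit set in 'small' is also set in 'big', one word
 * at a time, and that no bit above 'max_allowed_bit' is set in
 * 'small'.  The real ebitmap_contains() does the same walk over the
 * two ebitmap node lists.
 */
static bool bitmap_contains_sketch(const unsigned long *big,
                                   const unsigned long *small,
                                   unsigned int nbits,
                                   unsigned int max_allowed_bit)
{
        unsigned int i, last;

        for (i = 0; i < BITS_TO_LONGS(nbits); i++)
                if (small[i] & ~big[i])
                        return false;   /* a bit of 'small' is missing in 'big' */

        last = find_last_bit(small, nbits);
        return last >= nbits || last <= max_allowed_bit;  /* empty or within limit */
}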

With that change, the perf trace showed that the CPU time used dropped
to just 0.08% (ebitmap_contains + mls_level_isvalid) of the total,
which is about 100X less than before.

  0.07%ls  [kernel.kallsyms] [k] ebitmap_contains
  0.05%ls  [kernel.kallsyms] [k] ebitmap_get_bit
  0.01%ls  [kernel.kallsyms] [k] mls_level_isvalid
  0.01%ls  [kernel.kallsyms] [k] find_next_bit

The remaining ebitmap_get_bit() and find_next_bit() functions calls
are made by other kernel routines as the new mls_level_isvalid()
function will not call them anymore.

This patch also improves the high_systime AIM7 benchmark result,
though the improvement is not as impressive as is suggested by the
reduction in CPU time spent in the ebitmap functions. The table below
shows the performance change on the 2-socket x86-64 system (with HT
on) mentioned above.

+--+---++-+
|   Workload   | mean % change | mean % change  | mean % change   |
|  | 10-100 users  | 200-1000 users | 1100-2000 users |
+--+---++-+
| high_systime | +0.1% | +0.9%  | +2.6%   |
+--+---++-+

Re: [PATCH] lockref: remove cpu_relax() again

2013-09-05 Thread Waiman Long

On 09/05/2013 11:31 AM, Linus Torvalds wrote:

On Thu, Sep 5, 2013 at 6:18 AM, Heiko Carstens
  wrote:

*If* however the cpu_relax() makes sense on other platforms maybe we could
add something like we have already with "arch_mutex_cpu_relax()":

I actually think it won't.

The lockref cmpxchg isn't waiting for something to change - it only
loops _if_ something has changed, and rather than cpu_relax(), we most
likely want to try to take advantage of the fact that we have the
changed data in our exclusive cacheline, and try to get our ref update
out as soon as possible.

IOW, the lockref loop is not an idle loop like a spinlock "wait for
lock to be released", it's very much an active loop of "oops,
something changed".

And there can't be any livelock, since by definition somebody else
_did_ make progress. In fact, adding the cpu_relax() probably just
makes things much less fair - once somebody else raced on you, the
cpu_relax() now makes it more likely that _another_ cpu does so too.

That said, let's see what Tony's numbers are. On x86, it doesn't seem to
matter, but as Tony noticed, the variability can be quite high (for
me, the numbers tend to be quite stable when running the test program
multiple times in a loop, but then variation between boots or after
having done something else can be quite big - I suspect the cache
access patterns end up varying wildly with different dentry layout and
hash chain depth).

   Linus

I have found that having a cpu_relax() at the bottom of the while
loop in the CMPXCHG_LOOP() macro does help performance in some cases on
x86-64 processors. I saw no noticeable difference for the AIM7
short workload. On the high_systime workload, however, I saw about 5%
better performance with cpu_relax(). Below are the perf profiles of
the 2 high_systime runs at 1500 users on an 80-core DL980 with HT off.

Without cpu_relax():

 4.60% ls  [kernel.kallsyms] [k] _raw_spin_lock
  |--48.50%-- lockref_put_or_lock
  |--46.97%-- lockref_get_or_lock
  |--1.04%-- lockref_get

 2.95% reaim  [kernel.kallsyms] [k] _raw_spin_lock
  |--29.70%-- lockref_put_or_lock
  |--28.87%-- lockref_get_or_lock
  |--0.63%-- lockref_get


With cpu_relax():

 1.67% reaim  [kernel.kallsyms] [k] _raw_spin_lock
  |--14.80%-- lockref_put_or_lock
  |--14.04%-- lockref_get_or_lock

 1.36% ls  [kernel.kallsyms] [k] _raw_spin_lock
  |--44.94%-- lockref_put_or_lock
  |--43.12%-- lockref_get_or_lock

So I would suggest having some kind of conditional cpu_relax() in
the loop.
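
For reference, the kind of change I mean is just a pause hint at the bottom
of a cmpxchg retry loop. The sketch below is a user-space approximation;
cpu_pause() and counter_inc() are illustrative names, not the lockref
CMPXCHG_LOOP code itself:

	#include <stdatomic.h>

	static inline void cpu_pause(void)
	{
	#if defined(__x86_64__) || defined(__i386__)
		__builtin_ia32_pause();		/* rough analogue of cpu_relax() */
	#endif
	}

	static void counter_inc(_Atomic unsigned long *counter)
	{
		unsigned long old = atomic_load_explicit(counter, memory_order_relaxed);

		for (;;) {
			/* try to publish old + 1; on failure 'old' is refreshed */
			if (atomic_compare_exchange_weak(counter, &old, old + 1))
				break;
			cpu_pause();	/* back off slightly before the next attempt */
		}
	}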

-Longman





[PATCH v2 1/1] dcache: Translating dentry into pathname without taking rename_lock

2013-09-05 Thread Waiman Long
When running the AIM7's short workload, Linus' lockref patch eliminated
most of the spinlock contention. However, there were still some left:

 8.46% reaim  [kernel.kallsyms] [k] _raw_spin_lock
 |--42.21%-- d_path
 |  proc_pid_readlink
 |  SyS_readlinkat
 |  SyS_readlink
 |  system_call
 |  __GI___readlink
 |
 |--40.97%-- sys_getcwd
 |  system_call
 |  __getcwd

The big one here is the rename_lock (seqlock) contention in d_path()
and the getcwd system call. This patch will eliminate the need to take
the rename_lock while translating dentries into the full pathnames.

The need to take the rename_lock is to make sure that no rename
operation can be ongoing while the translation is in progress. However,
only one thread can take the rename_lock thus blocking all the other
threads that need it even though the translation process won't make
any change to the dentries.

This patch will replace the writer's write_seqlock/write_sequnlock
sequence of the rename_lock of the callers of the prepend_path() and
__dentry_path() functions with the reader's read_seqbegin/read_seqretry
sequence within these 2 functions. As a result, the code will have to
retry if one or more rename operations had been performed. In addition,
RCU read lock will be taken during the translation process to make sure
that no dentries will go away. To prevent live-lock from happening,
the code will switch back to taking the rename_lock if read_seqretry()
fails three times.
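
The overall control flow is roughly as follows (an illustrative sketch only,
not the exact patch; translate_path() is a placeholder name, while
rename_lock and the seqlock/RCU primitives are the existing kernel ones):

	#include <linux/seqlock.h>
	#include <linux/rcupdate.h>

	extern seqlock_t rename_lock;

	static void translate_path(void)	/* buffer arguments elided */
	{
		int retry_cnt = 3;
		unsigned int seq = 0;

	restart:
		if (retry_cnt) {		/* lockless attempt */
			seq = read_seqbegin(&rename_lock);
			rcu_read_lock();
		} else {			/* too many races: lock out renames */
			write_seqlock(&rename_lock);
		}

		/* ... walk the d_parent chain and prepend each name ... */

		if (retry_cnt) {
			retry_cnt--;
			rcu_read_unlock();
			if (read_seqretry(&rename_lock, seq))
				goto restart;
		} else {
			write_sequnlock(&rename_lock);
		}
	}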

To further reduce spinlock contention, this patch also tries not
to take the dentry's d_lock if the dentry name is internal. For an
external dentry name, it is safer to take the d_lock, as racing with
d_move() may cause access to an invalid memory location leading to a
segmentation fault.  For safety, there is also an additional check to
see if the name string is valid (no embedded null).

The following code re-factoring changes are also made:
1. Move prepend('/') into prepend_name().
2. Move the global root check in prepend_path() back to the top of
   the while loop.

With this patch, the _raw_spin_lock will now account for only 1.2%
of the total CPU cycles for the short workload. This patch also
reduces the impact of running perf on its own profile, since the
perf command itself can be a heavy user of the d_path() function
depending on the complexity of the workload.

When taking the perf profile of the high-systime workload, the amount
of spinlock contention contributed by running perf without this patch
was about 16%. With this patch, the spinlock contention caused by
the running of perf will go away and we will have a more accurate
perf profile.

Signed-off-by: Waiman Long 
---
 fs/dcache.c |  213 ++-
 1 files changed, 151 insertions(+), 62 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index 96655f4..9703aa6 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2510,9 +2510,57 @@ static int prepend(char **buffer, int *buflen, const 
char *str, int namelen)
return 0;
 }
 
-static int prepend_name(char **buffer, int *buflen, struct qstr *name)
+static int prepend_name(char **buffer, int *buflen, struct dentry *dentry)
 {
-   return prepend(buffer, buflen, name->name, name->len);
+   /*
+* With RCU path tracing, it may race with rename. Use
+* ACCESS_ONCE() to make sure that it is either the old or
+* the new name pointer. The length does not really matter as
+* the sequence number check will eventually catch any ongoing
+* rename operation.
+*/
+   const char *dname = ACCESS_ONCE(dentry->d_name.name);
+   u32 dlen = dentry->d_name.len;
+   int error;
+
+   if (likely(dname == (const char *)dentry->d_iname)) {
+   /*
+* Internal dname, the string memory is valid as long
+* as its length is not over the limit.
+*/
+   if (unlikely(dlen > sizeof(dentry->d_iname)))
+   return -EINVAL;
+   } else if (!dname)
+   return -EINVAL;
+   else {
+   const char *ptr;
+   u32 len;
+
+   /*
+* External dname, need to fetch name pointer and length
+* again under d_lock to get a consistent set and avoid
+* racing with d_move() which will take d_lock before
+* acting on the dentries.
+*/
+   spin_lock(&dentry->d_lock);
+   dname = dentry->d_name.name;
+   dlen  = dentry->d_name.len;
+   spin_unlock(&dentry->d_lock);
+
+   if (unlikely(!dname || !dlen))
+ 

[PATCH v2 0/1] dcache: Translating dentry into pathname without taking rename_lock

2013-09-05 Thread Waiman Long
Change History

v1->v2:
 - Check for internal vs external dname, taking d_lock only for
   external dname for safety.
 - Replace memchr() by a byte-by-byte checking for loop.
 - Try lockless dentry to pathname conversion 3 times before falling
   back to taking the rename_lock to prevent live-lock.
 - Make code re-factoring suggested by George Spelvin.

Waiman Long (1):
  dcache: Translating dentry into pathname without taking rename_lock

 fs/dcache.c |  213 ++-
 1 files changed, 151 insertions(+), 62 deletions(-)



Re: [PATCH v2 1/1] dcache: Translating dentry into pathname without taking rename_lock

2013-09-05 Thread Waiman Long

On 09/05/2013 03:35 PM, Linus Torvalds wrote:

No. Stop all these stupid games.

No d_lock, no trying to make d_name/d_len consistent.

No "compare d_name against d_iname".

No EINVAL.

No idiotic racy "let's fetch each byte one-by one and test them
against NUL", which is just racy and stupid.

Just do what I told you to do: *copy* the name one byte at a time, and
stop copying if you hit NUL. IOW, make prepend() just use "strncpy()"
instead of "memcpy()". You don't even need to check the end result -
if it's bogus, the sequence count will fix it.

No special cases. No games. No crap. Just make "prepend()" able to
handle the special case of "oops, the name length didn't match, but we
must not ever go past the end of the buffer".

We can optimize strncpy() to use word accesses if it ends up ever
being a performance issue. I suspect it never even shows up, but it's
not hard to do if if does.

 Linus


It is not as simple as doing a strncpy(). The pathname was built from 
the leaf up to the root, and from the end of buffer toward the 
beginning. As it goes through the while loop, the buffer will look like:


"    /c"
"  /b/c"
"/a/b/c"

If the content of the string is unreliable, I have to do at least 2 passes:
1) Locate the end of the string and determine the actual length
2) Copy the whole string, or copy it byte-by-byte backward

That is why I am looking for the null byte. Using strncpy() alone won't
work. However, if the actual length doesn't match the dlen, the
name string will be invalid and there is no point in proceeding any further.


I also considered the possible, but unlikely, scenario that during a
rename operation, a short old name could be freed and replaced by a long
new name. The old name could then be allocated to another user who fills
it up with non-NULL bytes. If the buffer happens to sit at the boundary
between a valid and an invalid memory page, the code may step over to an
invalid address while looking for the null byte. The current code probably
won't free the buffer while the RCU lock is being held, but a future code
change may invalidate this assumption. Blindly taking the d_lock, as the
original code does, may be too expensive. That is why I came up with
the internal vs. external dname check and take the lock only for an
external dname, for safety.


I can change the code to just locate the end of the string and copy
it backward, without using strncpy().


-Longman


Re: [PATCH v2 1/1] dcache: Translating dentry into pathname without taking rename_lock

2013-09-05 Thread Waiman Long

On 09/05/2013 04:04 PM, Al Viro wrote:

On Thu, Sep 05, 2013 at 02:55:16PM -0400, Waiman Long wrote:

+   const char *dname = ACCESS_ONCE(dentry->d_name.name);
+   u32 dlen = dentry->d_name.len;
+   int error;
+
+   if (likely(dname == (const char *)dentry->d_iname)) {
+   /*
+* Internal dname, the string memory is valid as long
+* as its length is not over the limit.
+*/
+   if (unlikely(dlen > sizeof(dentry->d_iname)))
+   return -EINVAL;
+   } else if (!dname)
+   return -EINVAL;

Can't happen.

+   else {
+   const char *ptr;
+   u32 len;
+
+   /*
+* External dname, need to fetch name pointer and length
+* again under d_lock to get a consistent set and avoid
+* racing with d_move() which will take d_lock before
+* acting on the dentries.
+*/
+   spin_lock(&dentry->d_lock);
+   dname = dentry->d_name.name;
+   dlen  = dentry->d_name.len;
+   spin_unlock(&dentry->d_lock);
+
+   if (unlikely(!dname || !dlen))
+   return -EINVAL;

Can't happen.


+   /*
+* As the length and the content of the string may not be
+* valid, need to scan the string and return EINVAL if there
+* is embedded null byte within the length of the string.
+*/
+   for (ptr = dname, len = dlen; len; len--, ptr++) {
+   if (*ptr == '\0')
+   return -EINVAL;

Egads...  First of all, this is completely pointless - if you've grabbed
->d_name.name and ->d_name.len under ->d_lock, you don't *need* that crap.
At all.  The whole point of that exercise is to avoid taking ->d_lock;
_that_ is where the "read byte by byte until you hit NUL" comes from.
And if you do that, you can bloody well just go ahead and store them in
the target array *as* *you* *go*.  No reason to bother with memcpy()
afterwards.


That is what I thought too. I am just not totally sure about it. So yes,
I can scrap all these additional checks.


As the internal dname buffer is at least 32 bytes, most dentries will 
use the internal buffer instead of allocating from kmem. IOW, the d_lock 
taking code path is unlikely to be used.



Damnit, just grab len and name (no ->d_lock, etc.).  Check if you've got
enough space in the buffer, treat "not enough" as an overflow.  Then
proceed to copy the damn thing over there (starting at *buffer -= len)
byte by byte, stopping when you've copied len bytes *or* when the byte you've
got happens to be NUL.  Don't bother with EINVAL, etc. - just return to
caller and let rename_lock logics take care of the races.  That's it - nothing
more is needed.


OK, I will do that.
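
Something along these lines, I suppose (a sketch of the copy loop Al
describes; the buffer handling follows the existing prepend() convention,
and the dname/dlen arguments stand in for the values fetched from the qstr):

	static int prepend_name_sketch(char **buffer, int *buflen,
				       const char *dname, u32 dlen)
	{
		char *p;

		if (*buflen < dlen + 1)
			return -ENAMETOOLONG;
		*buflen -= dlen + 1;
		p = *buffer -= dlen + 1;
		*p++ = '/';
		while (dlen--) {
			char c = *dname++;

			if (!c)			/* stale length: stop at the NUL */
				break;
			*p++ = c;
		}
		return 0;
	}

Any garbage written because the length and pointer were mismatched gets
thrown away when the caller's rename_lock sequence check fails.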

-Longman


Re: [PATCH v2 1/1] dcache: Translating dentry into pathname without taking rename_lock

2013-09-05 Thread Waiman Long

On 09/05/2013 04:42 PM, Linus Torvalds wrote:

On Thu, Sep 5, 2013 at 1:29 PM, Waiman Long  wrote:

It is not as simple as doing a strncpy().

Yes it damn well is.

Stop the f*cking stupid arguments, and instead listen to what I say.

Here. Let me bold-face the most important part for you, so that you
don't miss it in all the other crap:

MAKE prepend() JUST USE "strncpy()" INSTEAD OF "memcpy()".

Nothing else. Seriously. Your "you can't do it because we copy
backwards" arguments are pure and utter garbage, exactly BECAUSE YOU
DON'T CHANGE ANY OF THAT. You can actually use the unreliable length
variable BUT YOU MUST STILL STOP AT A ZERO.

Get it?

You're complicating the whole thing for no good reason. I'm telling
you (and HAVE BEEN telling you multiple times) that you cannot use
"memcpy()" because the length may not be reliable, so you need to
check for zero in the middle and stop early. All your arguments have
been totally pointless, because you don't seem to see that simple and
fundamental issue. You don't change ANYTHING else. But you damn well
not do a "memcpy", you do something that stops when it hits a NUL
character.

We call that function "strncpy()". I'd actually prefer to write it out
by hand (because somebody could implement "strncpy()" as a
questionable function that accesses past the NUL as long as it's
within the 'n'), and because I think we might want to do that
word-at-a-time version of it, but for a first approximation, just do
that one-liner version.

Don't do anything else. Don't do locking. Don't do memchr. Just make
sure that you stop at a NUL character, and don't trust the length,
because the length may not match the pointer. That was always ALL
you needed to do.

   Linus
I am sorry that I misunderstood what you said. I will do what you and Al
advise me to do.


-Longman


[PATCH v3 1/1] dcache: Translating dentry into pathname without taking rename_lock

2013-09-06 Thread Waiman Long
When running the AIM7's short workload, Linus' lockref patch eliminated
most of the spinlock contention. However, there were still some left:

 8.46% reaim  [kernel.kallsyms] [k] _raw_spin_lock
 |--42.21%-- d_path
 |  proc_pid_readlink
 |  SyS_readlinkat
 |  SyS_readlink
 |  system_call
 |  __GI___readlink
 |
 |--40.97%-- sys_getcwd
 |  system_call
 |  __getcwd

The big one here is the rename_lock (seqlock) contention in d_path()
and the getcwd system call. This patch will eliminate the need to take
the rename_lock while translating dentries into the full pathnames.

The need to take the rename_lock is to make sure that no rename
operation can be ongoing while the translation is in progress. However,
only one thread can take the rename_lock thus blocking all the other
threads that need it even though the translation process won't make
any change to the dentries.

This patch will replace the writer's write_seqlock/write_sequnlock
sequence of the rename_lock of the callers of the prepend_path() and
__dentry_path() functions with the reader's read_seqbegin/read_seqretry
sequence within these 2 functions. As a result, the code will have to
retry if one or more rename operations had been performed. In addition,
RCU read lock will be taken during the translation process to make sure
that no dentries will go away. To prevent live-lock from happening,
the code will switch back to taking the rename_lock if read_seqretry()
fails three times.

To further reduce spinlock contention, this patch does not take the
dentry's d_lock when copying the filename from the dentries. Instead,
it treats the name pointer and length as unreliable and just copies
the string byte-by-byte until it hits a null byte or the end of the
string as specified by the length. This should avoid stepping into
an invalid memory address. The error cases are left to be handled by
the sequence number check.

The following code re-factoring changes are also made:
1. Move prepend('/') into prepend_name() to remove one conditional
   check.
2. Move the global root check in prepend_path() back to the top of
   the while loop.

With this patch, the _raw_spin_lock will now account for only 1.2%
of the total CPU cycles for the short workload. This patch also
reduces the impact of running perf on its own profile, since the
perf command itself can be a heavy user of the d_path() function
depending on the complexity of the workload.

When taking the perf profile of the high-systime workload, the amount
of spinlock contention contributed by running perf without this patch
was about 16%. With this patch, the spinlock contention caused by
the running of perf will go away and we will have a more accurate
perf profile.

Signed-off-by: Waiman Long 
---
 fs/dcache.c |  178 ++
 1 files changed, 116 insertions(+), 62 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index 96655f4..da095ce 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -89,6 +89,12 @@ EXPORT_SYMBOL(rename_lock);
 static struct kmem_cache *dentry_cache __read_mostly;
 
 /*
+ * Try 3 times of lockless dentry to pathname conversion before falling
+ * back to take the rename_lock.
+ */
+#define D_LOCKLESS_PREPEND_RETRY	3
+
+/*
  * This is the single most critical data structure when it comes
  * to the dcache: the hashtable for lookups. Somebody should try
  * to make this good - I've just made it work.
@@ -2512,7 +2518,33 @@ static int prepend(char **buffer, int *buflen, const 
char *str, int namelen)
 
 static int prepend_name(char **buffer, int *buflen, struct qstr *name)
 {
-   return prepend(buffer, buflen, name->name, name->len);
+   /*
+* With RCU path tracing, it may race with d_move(). Use
+* ACCESS_ONCE() to make sure that either the old or the new name
+* pointer and length are fetched. However, there may be mismatch
+* between length and pointer. The length cannot be trusted, we need
+* to copy it byte-by-byte until the length is reached or a null
+* byte is found. It also prepends "/" at the beginning of the name.
+* The sequence number check at the caller will retry it again when
+* a d_move() does happen. So any garbage in the buffer due to
+* mismatched pointer and length will be discarded.
+*/
+   const char *dname = ACCESS_ONCE(name->name);
+   u32 dlen = ACCESS_ONCE(name->len);
+   char *p;
+
+   if (*buflen < dlen + 1)
+   return -ENAMETOOLONG;
+   *buflen -= dlen + 1;
+   p = *buffer -= dlen + 1;
+   *p++ = '/';
+   while (dlen--) {
+   char c = *dname++;

[PATCH v3 0/1] dcache: Translating dentry into pathname without taking rename_lock

2013-09-06 Thread Waiman Long
Change History

v2->v3:
 - Simplify prepend_name() to blindly copy the dname over until it
   reaches a null byte or the specified length leaving the sequence
   check to handle error case.

v1->v2:
 - Check for internal vs external dname, taking d_lock only for
   external dname for safety.
 - Replace memchr() by a byte-by-byte checking for loop.
 - Try lockless dentry to pathname conversion 3 times before falling
   back to taking the rename_lock to prevent live-lock.
 - Make code re-factoring suggested by George Spelvin.

Waiman Long (1):
  dcache: Translating dentry into pathname without taking rename_lock

 fs/dcache.c |  178 ++
 1 files changed, 116 insertions(+), 62 deletions(-)



Re: [PATCH v3 1/1] dcache: Translating dentry into pathname without taking rename_lock

2013-09-06 Thread Waiman Long

On 09/06/2013 04:52 PM, Linus Torvalds wrote:

On Fri, Sep 6, 2013 at 9:08 AM, Waiman Long  wrote:

This patch will replace the writer's write_seqlock/write_sequnlock
sequence of the rename_lock of the callers of the prepend_path() and
__dentry_path() functions with the reader's read_seqbegin/read_seqretry
sequence within these 2 functions.

Ok, this actually looks really good.

I do have one comment, from just reading the patch:

I would really like the stuff inside the

restart:
   bptr = *buffer;
   blen = *buflen;
   if (retry_cnt) {
 seq = read_seqbegin(&rename_lock);
 rcu_read_lock();
   } else
 write_seqlock(&rename_lock);

   ... guts of path generation ...

   if (retry_cnt) {
 retry_cnt--;
 rcu_read_unlock();
 if (read_seqretry(&rename_lock, seq))
   goto restart;
   } else
 write_sequnlock(&rename_lock);

could possible be done as a separate function?

Alternatively (or perhaps additionally), maybe the locking could be
done as an inline function too (taking the retry count as an argument)
to make things a bit more easy to understand.


I would prefer putting the begin and end blocks into 2 inlined
helper functions to make the code easier to read. I will work on this
over the weekend.


-Longman



Re: [PATCH v3 1/1] dcache: Translating dentry into pathname without taking rename_lock

2013-09-09 Thread Waiman Long

On 09/07/2013 02:07 PM, Al Viro wrote:

On Sat, Sep 07, 2013 at 10:52:02AM -0700, Linus Torvalds wrote:


So I think we could make a more complicated data structure that looks
something like this:

struct seqlock_retry {
   unsigned int seq_no;
   int state;
};

and pass that around. Gcc should do pretty well, especially if we
inline things (but even if not, small structures that fit in 64 bytes
generate reasonable code even on 32-bit targets, because gcc knows
about using two registers for passing data around)..

Then you can make "state" have a retry counter in it, and have a
negative value mean "I hold the lock for writing". Add a couple of
helper functions, and you can fairly easily handle the mixed "try for
reading first, then fall back to writing".

That said, __d_lookup() still shows up as very performance-critical on
some loads (symlinks in particular cause us to fall out of the RCU
cases) so I'd like to keep that using the simple pure read case. I
don't believe you can livelock it, as mentioned. But the other ones
might well be worth moving to a "fall back to write-locking after
tries" model. They might all traverse user-specified paths of fairly
arbitrary depth, no?

So this "seqlock_retry" thing wouldn't _replace_ bare seqlocks, it
would just be a helper thing for this kind of behavior where we want
to normally do things with just the read-lock, but want to guarantee
that we don't live-lock.

Sounds reasonable?

More or less; I just wonder if we are overdesigning here - if we don't
do "repeat more than once", we can simply use the lower bit of seq -
read_seqlock() always returns an even value.  So we could do something
like seqretry_and_lock(lock, &seq):
	if ((*seq & 1) || !read_seqretry(lock, *seq))
		return true;
	*seq |= 1;
	write_seqlock(lock);
	return false;
and seqretry_done(lock, seq):
	if (seq & 1)
		write_sequnlock(lock);
with these loops turning into
	seq = read_seqlock(&rename_lock);
	...
	if (!seqretry_and_lock(&rename_lock, &seq))
		goto again;
	...
	seqretry_done(&rename_lock);


I am fine with trying it once and then locking instead of trying it a few
times. Now, are you planning to have these helper functions for the
dcache layer only or as part of the seqlock infrastructure? If we are
going to touch the seqlock layer, I would suggest adding a blocking
reader that takes the lock but won't update the sequence number, so that
it won't block other sequence readers as my original seqlock patch was
doing.


-Longman


[PATCH v4 1/1] dcache: Translating dentry into pathname without taking rename_lock

2013-09-09 Thread Waiman Long
When running the AIM7's short workload, Linus' lockref patch eliminated
most of the spinlock contention. However, there were still some left:

 8.46% reaim  [kernel.kallsyms] [k] _raw_spin_lock
 |--42.21%-- d_path
 |  proc_pid_readlink
 |  SyS_readlinkat
 |  SyS_readlink
 |  system_call
 |  __GI___readlink
 |
 |--40.97%-- sys_getcwd
 |  system_call
 |  __getcwd

The big one here is the rename_lock (seqlock) contention in d_path()
and the getcwd system call. This patch will eliminate the need to take
the rename_lock while translating dentries into the full pathnames.

The need to take the rename_lock is to make sure that no rename
operation can be ongoing while the translation is in progress. However,
only one thread can take the rename_lock thus blocking all the other
threads that need it even though the translation process won't make
any change to the dentries.

This patch will replace the writer's write_seqlock/write_sequnlock
sequence of the rename_lock of the callers of the prepend_path() and
__dentry_path() functions with the reader's read_seqbegin/read_seqretry
sequence within these 2 functions. As a result, the code will have to
retry if one or more rename operations had been performed. In addition,
RCU read lock will be taken during the translation process to make sure
that no dentries will go away. To prevent live-lock from happening,
the code will switch back to taking the rename_lock if read_seqretry()
fails three times.

To further reduce spinlock contention, this patch does not take the
dentry's d_lock when copying the filename from the dentries. Instead,
it treats the name pointer and length as unreliable and just copies
the string byte-by-byte until it hits a null byte or the end of the
string as specified by the length. This should avoid stepping into
an invalid memory address. The error cases are left to be handled by
the sequence number check.

The following code re-factoring changes are also made:
1. Move prepend('/') into prepend_name() to remove one conditional
   check.
2. Move the global root check in prepend_path() back to the top of
   the while loop.

With this patch, the _raw_spin_lock will now account for only 1.2%
of the total CPU cycles for the short workload. This patch also
reduces the impact of running perf on its own profile, since the
perf command itself can be a heavy user of the d_path() function
depending on the complexity of the workload.

When taking the perf profile of the high-systime workload, the amount
of spinlock contention contributed by running perf without this patch
was about 16%. With this patch, the spinlock contention caused by
the running of perf will go away and we will have a more accurate
perf profile.

Signed-off-by: Waiman Long 
---
 fs/dcache.c |  196 ---
 1 files changed, 133 insertions(+), 63 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index ca8e9cd..8186ff9 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -88,6 +88,44 @@ EXPORT_SYMBOL(rename_lock);
 
 static struct kmem_cache *dentry_cache __read_mostly;
 
+/**
+ * read_seqbegin_or_lock - begin a sequence number check or locking block
+ * lock: sequence lock
+ * seq : sequence number to be checked
+ *
+ * First try it once optimistically without taking the lock. If that fails,
+ * take the lock. The sequence number is also used as a marker for deciding
+ * whether to be a reader (even) or writer (odd).
+ * N.B. seq must be initialized to an even number to begin with.
+ */
+static inline void read_seqbegin_or_lock(seqlock_t *lock, int *seq)
+{
+   if (!(*seq & 1)) {  /* Even */
+   *seq = read_seqbegin(lock);
+   rcu_read_lock();
+   } else  /* Odd */
+   write_seqlock(lock);
+}
+
+/**
+ * read_seqretry_or_unlock - end a seqretry or lock block & return retry status
+ * lock : sequence lock
+ * seq  : sequence number
+ * Return: 1 to retry operation again, 0 to continue
+ */
+static inline int read_seqretry_or_unlock(seqlock_t *lock, int *seq)
+{
+   if (!(*seq & 1)) {  /* Even */
+   rcu_read_unlock();
+   if (read_seqretry(lock, *seq)) {
+   (*seq)++;   /* Take writer lock */
+   return 1;
+   }
+   } else  /* Odd */
+   write_sequnlock(lock);
+   return 0;
+}
+
 /*
  * This is the single most critical data structure when it comes
  * to the dcache: the hashtable for lookups. Somebody should try
@@ -2647,9 +2685,39 @@ static int prepend(char **buffer, int *buflen, const 
char *str, int namelen)
return 0;
 }
 
+/**
+ * prepend_name - prepend a pathname i

[PATCH v4 0/1] dcache: Translating dentry into pathname without taking rename_lock

2013-09-09 Thread Waiman Long
Change History

v3->v4:
 - Extract the begin and end blocks of the rename_lock sequence number
   check into helper functions to make the code easier to read.

v2->v3:
 - Simplify prepend_name() to blindly copy the dname over until it
   reaches a null byte or the specified length leaving the sequence
   check to handle error case.

v1->v2:
 - Check for internal vs external dname, taking d_lock only for
   external dname for safety.
 - Replace memchr() by a byte-by-byte checking for loop.
 - Try lockless dentry to pathname conversion 3 times before falling
   back to taking the rename_lock to prevent live-lock.
 - Make code re-factoring suggested by George Spelvin.

Waiman Long (1):
  dcache: Translating dentry into pathname without taking rename_lock

 fs/dcache.c |  196 ---
 1 files changed, 133 insertions(+), 63 deletions(-)



Re: [PATCH v4 1/1] dcache: Translating dentry into pathname without taking rename_lock

2013-09-09 Thread Waiman Long

On 09/09/2013 01:45 PM, Linus Torvalds wrote:

On Mon, Sep 9, 2013 at 10:29 AM, Al Viro  wrote:

I'm not sure I like mixing rcu_read_lock() into that - d_path() and friends
can do that themselves just fine (it needs to be taken when seq is even),
and e.g. d_walk() doesn't need it at all.  Other than that, I'm OK with
this variant.

Hmm.. I think you need the RCU read lock even when you get the write_seqlock().

Yes, getting the seqlock for write implies that you get a spinlock and
in many normal circumstances that basically is equivalent to being
rcu-locked, but afaik in some configurations that is *not* sufficient
protection against an RCU grace period on another CPU. You need to do
a real rcu_read_lock that increments that whole rcu_read_lock_nesting
level, which a spinlock won't do.

And while the rename sequence lock protects against _renames_, it does
not protect against just plain dentries getting free'd under memory
pressure.

So I think the RCU-readlockness really needs to be independent of the
sequence lock.

 Linus


Yes, you are right. It will be safer to take rcu_read_lock() even if we
are taking the rename_lock. It wasn't needed before as d_lock was taken.
I will update the patch to take rcu_read_lock() out of the helpers to
reflect that.


Regards,
Longman


Re: [PATCH v4 1/1] dcache: Translating dentry into pathname without taking rename_lock

2013-09-09 Thread Waiman Long

On 09/09/2013 01:29 PM, Al Viro wrote:

On Mon, Sep 09, 2013 at 12:18:13PM -0400, Waiman Long wrote:

+/**
+ * read_seqbegin_or_lock - begin a sequence number check or locking block
+ * lock: sequence lock
+ * seq : sequence number to be checked
+ *
+ * First try it once optimistically without taking the lock. If that fails,
+ * take the lock. The sequence number is also used as a marker for deciding
+ * whether to be a reader (even) or writer (odd).
+ * N.B. seq must be initialized to an even number to begin with.
+ */
+static inline void read_seqbegin_or_lock(seqlock_t *lock, int *seq)
+{
+   if (!(*seq & 1)) {  /* Even */
+   *seq = read_seqbegin(lock);
+   rcu_read_lock();
+   } else  /* Odd */
+   write_seqlock(lock);
+}
+static inline int read_seqretry_or_unlock(seqlock_t *lock, int *seq)
+{
+   if (!(*seq & 1)) {  /* Even */
+   rcu_read_unlock();
+   if (read_seqretry(lock, *seq)) {
+   (*seq)++;   /* Take writer lock */
+   return 1;
+   }
+   } else  /* Odd */
+   write_sequnlock(lock);
+   return 0;
+}

I'm not sure I like mixing rcu_read_lock() into that - d_path() and friends
can do that themselves just fine (it needs to be taken when seq is even),
and e.g. d_walk() doesn't need it at all.  Other than that, I'm OK with
this variant.


I think rcu_read_lock() is needed to make sure that the dentry won't be 
freed as we don't take d_lock now.


-Longman


Re: [PATCH v4 1/1] dcache: Translating dentry into pathname without taking rename_lock

2013-09-09 Thread Waiman Long

On 09/09/2013 02:36 PM, Al Viro wrote:

On Mon, Sep 09, 2013 at 07:21:11PM +0100, Al Viro wrote:

Actually, it's better for prepend_path() as well, because it's actually

rcu_read_lock();
seq = read_seqbegin(&rename_lock);
again:

if (error)
goto done;

if (!seqretry_and_lock(&rename_lock, seq))
goto again; /* now as writer */
done:
seqretry_done(&rename_lock, seq);
rcu_read_unlock();

Posted variant will sometimes hit the following path:
* seq_readlock()
* start generating the output
* hit an error
[another process has taken and released rename_lock for some reason]
* hit read_seqretry_and_unlock(), which returns 1.
* retry everything with seq_writelock(), despite the error.

It's not too horrible (we won't be looping indefinitely, ignoring error
all along), but it's certainly subtle enough...

FWIW, what I propose is this (just the d_path-related parts):




I am fine with your proposed change as long as it gets the job done. It 
doesn't really matter if you do it or I do it.


Thanks for taking the effort to make it better for us all.

-Longman


Re: [PATCH v4 1/1] dcache: Translating dentry into pathname without taking rename_lock

2013-09-09 Thread Waiman Long

On 09/09/2013 03:28 PM, Al Viro wrote:

On Mon, Sep 09, 2013 at 08:10:29PM +0100, Al Viro wrote:

On Mon, Sep 09, 2013 at 02:46:57PM -0400, Waiman Long wrote:


I am fine with your proposed change as long as it gets the job done.

I suspect that the real problem is the unlock part of read_seqretry_or_unlock();
for d_walk() we want to be able to check if we need retry and continue walking
if we do not.  Let's do it that way: I've applied your patch as is, with the
next step being
* split read_seqretry_or_unlock():
need_seqretry() (return (!(seq & 1) && read_seqretry(lock, seq))
done_seqretry() (if (seq & 1) write_sequnlock(lock, seq)),
your if (read_seqretry_or_unlock(&rename_lock, &seq))
goto restart;
becoming
if (need_seqretry(&rename_lock, seq)) {
seq = 1;
goto restart;
}
done_seqretry(&rename_lock, seq);

Then d_walk() is trivially massaged to use of read_seqbegin_or_lock(),
need_seqretry() and done_seqretry().  Give me a few, I'll post it...

OK, how about this?  It splits read_seqretry_or_unlock(), takes
rcu_read_{lock,unlock} in the callers and converts d_walk() to those
primitives.  I've pushed that and your commit into vfs.git#experimental
(head at 48f5ec2, should propagate in a few); guys, please give it a look
and comment.


The changes look good to me. I was planning to take rcu_read_lock() out
and do something similar, but your change is good. BTW, I think Linus
wants to add some comments on why the RCU lock is needed without the
rename_lock, but I can put that in with a follow-up patch once the
current change is merged.


Thanks for your help and inspiration on this patch.

-Longman


Re: [PATCH v4 1/1] dcache: Translating dentry into pathname without taking rename_lock

2013-09-09 Thread Waiman Long

On 09/09/2013 08:40 PM, George Spelvin wrote:

I'm really wondering about only trying once before taking the write lock.
Yes, using the lsbit is a cute hack, but are we using it for its cuteness
rather than its effectiveness?

Renames happen occasionally.  If that causes all the current pathname
translations to fall back to the write lock, that is fairly heavy.
Worse, all of those translations will (unnecessarily) bump the write
seqcount, triggering *other* translations to fail back to the write-lock
path.

One patch to fix this would be to have the fallback read algorithm take
sl->lock but *not* touch sl->seqcount, so it wouldn't break concurrent
readers.


Actually, a follow-up patch that I am planning to do is to introduce a 
read_seqlock() primitive in seqlock.h that does exactly that. Then the 
write_seqlock() in this patch will be modified to read_seqlock().


-Longman


Re: kernel BUG at fs/dcache.c:648! with v3.11-7890-ge5c832d

2013-09-10 Thread Waiman Long

On 09/10/2013 04:25 PM, Linus Torvalds wrote:

On Tue, Sep 10, 2013 at 12:57 PM, Mace Moneta  wrote:

The (first) patch looks good; no recurrence. It has only taken 3-5 minutes
before, and I've been up for about half an hour now.

Ok, good. It's pushed out.

Al, your third pile of VFS stuff is also merged. Waiman, that means
that your RCU path creation stuff is in. What else did you have
pending for scalability?

 Linus


I need to clean up some comments in the code. The other thing that I 
want to do is to introduce read_seqlock/read_sequnlock() primitives that 
do the locking without incrementing the sequence number. Then all the 
name lookup and translation code can use the new primitives as they 
don't change any of the protected structures. This will prevent one 
sequence number check failure from cascading into a series of failures 
because of the sequence number change. I will have a patch ready by 
tomorrow morning.
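
The expected caller pattern stays the same as today. Roughly (a sketch only;
walk_example() is a made-up caller, while the helpers are the ones already
in fs/dcache.c):

	void walk_example(void)
	{
		int seq = 0;		/* even: start as a lockless sequence reader */

	again:
		read_seqbegin_or_lock(&rename_lock, &seq);

		/* ... lockless traversal; d_path-style callers also hold rcu_read_lock() ... */

		if (need_seqretry(&rename_lock, seq)) {
			seq = 1;	/* odd: redo the walk holding the lock */
			goto again;
		}
		done_seqretry(&rename_lock, seq);
	}

The only behavioural difference after the change is that the locked pass
takes the spinlock without bumping the sequence count, so concurrent
lockless readers are not forced to retry.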


Regards,
Longman


[PATCH 2/2] dcache: use read_seqlock/unlock() in read_seqbegin_or_lock() & friend

2013-09-11 Thread Waiman Long
This patch modifies read_seqbegin_or_lock() and need_seqretry() to
use newly introduced read_seqlock() and read_sequnlock() primitives
so that they won't change the sequence number even if they fall back
to take the lock.  This is OK as no change to the protected data
structure is being made. It will prevent one fallback to lock taking
from cascading into a series of lock takings that reduce performance
because of the sequence number change. It will also allow other
sequence readers to go ahead while a reader lock is taken.

This patch also updates some of the inaccurate comments in the code.

Signed-off-by: Waiman Long 
---
 fs/dcache.c |   31 ---
 1 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index 4d9df3c..8191ca5 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -90,8 +90,8 @@ static struct kmem_cache *dentry_cache __read_mostly;
 
 /**
  * read_seqbegin_or_lock - begin a sequence number check or locking block
- * lock: sequence lock
- * seq : sequence number to be checked
+ * @lock: sequence lock
+ * @seq : sequence number to be checked
  *
  * First try it once optimistically without taking the lock. If that fails,
  * take the lock. The sequence number is also used as a marker for deciding
@@ -103,7 +103,7 @@ static inline void read_seqbegin_or_lock(seqlock_t *lock, 
int *seq)
if (!(*seq & 1))/* Even */
*seq = read_seqbegin(lock);
else/* Odd */
-   write_seqlock(lock);
+   read_seqlock(lock);
 }
 
 static inline int need_seqretry(seqlock_t *lock, int seq)
@@ -114,7 +114,7 @@ static inline int need_seqretry(seqlock_t *lock, int seq)
 static inline void done_seqretry(seqlock_t *lock, int seq)
 {
if (seq & 1)
-   write_sequnlock(lock);
+   read_sequnlock(lock);
 }
 
 /*
@@ -2673,9 +2673,9 @@ static int prepend(char **buffer, int *buflen, const char 
*str, int namelen)
 
 /**
  * prepend_name - prepend a pathname in front of current buffer pointer
- * buffer: buffer pointer
- * buflen: allocated length of the buffer
- * name:   name string and length qstr structure
+ * @buffer: buffer pointer
+ * @buflen: allocated length of the buffer
+ * @name:   name string and length qstr structure
  *
  * With RCU path tracing, it may race with d_move(). Use ACCESS_ONCE() to
  * make sure that either the old or the new name pointer and length are
@@ -2713,14 +2713,15 @@ static int prepend_name(char **buffer, int *buflen, 
struct qstr *name)
  * @buffer: pointer to the end of the buffer
  * @buflen: pointer to buffer length
  *
- * The function tries to write out the pathname without taking any lock other
- * than the RCU read lock to make sure that dentries won't go away. It only
- * checks the sequence number of the global rename_lock as any change in the
- * dentry's d_seq will be preceded by changes in the rename_lock sequence
- * number. If the sequence number had been change, it will restart the whole
- * pathname back-tracing sequence again. It performs a total of 3 trials of
- * lockless back-tracing sequences before falling back to take the
- * rename_lock.
+ * The function will first try to write out the pathname without taking any
+ * lock other than the RCU read lock to make sure that dentries won't go away.
+ * It only checks the sequence number of the global rename_lock as any change
+ * in the dentry's d_seq will be preceded by changes in the rename_lock
+ * sequence number. If the sequence number had been changed, it will restart
+ * the whole pathname back-tracing sequence again by taking the rename_lock.
+ * In this case, there is no need to take the RCU read lock as the recursive
+ * parent pointer references will keep the dentry chain alive as long as no
+ * rename operation is performed.
  */
 static int prepend_path(const struct path *path,
const struct path *root,
-- 
1.7.1



[PATCH 1/2] seqlock: Add a new blocking reader type

2013-09-11 Thread Waiman Long
The sequence lock (seqlock) was originally designed for the cases
where the readers do not need to block the writers by making the
readers retry the read operation when the data change.

Since then, the use cases have been expanded to include situations
where a thread does not need to change the data (effectively a reader)
at all but has to take the writer lock because it can't tolerate
changes to the protected structure. Some examples are the d_path()
function and the getcwd() syscall in fs/dcache.c where the functions
take the writer lock on rename_lock even though they don't need
to change anything in the protected data structure at all. This is
inefficient as a reader is now blocking other non-blocking readers
by pretending to be a writer.

This patch tries to eliminate this inefficiency by introducing a new
type of blocking reader to the seqlock locking mechanism. This new
blocking reader will not block other non-blocking readers, but will
block other blocking readers and writers.

Signed-off-by: Waiman Long 
---
 include/linux/seqlock.h |   65 +++---
 1 files changed, 60 insertions(+), 5 deletions(-)

diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index 1829905..26be0d9 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -3,15 +3,18 @@
 /*
  * Reader/writer consistent mechanism without starving writers. This type of
  * lock for data where the reader wants a consistent set of information
- * and is willing to retry if the information changes.  Readers never
- * block but they may have to retry if a writer is in
- * progress. Writers do not wait for readers. 
+ * and is willing to retry if the information changes. There are two types
+ * of readers:
+ * 1. Non-blocking readers which never block but they may have to retry if
+ *a writer is in progress. Writers do not wait for non-blocking readers.
+ * 2. Blocking readers which will block if a writer is in progress. A
+ *blocking reader in progress will also block a writer.
  *
- * This is not as cache friendly as brlock. Also, this will not work
+ * This is not as cache friendly as brlock. Also, this may not work well
  * for data that contains pointers, because any writer could
  * invalidate a pointer that a reader was following.
  *
- * Expected reader usage:
+ * Expected non-blocking reader usage:
  * do {
  * seq = read_seqbegin(&foo);
  * ...
@@ -268,4 +271,56 @@ write_sequnlock_irqrestore(seqlock_t *sl, unsigned long 
flags)
spin_unlock_irqrestore(&sl->lock, flags);
 }
 
+/*
+ * The blocking reader locks out other writers, but doesn't update the count.
+ * Acts like a normal spin_lock/unlock.
+ * Don't need preempt_disable() because that is in the spin_lock already.
+ */
+static inline void read_seqlock(seqlock_t *sl)
+{
+   spin_lock(&sl->lock);
+}
+
+static inline void read_sequnlock(seqlock_t *sl)
+{
+   spin_unlock(&sl->lock);
+}
+
+static inline void read_seqlock_bh(seqlock_t *sl)
+{
+   spin_lock_bh(&sl->lock);
+}
+
+static inline void read_sequnlock_bh(seqlock_t *sl)
+{
+   spin_unlock_bh(&sl->lock);
+}
+
+static inline void read_seqlock_irq(seqlock_t *sl)
+{
+   spin_lock_irq(&sl->lock);
+}
+
+static inline void read_sequnlock_irq(seqlock_t *sl)
+{
+   spin_unlock_irq(&sl->lock);
+}
+
+static inline unsigned long __read_seqlock_irqsave(seqlock_t *sl)
+{
+   unsigned long flags;
+
+   spin_lock_irqsave(&sl->lock, flags);
+   return flags;
+}
+
+#define read_seqlock_irqsave(lock, flags)  \
+   do { flags = __read_seqlock_irqsave(lock); } while (0)
+
+static inline void
+read_sequnlock_irqrestore(seqlock_t *sl, unsigned long flags)
+{
+   spin_unlock_irqrestore(&sl->lock, flags);
+}
+
 #endif /* __LINUX_SEQLOCK_H */
-- 
1.7.1



Re: [PATCH 1/2] seqlock: Add a new blocking reader type

2013-09-11 Thread Waiman Long

On 09/11/2013 10:55 AM, Al Viro wrote:

On Wed, Sep 11, 2013 at 10:28:26AM -0400, Waiman Long wrote:

The sequence lock (seqlock) was originally designed for the cases
where the readers do not need to block the writers by making the
readers retry the read operation when the data change.

Since then, the use cases have been expanded to include situations
where a thread does not need to change the data (effectively a reader)
at all but have to take the writer lock because it can't tolerate
changes to the protected structure. Some examples are the d_path()
function and the getcwd() syscall in fs/dcache.c where the functions
take the writer lock on rename_lock even though they don't need
to change anything in the protected data structure at all. This is
inefficient as a reader is now blocking other non-blocking readers
by pretending to be a writer.

This patch tries to eliminate this inefficiency by introducing a new
type of blocking reader to the seqlock locking mechanism. This new
blocking reader will not block other non-blocking readers, but will
block other blocking readers and writers.

Umm...  That's misleading - it doesn't _block_, it spins.  Moreover,
seq_readbegin() also spins in the presence of a writer; the main property
of this one is that it keeps writers away.


I used "block" in the sense that it will stop a writer from moving 
forward. I will update the commit log to make that more clear.

Folks, any suggestions on better names?  The semantics we are getting is


I will welcome any better name suggestion and will incorporate that in 
the patch.


-Longman


Re: [PATCH 1/2] seqlock: Add a new blocking reader type

2013-09-11 Thread Waiman Long

On 09/11/2013 01:26 PM, Al Viro wrote:

On Wed, Sep 11, 2013 at 12:33:35PM -0400, Waiman Long wrote:


Folks, any suggestions on better names?  The semantics we are getting is

I will welcome any better name suggestion and will incorporate that
in the patch.

FWIW, the suggestions I've seen so far had been

seq_exreadlock() [ex for exclusive]
seq_exclreadlock() [ditto, and IMO fails the "easily read over the phone"
test - /sekv-excre...ARRGH/]
seq_prot_readlock() [prot for protected, as in DLM protected read]


Following the naming convention in seqlock.h that all functions begin
with read_ or write_, does read_seqexcl_lock() look OK to you?


-Longman


Re: [3.12-rc1] Dependency on module-init-tools >= 3.11 ?

2013-09-12 Thread Waiman Long

On 09/12/2013 06:29 AM, Herbert Xu wrote:

On Thu, Sep 12, 2013 at 07:20:23PM +0900, Tetsuo Handa wrote:

Herbert Xu wrote:

The trouble is not all distros will include the softdep modules in
the initramfs.  So for now I think we will have to live with a fallback.

I see.

Herbert Xu wrote:

OK, can you please try this patch on top of the current tree?

This way at least you'll have a working system until your initramfs
tool is fixed to do the right thing.

I tested the patch and confirmed that the boot failure was solved.

But arch/x86/crypto/crct10dif-pclmul.ko is not included into initramfs and
therefore we cannot benefit from PCLMULQDQ version.

That is expected and is also the status quo.  So once the initrd
generation tool is fixed to include softdeps it will work properly.

Thanks!


I would like to report that I also have the same boot problem on a
RHEL6.4 box with the crypto patch. My workaround is to force the kernel
build to have the crc_t10dif code built in by changing the config file:


4889c4889
< CONFIG_CRYPTO_CRCT10DIF=m
---
> CONFIG_CRYPTO_CRCT10DIF=y
5002c5002
< CONFIG_CRC_T10DIF=m
---
> CONFIG_CRC_T10DIF=y

This solved the boot problem without any additional patch.  Do you think
you should consider changing the configuration default to "y" instead of
"m", or not allowing the "m" option at all?


Thanks!


[PATCH 0/2 v2] dcache: get/release read lock in read_seqbegin_or_lock() & friend

2013-09-12 Thread Waiman Long
Change log
--
v1->v2:
  - Rename the new seqlock primitives to read_seqexcl_lock* and
read_seqexcl_unlock*.
  - Clarify in the commit log and comments about the exclusive nature
of the read lock.

Waiman Long (2):
  seqlock: Add a new locking reader type
  dcache: get/release read lock in read_seqbegin_or_lock() & friend

 fs/dcache.c |   31 +++--
 include/linux/seqlock.h |   68 +++---
 2 files changed, 79 insertions(+), 20 deletions(-)



[PATCH 2/2 v2] dcache: get/release read lock in read_seqbegin_or_lock() & friend

2013-09-12 Thread Waiman Long
This patch modifies read_seqbegin_or_lock() and need_seqretry() to
use newly introduced read_seqexcl_lock() and read_seqexcl_unlock()
primitives so that they won't change the sequence number even if
they fall back to take the lock.  This is OK as no change to the
protected data structure is being made. It will prevent one fallback
to lock taking from cascading into a series of lock taking reducing
performance because of the sequence number change. It will also allow
other sequence readers to go forward while an exclusive reader lock
is taken.

This patch also updates some of the inaccurate comments in the code.

Signed-off-by: Waiman Long 
---
 fs/dcache.c |   31 ---
 1 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index 4d9df3c..9e88367 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -90,8 +90,8 @@ static struct kmem_cache *dentry_cache __read_mostly;
 
 /**
  * read_seqbegin_or_lock - begin a sequence number check or locking block
- * lock: sequence lock
- * seq : sequence number to be checked
+ * @lock: sequence lock
+ * @seq : sequence number to be checked
  *
  * First try it once optimistically without taking the lock. If that fails,
  * take the lock. The sequence number is also used as a marker for deciding
@@ -103,7 +103,7 @@ static inline void read_seqbegin_or_lock(seqlock_t *lock, 
int *seq)
if (!(*seq & 1))/* Even */
*seq = read_seqbegin(lock);
else/* Odd */
-   write_seqlock(lock);
+   read_seqexcl_lock(lock);
 }
 
 static inline int need_seqretry(seqlock_t *lock, int seq)
@@ -114,7 +114,7 @@ static inline int need_seqretry(seqlock_t *lock, int seq)
 static inline void done_seqretry(seqlock_t *lock, int seq)
 {
if (seq & 1)
-   write_sequnlock(lock);
+   read_seqexcl_unlock(lock);
 }
 
 /*
@@ -2673,9 +2673,9 @@ static int prepend(char **buffer, int *buflen, const char 
*str, int namelen)
 
 /**
  * prepend_name - prepend a pathname in front of current buffer pointer
- * buffer: buffer pointer
- * buflen: allocated length of the buffer
- * name:   name string and length qstr structure
+ * @buffer: buffer pointer
+ * @buflen: allocated length of the buffer
+ * @name:   name string and length qstr structure
  *
  * With RCU path tracing, it may race with d_move(). Use ACCESS_ONCE() to
  * make sure that either the old or the new name pointer and length are
@@ -2713,14 +2713,15 @@ static int prepend_name(char **buffer, int *buflen, 
struct qstr *name)
  * @buffer: pointer to the end of the buffer
  * @buflen: pointer to buffer length
  *
- * The function tries to write out the pathname without taking any lock other
- * than the RCU read lock to make sure that dentries won't go away. It only
- * checks the sequence number of the global rename_lock as any change in the
- * dentry's d_seq will be preceded by changes in the rename_lock sequence
- * number. If the sequence number had been change, it will restart the whole
- * pathname back-tracing sequence again. It performs a total of 3 trials of
- * lockless back-tracing sequences before falling back to take the
- * rename_lock.
+ * The function will first try to write out the pathname without taking any
+ * lock other than the RCU read lock to make sure that dentries won't go away.
+ * It only checks the sequence number of the global rename_lock as any change
+ * in the dentry's d_seq will be preceded by changes in the rename_lock
+ * sequence number. If the sequence number had been changed, it will restart
+ * the whole pathname back-tracing sequence again by taking the rename_lock.
+ * In this case, there is no need to take the RCU read lock as the recursive
+ * parent pointer references will keep the dentry chain alive as long as no
+ * rename operation is performed.
  */
 static int prepend_path(const struct path *path,
const struct path *root,
-- 
1.7.1
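
For reference, this is roughly how a caller is expected to use the
combined pattern after this change (a minimal sketch modeled on the
fs/dcache.c users; the function name walk_something() is made up here,
and the RCU/error handling details are simplified):

static void walk_something(seqlock_t *lock)
{
	int seq = 0;		/* even: start as a lockless sequence reader */

	rcu_read_lock();
restart:
	read_seqbegin_or_lock(lock, &seq);

	/* ... lockless walk of the protected data structure ... */

	if (need_seqretry(lock, seq)) {
		seq = 1;	/* odd: retry as an exclusive locking reader */
		goto restart;
	}
	done_seqretry(lock, seq);
	rcu_read_unlock();
}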



[PATCH 1/2 v2] seqlock: Add a new locking reader type

2013-09-12 Thread Waiman Long
The sequence lock (seqlock) was originally designed for cases where
the readers do not need to block the writers; instead, the readers
retry the read operation when the data changes.

Since then, the use cases have expanded to include situations where
a thread does not need to change the data at all (it is effectively a
reader) but has to take the writer lock because it can't tolerate
changes to the protected structure. Some examples are the d_path()
function and the getcwd() syscall in fs/dcache.c, where the functions
take the writer lock on rename_lock even though they don't need to
change anything in the protected data structure. This is inefficient,
as a reader pretending to be a writer now blocks other sequence
readers from moving forward.

This patch tries to eliminate this inefficiency by introducing a new
type of locking reader to the seqlock mechanism. This new locking
reader takes an exclusive lock, preventing writers and other locking
readers from going forward. However, it won't affect the progress of
sequence readers, as the sequence number isn't changed.

Signed-off-by: Waiman Long 
---
 include/linux/seqlock.h |   68 +++---
 1 files changed, 63 insertions(+), 5 deletions(-)

diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index 1829905..9bd84b5 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -3,15 +3,21 @@
 /*
  * Reader/writer consistent mechanism without starving writers. This type of
  * lock for data where the reader wants a consistent set of information
- * and is willing to retry if the information changes.  Readers never
- * block but they may have to retry if a writer is in
- * progress. Writers do not wait for readers. 
+ * and is willing to retry if the information changes. There are two types
+ * of readers:
+ * 1. Sequence readers which never block a writer but they may have to retry
+ *if a writer is in progress by detecting change in sequence number.
+ *Writers do not wait for a sequence reader.
+ * 2. Locking readers which will wait if a writer or another locking reader
+ *is in progress. A locking reader in progress will also block a writer
+ *from going forward. Unlike the regular rwlock, the read lock here is
+ *exclusive so that only one locking reader can get it.
  *
- * This is not as cache friendly as brlock. Also, this will not work
+ * This is not as cache friendly as brlock. Also, this may not work well
  * for data that contains pointers, because any writer could
  * invalidate a pointer that a reader was following.
  *
- * Expected reader usage:
+ * Expected non-blocking reader usage:
  * do {
  * seq = read_seqbegin(&foo);
  * ...
@@ -268,4 +274,56 @@ write_sequnlock_irqrestore(seqlock_t *sl, unsigned long 
flags)
spin_unlock_irqrestore(&sl->lock, flags);
 }
 
+/*
+ * A locking reader exclusively locks out other writers and locking readers,
+ * but doesn't update the sequence number. Acts like a normal spin_lock/unlock.
+ * Don't need preempt_disable() because that is in the spin_lock already.
+ */
+static inline void read_seqexcl_lock(seqlock_t *sl)
+{
+   spin_lock(&sl->lock);
+}
+
+static inline void read_seqexcl_unlock(seqlock_t *sl)
+{
+   spin_unlock(&sl->lock);
+}
+
+static inline void read_seqexcl_lock_bh(seqlock_t *sl)
+{
+   spin_lock_bh(&sl->lock);
+}
+
+static inline void read_seqexcl_unlock_bh(seqlock_t *sl)
+{
+   spin_unlock_bh(&sl->lock);
+}
+
+static inline void read_seqexcl_lock_irq(seqlock_t *sl)
+{
+   spin_lock_irq(&sl->lock);
+}
+
+static inline void read_seqexcl_unlock_irq(seqlock_t *sl)
+{
+   spin_unlock_irq(&sl->lock);
+}
+
+static inline unsigned long __read_seqexcl_lock_irqsave(seqlock_t *sl)
+{
+   unsigned long flags;
+
+   spin_lock_irqsave(&sl->lock, flags);
+   return flags;
+}
+
+#define read_seqexcl_lock_irqsave(lock, flags) \
+   do { flags = __read_seqexcl_lock_irqsave(lock); } while (0)
+
+static inline void
+read_seqexcl_unlock_irqrestore(seqlock_t *sl, unsigned long flags)
+{
+   spin_unlock_irqrestore(&sl->lock, flags);
+}
+
 #endif /* __LINUX_SEQLOCK_H */
-- 
1.7.1
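
As a counterpart to the "Expected non-blocking reader usage" comment in
the header above, a minimal sketch of the new locking-reader usage (using
the read_seqexcl_* names from this version of the patch; they were
renamed on merge, see the follow-up mail):

	/* Locking reader: needs a stable view but makes no modification. */
	read_seqexcl_lock(&foo);
	/* ... walk the protected data; writers and other locking readers wait ... */
	read_seqexcl_unlock(&foo);

	/*
	 * Because the sequence count is not bumped, concurrent sequence
	 * readers using read_seqbegin()/read_seqretry() keep making
	 * progress and never retry on account of this locking reader.
	 */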



Re: [PATCH 0/2 v2] dcache: get/release read lock in read_seqbegin_or_lock() & friend

2013-09-12 Thread Waiman Long

On 09/12/2013 12:38 PM, Linus Torvalds wrote:

On Thu, Sep 12, 2013 at 7:55 AM, Waiman Long  wrote:

Change log
--
v1->v2:
   - Rename the new seqlock primitives to read_seqexcl_lock* and
 read_seqexcl_unlock*.

Applied. Except I peed in the snow and renamed the functions
again. That whole "seqexcl" looked too odd to me. It not only looks a
bit too much like random noise, but it makes it seem a whole different
lock from the "seqlock" thing.

I wanted to pattern the name after "write_seq[un]lock()", since it
most resembles that (not just in implementation, but in usage: the
traditional read-sequence isn't a lock, it's a begin/retry sequence,
so the usage pattern is totally different too, and the naming is
different).

I ended up picking "read_seq[un]lock_excl()". I could have gone with
"excl_" as a prefix too, I guess. Whatever. Now the "_excl" thing
looks a bit like the "_bh"/"_irqX" context modifier, and I think it
matches our normal lock naming pattern better.

 Linus


I think your new names are better than mine. I am not good at naming 
stuff. Thanks for the merge and the rename.


-Longman


Re: [PATCH 0/2 v2] dcache: get/release read lock in read_seqbegin_or_lock() & friend

2013-09-12 Thread Waiman Long

On 09/12/2013 01:30 PM, Linus Torvalds wrote:

On Thu, Sep 12, 2013 at 9:38 AM, Linus Torvalds
  wrote:

On Thu, Sep 12, 2013 at 7:55 AM, Waiman Long  wrote:

Change log
--
v1->v2:
   - Rename the new seqlock primitives to read_seqexcl_lock* and
 read_seqexcl_unlock*.

Applied.

Btw, when I tried to benchmark this, I failed miserably.

Why?


This patch is just a safety guard to prevent occasional bad performance 
caused by unlucky timing. It will not improve performance in most cases 
because the seqbegin/seqretry sequence usually succeeds without an actual retry.



If you do a threaded benchmark of "getcwd()", you end up spending all
your time in a spinlock anyway: get_fs_root_and_pwd() takes the
fs->lock to get the root/pwd.


I am aware that there is another spinlock bottleneck in the fs struct 
for getcwd().



Now, AIM7 probably uses processes, not threads, so you don't see this,
and maybe I shouldn't care. But looking at it, it annoys me
enormously, because the whole get_fs_root_and_pwd() is just stupid.


AIM7 doesn't make many getcwd() calls, so it is not a real bottleneck for 
the benchmark. The lockref patch boosts the short workload performance. 
The prepend_path patch was to fix incorrect perf record data, as 
perf makes heavy use of d_path(). The change made to getcwd() was just a 
side benefit, but getcwd() still has other spinlock bottlenecks.



Putting it all under the RCU lock and then changing it to use
get_fs_root_and_pwd_rcu() that just uses the fs->seq sequence
read-lock looks absolutely trivial.


Yes, I think we can do something similar for this. I will take a look to 
see how it can be fixed.
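
A rough sketch of what such a helper could look like (assuming the
fs->seq seqcount in struct fs_struct that Linus refers to; the function
name follows his suggestion and the details are illustrative, not a
tested implementation):

static void get_fs_root_and_pwd_rcu(struct fs_struct *fs,
				    struct path *root, struct path *pwd)
{
	unsigned seq;

	do {
		seq = read_seqcount_begin(&fs->seq);
		*root = fs->root;
		*pwd = fs->pwd;
	} while (read_seqcount_retry(&fs->seq, seq));
}

The caller would hold rcu_read_lock() instead of taking fs->lock, and
would have to use the copied paths without blocking unless it takes
references on them.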


-Longman


[PATCH] perf: Fix potential compilation error with some compilers

2013-10-08 Thread Waiman Long
The build of the perf tool failed on a SLES11 SP3 system with the
following compilation error:

cc1: warnings being treated as errors
util/scripting-engines/trace-event-perl.c: In function
‘perl_process_tracepoint’:
util/scripting-engines/trace-event-perl.c:285: error: format ‘%lu’
expects type ‘long unsigned int’, but argument 2 has type ‘__u64’

This patch replaces PRIu64, which expands to "lu" here, with an explicit
"llu" to fix the problem, as __u64 is of type "unsigned long long".

Signed-off-by: Waiman Long 
---
 .../perf/util/scripting-engines/trace-event-perl.c |6 +-
 1 files changed, 5 insertions(+), 1 deletions(-)

diff --git a/tools/perf/util/scripting-engines/trace-event-perl.c 
b/tools/perf/util/scripting-engines/trace-event-perl.c
index a85e4ae..d6eb9c5 100644
--- a/tools/perf/util/scripting-engines/trace-event-perl.c
+++ b/tools/perf/util/scripting-engines/trace-event-perl.c
@@ -281,8 +281,12 @@ static void perl_process_tracepoint(union perf_event 
*perf_event __maybe_unused,
return;
 
event = find_cache_event(evsel);
+   /*
+* attr.config is a __u64 which requires "%llu" to avoid compilation
+* error/warning with some compilers.
+*/
if (!event)
-   die("ug! no event found for type %" PRIu64, evsel->attr.config);
+   die("ug! no event found for type %llu", evsel->attr.config);
 
pid = raw_field_value(event, "common_pid", data);
 
-- 
1.7.1
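
A small standalone illustration of the format mismatch described above
(this is not code from the perf tree): on an LP64 system where uint64_t
is "unsigned long", PRIu64 expands to "lu", while __u64 is "unsigned
long long", so -Werror turns the -Wformat warning into a hard error.

#include <inttypes.h>
#include <stdio.h>

int main(void)
{
	unsigned long long config = 42;	/* stand-in for a __u64 field */

	/*
	 * With PRIu64 == "lu", the following would trigger -Wformat
	 * (an error under -Werror):
	 *
	 *	printf("type %" PRIu64 "\n", config);
	 */
	printf("type %llu\n", config);	/* matches unsigned long long */
	return 0;
}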



Re: [PATCH v7 1/4] spinlock: A new lockref structure for lockless update of refcount

2013-08-29 Thread Waiman Long

On 08/29/2013 07:42 PM, Linus Torvalds wrote:
Waiman? Mind looking at this and testing? Linus 


Sure, I will try out the patch tomorrow morning and see how it works out 
for my test case.


Regards,
Longman


Re: [PATCH RFC v2 1/2] qspinlock: Introducing a 4-byte queue spinlock implementation

2013-08-29 Thread Waiman Long

On 08/29/2013 01:03 PM, Alexander Fyodorov wrote:

29.08.2013, 19:25, "Waiman Long":

What I have been thinking is to set a flag in an architecture specific
header file to tell if the architecture need a memory barrier. The
generic code will then either do a smp_mb() or barrier() depending on
the presence or absence of the flag. I would prefer to do more in the
generic code, if possible.

If you use flag then you'll have to check it manually. It is better to add new 
smp_mb variant, I suggest calling it smp_mb_before_store(), and define it to 
barrier() on x86.


I am sorry that I was not clear in my previous mail. I meant a flag/macro 
for compile-time checking rather than runtime checking.
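
A minimal sketch of the compile-time approach described above (the flag
and macro names here are hypothetical, purely for illustration):

/*
 * In an architecture header (e.g. the x86 qspinlock header): x86 does
 * not need a full barrier here, so the flag is simply left undefined.
 * A weakly ordered architecture would define it:
 *
 *	#define ARCH_QSPINLOCK_NEEDS_SMP_MB
 */

/* In the generic queue spinlock code: */
#ifdef ARCH_QSPINLOCK_NEEDS_SMP_MB
# define queue_mb_before_store()	smp_mb()
#else
# define queue_mb_before_store()	barrier()
#endif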


Regards,
Longman


Re: [PATCH v7 1/4] spinlock: A new lockref structure for lockless update of refcount

2013-08-30 Thread Waiman Long

On 08/29/2013 11:54 PM, Linus Torvalds wrote:

On Thu, Aug 29, 2013 at 8:12 PM, Waiman Long  wrote:

On 08/29/2013 07:42 PM, Linus Torvalds wrote:

Waiman? Mind looking at this and testing? Linus

Sure, I will try out the patch tomorrow morning and see how it works out for
my test case.

Ok, thanks, please use this slightly updated patch attached here.




I tested your patch on a 2-socket (12 cores, 24 threads) DL380 with 
2.9GHz Westmere-EX CPUs. The test results of your test program (with max 
threads increased to 24 to match the thread count) were:


with patch = 68M
w/o patch = 12M

So it was an almost 6X improvement, which I think is really good. A 
dual-socket machine, these days, shouldn't be considered a "BIG" 
machine; they are pretty common in many organizations.


I have reviewed the patch, and it looks good to me, with the exception 
that I added a cpu_relax() call at the end of the while loop in the 
CMPXCHG_LOOP macro.
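
For context, this is the shape of that loop with the cpu_relax() added
(a simplified sketch of the CMPXCHG_LOOP macro from the lockref patch
under discussion, not necessarily the exact code):

#define CMPXCHG_LOOP(CODE, SUCCESS) do {				\
	struct lockref old;						\
	BUILD_BUG_ON(sizeof(old) != 8);					\
	old.lock_count = ACCESS_ONCE(lockref->lock_count);		\
	while (likely(arch_spin_value_unlocked(old.lock.rlock.raw_lock))) { \
		struct lockref new = old, prev = old;			\
		CODE							\
		old.lock_count = cmpxchg(&lockref->lock_count,		\
					 old.lock_count, new.lock_count); \
		if (likely(old.lock_count == prev.lock_count)) {	\
			SUCCESS;					\
		}							\
		cpu_relax();	/* back off before retrying the cmpxchg */ \
	}								\
} while (0)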


I also got the perf data of the test runs with and without the patch.

With patch:

 29.24%a.out  [kernel.kallsyms][k] lockref_get_or_lock
 19.65%a.out  [kernel.kallsyms][k] lockref_put_or_lock
 14.11%a.out  [kernel.kallsyms][k] dput
  5.37%a.out  [kernel.kallsyms][k] __d_lookup_rcu
  5.29%a.out  [kernel.kallsyms][k] lg_local_lock
  4.59%a.out  [kernel.kallsyms][k] d_rcu_to_refcount
:
  0.13%a.out  [kernel.kallsyms][k] complete_walk
:
  0.01%a.out  [kernel.kallsyms][k] _raw_spin_lock

Without patch:

 93.50%a.out  [kernel.kallsyms][k] _raw_spin_lock
  0.96%a.out  [kernel.kallsyms][k] dput
  0.80%a.out  [kernel.kallsyms][k] kmem_cache_free
  0.75%a.out  [kernel.kallsyms][k] lg_local_lock
  0.48%a.out  [kernel.kallsyms][k] complete_walk
  0.45%a.out  [kernel.kallsyms][k] __d_lookup_rcu

For the other test cases that I am interested in, like the AIM7 
benchmark, your patch may not be as good as my original one. I got 1-3M 
JPM (it varied quite a lot between runs) in the short workloads on an 
80-core system. My original patch got 6M JPM. However, that test was done 
on a 3.10-based kernel, so I need to run more tests to see whether the 
kernel version has an effect on the JPM results.


Anyway, I think this patch is good performance-wise. I remember that a 
while ago a lock contention problem in the dentry code, probably 
involving complete_walk(), was reported internally. This patch will 
certainly help in that case.


I will do more investigation to see how to make this patch work better 
for my test cases.


Thank you for taking the effort to optimize the complete_walk() and 
unlazy_walk() functions, which were not covered in my original patch. That 
will make the patch work even better under more circumstances. I really 
appreciate it.


Best regards,
Longman





Re: [PATCH v7 1/4] spinlock: A new lockref structure for lockless update of refcount

2013-08-30 Thread Waiman Long

On 08/30/2013 02:53 PM, Linus Torvalds wrote:
So the perf data would be *much* more interesting for a more varied 
load. I know pretty much exactly what happens with my silly 
test-program, and as you can see it never really gets to the actual 
spinlock, because that test program will only ever hit the fast-path 
case. It would be much more interesting to see another load that may 
trigger the d_lock actually being taken. So:

For the other test cases that I am interested in, like the AIM7 benchmark,
your patch may not be as good as my original one. I got 1-3M JPM (varied
quite a lot in different runs) in the short workloads on a 80-core system.
My original one got 6M JPM. However, the test was done on 3.10 based kernel.
So I need to do more test to see if that has an effect on the JPM results.

I'd really like to see a perf profile of that, particularly with some
call chain data for the relevant functions (ie "what it is that causes
us to get to spinlocks"). Because it may well be that you're hitting
some of the cases that I didn't see, and thus didn't notice.

In particular, I suspect AIM7 actually creates/deletes files and/or
renames them too. Or maybe I screwed up the dget_parent() special case
thing, which mattered because AIM7 did a lot of getcwd() calls or
someting odd like that.

 Linus


Below is the perf data of my short workloads run in an 80-core DL980:

13.60%    reaim  [kernel.kallsyms]    [k] _raw_spin_lock_irqsave

 |--48.79%-- tty_ldisc_try
 |--48.58%-- tty_ldisc_deref
  --2.63%-- [...]

11.31%  swapper  [kernel.kallsyms][k] intel_idle
   |--99.94%-- cpuidle_enter_state
--0.06%-- [...]

 4.86%reaim  [kernel.kallsyms][k] lg_local_lock
 |--59.41%-- mntput_no_expire
 |--19.37%-- path_init
 |--15.14%-- d_path
 |--5.88%-- sys_getcwd
  --0.21%-- [...]

 3.00%reaim  reaim[.] mul_short

 2.41%reaim  reaim[.] mul_long
 |--87.21%-- 0xbc614e
  --12.79%-- (nil)

 2.29%reaim  reaim[.] mul_int

 2.20%reaim  [kernel.kallsyms][k] _raw_spin_lock
 |--12.81%-- prepend_path
 |--9.90%-- lockref_put_or_lock
 |--9.62%-- __rcu_process_callbacks
 |--8.77%-- load_balance
 |--6.40%-- lockref_get
 |--5.55%-- __mutex_lock_slowpath
 |--4.85%-- __mutex_unlock_slowpath
 |--4.83%-- inet_twsk_schedule
 |--4.27%-- lockref_get_or_lock
 |--2.19%-- task_rq_lock
 |--2.13%-- sem_lock
 |--2.09%-- scheduler_tick
 |--1.88%-- try_to_wake_up
 |--1.53%-- kmem_cache_free
 |--1.30%-- unix_create1
 |--1.22%-- unix_release_sock
 |--1.21%-- process_backlog
 |--1.11%-- unix_stream_sendmsg
 |--1.03%-- enqueue_to_backlog
 |--0.85%-- rcu_accelerate_cbs
 |--0.79%-- unix_dgram_sendmsg
 |--0.76%-- do_anonymous_page
 |--0.70%-- unix_stream_recvmsg
 |--0.69%-- unix_stream_connect
 |--0.64%-- net_rx_action
 |--0.61%-- tcp_v4_rcv
 |--0.59%-- __do_fault
 |--0.54%-- new_inode_pseudo
 |--0.52%-- __d_lookup
  --10.62%-- [...]

 1.19%reaim  [kernel.kallsyms][k] mspin_lock
 |--99.82%-- __mutex_lock_slowpath
  --0.18%-- [...]

 1.01%reaim  [kernel.kallsyms][k] lg_global_lock
 |--51.62%-- __shmdt
  --48.38%-- __shmctl

There is more contention in the lglock than I remember from the run on 
3.10. This is an area that I need to look at. In fact, lglock is 
becoming a problem for really large machines with a lot of cores. We have 
a prototype 16-socket machine with 240 cores under development. The cost 
of doing a lg_global_lock will be very high on that type of machine, 
given that it is already high on this 80-core machine. I have been 
thinking that, instead of per-cpu spinlocks, we could change the locking 
to the per-node level. While there will be more contention on 
lg_local_lock, the cost of doing a lg_global_lock will be much lower, and 
contention within the local die should not be too bad. That will require 
either a per-node variable infrastructure or something simulated with the 
existing per-cpu subsystem.
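
A very rough sketch of the per-node idea (all names are hypothetical,
and lockdep annotations as well as CPU/node hotplug handling are omitted
for brevity):

#include <linux/cache.h>
#include <linux/nodemask.h>
#include <linux/spinlock.h>
#include <linux/topology.h>

struct nglock {
	spinlock_t lock ____cacheline_aligned_in_smp;
};

static struct nglock node_locks[MAX_NUMNODES];

/* "Local" side: take the lock of the caller's NUMA node. */
static spinlock_t *ng_local_lock(void)
{
	spinlock_t *lock = &node_locks[numa_node_id()].lock;

	spin_lock(lock);
	return lock;			/* caller unlocks this same lock */
}

static void ng_local_unlock(spinlock_t *lock)
{
	spin_unlock(lock);
}

/* "Global" side: only nr_node_ids locks to take instead of one per CPU. */
static void ng_global_lock(void)
{
	int node;

	for (node = 0; node < nr_node_ids; node++)
		spin_lock(&node_locks[node].lock);
}

static void ng_global_unlock(void)
{
	int node;

	for (node = nr_node_ids - 1; node >= 0; node--)
		spin_unlock(&node_locks[node].lock);
}
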

Re: [PATCH v7 1/4] spinlock: A new lockref structure for lockless update of refcount

2013-08-30 Thread Waiman Long

On 08/30/2013 03:40 PM, Al Viro wrote:

On Fri, Aug 30, 2013 at 03:20:48PM -0400, Waiman Long wrote:


There are more contention in the lglock than I remember for the run
in 3.10. This is an area that I need to look at. In fact, lglock is
becoming a problem for really large machine with a lot of cores. We
have a prototype 16-socket machine with 240 cores under development.
The cost of doing a lg_global_lock will be very high in that type of
machine given that it is already high in this 80-core machine. I
have been thinking about instead of per-cpu spinlocks, we could
change the locking to per-node level. While there will be more
contention for lg_local_lock, the cost of doing a lg_global_lock
will be much lower and contention within the local die should not be
too bad. That will require either a per-node variable infrastructure
or simulated with the existing per-cpu subsystem.

Speaking of lglock, there's a low-hanging fruit in that area: we have
no reason whatsoever to put anything but regular files with FMODE_WRITE
on the damn per-superblock list - the *only* thing it's used for is
mark_files_ro(), which will skip everything except those.  And since
read opens normally outnumber the writes quite a bit...  Could you
try the diff below and see if it changes the picture?  files_lglock
situation ought to get better...




Sure. I will try that out, but it probably won't help too much in this 
test case. The perf profile that I sent out in my previous mail is only 
partial. The actual one for lg_global_lock was:


 1.01%reaim  [kernel.kallsyms][k] lg_global_lock
  |
  --- lg_global_lock
  mntput_no_expire
  mntput
  __fput
  fput
  task_work_run
  do_notify_resume
  int_signal
 |
 |--51.62%-- __shmdt
 |
  --48.38%-- __shmctl

So it is the mntput_no_expire() function that is doing all the 
lg_global_lock() calls.


Regards,
Longman


Re: [PATCH v7 1/4] spinlock: A new lockref structure for lockless update of refcount

2013-08-30 Thread Waiman Long

On 08/30/2013 03:33 PM, Linus Torvalds wrote:

On Fri, Aug 30, 2013 at 12:20 PM, Waiman Long  wrote:

Below is the perf data of my short workloads run in an 80-core DL980:

Ok, that doesn't look much like d_lock any more. Sure, there's a small
amount of spinlocking going on with lockref being involved, but on the
whole even that looks more like getcwd and other random things.


Yes, d_lock contention isn't a major item in the perf profile. However, 
sometimes a small change can lead to a noticeable improvement in 
performance.

I do agree that getcwd() can probably be hugely optimized. Nobody has
ever bothered, because it's never really performance-critical, and I
think AIM7 ends up just doing something really odd. I bet we could fix
it entirely if we cared enough.

The prepend_path() overhead isn't all due to getcwd(). The correct profile should be:


|--12.81%-- prepend_path
|          |
|          |--67.35%-- d_path
|          |          |
|          |          |--60.72%-- proc_pid_readlink
|          |          |           sys_readlinkat
|          |          |           sys_readlink
|          |          |           system_call_fastpath
|          |          |           __GI___readlink
|          |          |           0x302f64662f666c
|          |          |
|          |           --39.28%-- perf_event_mmap_event
|          |
|           --32.65%-- sys_getcwd
|                      system_call_fastpath
|                      __getcwd

Yes, the perf subsystem itself can contribute a sizeable portion of the 
spinlock contention. In fact, I have also applied my seqlock patch, sent 
a while ago, to the test kernel in order to get a more accurate perf 
profile. The seqlock patch allows concurrent d_path() calls without one 
blocking the others. On the 240-core prototype machine, it was not 
possible to get an accurate perf profile for some workloads because more 
than 50% of the time was spent in spinlock contention caused by the use 
of perf. An accurate perf profile can only be obtained in those cases by 
applying my lockref and seqlock patches. I hope someone will have the 
time to review my seqlock patch to see what additional changes will be 
needed. I would really like to see it merged in some form in 3.12.



I just wonder if it's even worth it (I assume AIM7 is something HP
uses internally, because I've never really heard of anybody else
caring)


Our performance group is actually pretty new. It was formed 2 years ago, 
and we began actively participating in Linux kernel development just 
in the past year.


We use the AIM7 benchmark internally primarily because it is easy to run 
and covers quite a lot of different areas in the kernel. We are also 
using specJBB and SwingBench for performance benchmarking, and we are 
looking for more benchmarks to use in the future.


Regards,
Longman


  1   2   3   4   5   6   7   8   9   10   >