[PATCH] arch/powerpc: use BUILD_BUG() when detecting unfit {cmp}xchg size

2016-02-23 Thread Pan Xinhui
From: pan xinhui 

__xchg_called_with_bad_pointer() can't tell us which code uses {cmp}xchg
in an incorrect way, and no error is reported until the link stage.
To catch such issues easily at compile time, we use BUILD_BUG() here.
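
For illustration only (hypothetical caller, not part of this patch):

	u8 flag;
	...
	(void)xchg(&flag, 1);	/* size 1 is not handled by __xchg() */

With the old scheme this compiles cleanly and only fails at the final link
("undefined reference to `__xchg_called_with_bad_pointer'"), far from the
offending line; with BUILD_BUG() the compiler rejects the translation unit
that contains the bad call right away.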

Signed-off-by: pan xinhui 
---
 arch/powerpc/include/asm/cmpxchg.h | 19 +--
 1 file changed, 5 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/include/asm/cmpxchg.h b/arch/powerpc/include/asm/cmpxchg.h
index d1a8d93..20c0a30 100644
--- a/arch/powerpc/include/asm/cmpxchg.h
+++ b/arch/powerpc/include/asm/cmpxchg.h
@@ -5,6 +5,7 @@
 #include <linux/compiler.h>
 #include <asm/synch.h>
 #include <asm/asm-compat.h>
+#include <linux/bug.h>
 
 /*
  * Atomic exchange
@@ -92,12 +93,6 @@ __xchg_u64_local(volatile void *p, unsigned long val)
 }
 #endif
 
-/*
- * This function doesn't exist, so you'll get a linker error
- * if something tries to do an invalid xchg().
- */
-extern void __xchg_called_with_bad_pointer(void);
-
 static __always_inline unsigned long
 __xchg(volatile void *ptr, unsigned long x, unsigned int size)
 {
@@ -109,7 +104,7 @@ __xchg(volatile void *ptr, unsigned long x, unsigned int size)
return __xchg_u64(ptr, x);
 #endif
}
-   __xchg_called_with_bad_pointer();
+   BUILD_BUG();
return x;
 }
 
@@ -124,7 +119,7 @@ __xchg_local(volatile void *ptr, unsigned long x, unsigned int size)
return __xchg_u64_local(ptr, x);
 #endif
}
-   __xchg_called_with_bad_pointer();
+   BUILD_BUG();
return x;
 }
 #define xchg(ptr,x) \
@@ -235,10 +230,6 @@ __cmpxchg_u64_local(volatile unsigned long *p, unsigned long old,
 }
 #endif
 
-/* This function doesn't exist, so you'll get a linker error
-   if something tries to do an invalid cmpxchg().  */
-extern void __cmpxchg_called_with_bad_pointer(void);
-
 static __always_inline unsigned long
 __cmpxchg(volatile void *ptr, unsigned long old, unsigned long new,
  unsigned int size)
@@ -251,7 +242,7 @@ __cmpxchg(volatile void *ptr, unsigned long old, unsigned long new,
return __cmpxchg_u64(ptr, old, new);
 #endif
}
-   __cmpxchg_called_with_bad_pointer();
+   BUILD_BUG();
return old;
 }
 
@@ -267,7 +258,7 @@ __cmpxchg_local(volatile void *ptr, unsigned long old, unsigned long new,
return __cmpxchg_u64_local(ptr, old, new);
 #endif
}
-   __cmpxchg_called_with_bad_pointer();
+   BUILD_BUG();
return old;
 }
 
-- 
2.5.0


[PATCH V2] arch/powerpc: use BUILD_BUG_ON_MSG() when detecting unfit {cmp}xchg size

2016-02-23 Thread Pan Xinhui
From: pan xinhui 

__xchg_called_with_bad_pointer() can't tell us which code uses {cmp}xchg
in an incorrect way, and no error is reported until the link stage.
To catch such issues easily at compile time, we use BUILD_BUG_ON_MSG() here.
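
As a rough illustration (exact wording is compiler-dependent), a bad-size
caller now fails at compile time with the custom message, roughly:

	error: call to '__compiletime_assert_NNN' declared with attribute
	error: Unsupported size for __xchg

where NNN stands for a line number, since BUILD_BUG_ON_MSG() reports through
the compiletime_assert() machinery instead of an opaque link-stage
"undefined reference".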

Signed-off-by: pan xinhui 
---
change from V1:
use BUILD_BUG_ON_MSG to provide more build error messages.
---
 arch/powerpc/include/asm/cmpxchg.h | 19 +--
 1 file changed, 5 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/include/asm/cmpxchg.h b/arch/powerpc/include/asm/cmpxchg.h
index d1a8d93..730028b 100644
--- a/arch/powerpc/include/asm/cmpxchg.h
+++ b/arch/powerpc/include/asm/cmpxchg.h
@@ -5,6 +5,7 @@
 #include <linux/compiler.h>
 #include <asm/synch.h>
 #include <asm/asm-compat.h>
+#include <linux/bug.h>
 
 /*
  * Atomic exchange
@@ -92,12 +93,6 @@ __xchg_u64_local(volatile void *p, unsigned long val)
 }
 #endif
 
-/*
- * This function doesn't exist, so you'll get a linker error
- * if something tries to do an invalid xchg().
- */
-extern void __xchg_called_with_bad_pointer(void);
-
 static __always_inline unsigned long
 __xchg(volatile void *ptr, unsigned long x, unsigned int size)
 {
@@ -109,7 +104,7 @@ __xchg(volatile void *ptr, unsigned long x, unsigned int size)
return __xchg_u64(ptr, x);
 #endif
}
-   __xchg_called_with_bad_pointer();
+   BUILD_BUG_ON_MSG(1, "Unsupported size for __xchg");
return x;
 }
 
@@ -124,7 +119,7 @@ __xchg_local(volatile void *ptr, unsigned long x, unsigned int size)
return __xchg_u64_local(ptr, x);
 #endif
}
-   __xchg_called_with_bad_pointer();
+   BUILD_BUG_ON_MSG(1, "Unsupported size for __xchg_local");
return x;
 }
 #define xchg(ptr,x) \
@@ -235,10 +230,6 @@ __cmpxchg_u64_local(volatile unsigned long *p, unsigned long old,
 }
 #endif
 
-/* This function doesn't exist, so you'll get a linker error
-   if something tries to do an invalid cmpxchg().  */
-extern void __cmpxchg_called_with_bad_pointer(void);
-
 static __always_inline unsigned long
 __cmpxchg(volatile void *ptr, unsigned long old, unsigned long new,
  unsigned int size)
@@ -251,7 +242,7 @@ __cmpxchg(volatile void *ptr, unsigned long old, unsigned long new,
return __cmpxchg_u64(ptr, old, new);
 #endif
}
-   __cmpxchg_called_with_bad_pointer();
+   BUILD_BUG_ON_MSG(1, "Unsupported size for __cmpxchg");
return old;
 }
 
@@ -267,7 +258,7 @@ __cmpxchg_local(volatile void *ptr, unsigned long old, unsigned long new,
return __cmpxchg_u64_local(ptr, old, new);
 #endif
}
-   __cmpxchg_called_with_bad_pointer();
+   BUILD_BUG_ON_MSG(1, "Unsupported size for __cmpxchg_local");
return old;
 }
 
-- 
2.5.0

[PATCH] powerpc: introduce {cmp}xchg for u8 and u16

2016-04-07 Thread Pan Xinhui
From: pan xinhui 

Implement xchg{u8,u16}{local,relaxed}, and
cmpxchg{u8,u16}{,local,acquire,relaxed}.

Atomic operations on 8-bit and 16-bit data types are supported from POWER7.
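
For illustration, this enables callers such as (hypothetical usage):

	u8 b = 0;
	u8 old = cmpxchg(&b, 0, 1);	/* was a link-time error on ppc before */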

Signed-off-by: pan xinhui 
---
 arch/powerpc/include/asm/cmpxchg.h | 265 +
 1 file changed, 265 insertions(+)

diff --git a/arch/powerpc/include/asm/cmpxchg.h b/arch/powerpc/include/asm/cmpxchg.h
index 44efe73..928ad89 100644
--- a/arch/powerpc/include/asm/cmpxchg.h
+++ b/arch/powerpc/include/asm/cmpxchg.h
@@ -15,6 +15,74 @@
  */
 
 static __always_inline unsigned long
+__xchg_u8_local(volatile void *p, unsigned long val)
+{
+   unsigned long prev;
+
+   __asm__ __volatile__(
+"1:lbarx   %0,0,%2\n"
+   PPC405_ERR77(0,%2)
+"  stbcx.  %3,0,%2\n"
+"  bne-1b"
+   : "=&r" (prev), "+m" (*(volatile unsigned char *)p)
+   : "r" (p), "r" (val)
+   : "cc", "memory");
+
+   return prev;
+}
+
+static __always_inline unsigned long
+__xchg_u8_relaxed(u8 *p, unsigned long val)
+{
+   unsigned long prev;
+
+   __asm__ __volatile__(
+"1:lbarx   %0,0,%2\n"
+   PPC405_ERR77(0,%2)
+"  stbcx.  %3,0,%2\n"
+"  bne-1b"
+   : "=&r" (prev), "+m" (*p)
+   : "r" (p), "r" (val)
+   : "cc");
+
+   return prev;
+}
+
+static __always_inline unsigned long
+__xchg_u16_local(volatile void *p, unsigned long val)
+{
+   unsigned long prev;
+
+   __asm__ __volatile__(
+"1:lharx   %0,0,%2\n"
+   PPC405_ERR77(0,%2)
+"  sthcx.  %3,0,%2\n"
+"  bne-1b"
+   : "=&r" (prev), "+m" (*(volatile unsigned short *)p)
+   : "r" (p), "r" (val)
+   : "cc", "memory");
+
+   return prev;
+}
+
+static __always_inline unsigned long
+__xchg_u16_relaxed(u16 *p, unsigned long val)
+{
+   unsigned long prev;
+
+   __asm__ __volatile__(
+"1:lharx   %0,0,%2\n"
+   PPC405_ERR77(0,%2)
+"  sthcx.  %3,0,%2\n"
+"  bne-1b"
+   : "=&r" (prev), "+m" (*p)
+   : "r" (p), "r" (val)
+   : "cc");
+
+   return prev;
+}
+
+static __always_inline unsigned long
 __xchg_u32_local(volatile void *p, unsigned long val)
 {
unsigned long prev;
@@ -88,6 +156,10 @@ static __always_inline unsigned long
 __xchg_local(volatile void *ptr, unsigned long x, unsigned int size)
 {
switch (size) {
+   case 1:
+   return __xchg_u8_local(ptr, x);
+   case 2:
+   return __xchg_u16_local(ptr, x);
case 4:
return __xchg_u32_local(ptr, x);
 #ifdef CONFIG_PPC64
@@ -103,6 +175,10 @@ static __always_inline unsigned long
 __xchg_relaxed(void *ptr, unsigned long x, unsigned int size)
 {
switch (size) {
+   case 1:
+   return __xchg_u8_relaxed(ptr, x);
+   case 2:
+   return __xchg_u16_relaxed(ptr, x);
case 4:
return __xchg_u32_relaxed(ptr, x);
 #ifdef CONFIG_PPC64
@@ -132,6 +208,179 @@ __xchg_relaxed(void *ptr, unsigned long x, unsigned int size)
  */
 
 static __always_inline unsigned long
+__cmpxchg_u8(volatile unsigned char *p, unsigned long old, unsigned long new)
+{
+   unsigned long prev = 0;
+
+   __asm__ __volatile__ (
+   PPC_ATOMIC_ENTRY_BARRIER
+"1:lbarx   %0,0,%2 # __cmpxchg_u8\n"
+"  cmpw0,%0,%3\n"
+"  bne-2f\n"
+   PPC405_ERR77(0,%2)
+"  stbcx.  %4,0,%2\n"
+"  bne-1b\n"
+   PPC_ATOMIC_EXIT_BARRIER
+   "\n"
+"2:"
+   : "=&r" (prev), "+m" (*p)
+   : "r" (p), "r" (old), "r" (new)
+   : "cc", "memory");
+
+   return prev;
+}
+
+static __always_inline unsigned long
+__cmpxchg_u8_local(volatile unsigned char *p, unsigned long old,
+   unsigned long new)
+{
+   unsigned long prev = 0;
+
+   __asm__ __volatile__ (
+"1:lbarx   %0,0,%2 # __cmpxchg_u8_local\n"
+"  cmpw0,%0,%3\n"
+"  bne-2f\n"
+   PPC405_ERR77(0,%2)
+"  stbcx.  %4,0,%2\n"
+"  bne-1b\n"
+"2:"
+   : "=&r" (prev), "+m" (*p)
+   : "r" (p), "r" (old), "r" (new)
+   : "cc", "memory");
+
+   return prev;
+}
+
+static __always_inline unsigned long
+__cmpxchg_u8_relaxed(u8 *p, unsigned long old, unsigned long new)
+{
+   unsigned long prev = 0;
+
+   __asm__ __volatile__ (
+"1:lbarx   %0,0,%2 # __cmpxchg_u8_relaxed\n"
+

Re: [PATCH] powerpc: introduce {cmp}xchg for u8 and u16

2016-04-10 Thread Pan Xinhui

On 2016-04-08 15:47, Peter Zijlstra wrote:
> On Fri, Apr 08, 2016 at 02:41:46PM +0800, Pan Xinhui wrote:
>> From: pan xinhui 
>>
>> Implement xchg{u8,u16}{local,relaxed}, and
>> cmpxchg{u8,u16}{,local,acquire,relaxed}.
>>
>> Atomic operation on 8-bit and 16-bit data type is supported from power7
> 
> And yes I see nothing P7 specific here, this implementation is for
> everything PPC64 afaict, no?
> 
Hello Peter,
No, it's not for every ppc. So yes, I need to add an #ifdef here. Thanks for pointing it out.
We might need a new config option and let it depend on POWER7/POWER8_CPU or 
even POWER9...

> Also, note that you don't need explicit 8/16 bit atomics to implement
> these. Its fine to use 32bit atomics and only modify half the word.
> 
That is true. But I am a little worried about the performance. It will forbid any other task from touching the other half of the word during the load/reserve, right?
I am working on the qspinlock implementation on PPC.
Your and Waiman's patches are so nice. :)

> Also, you might want to invest in some CPP to reduce the endless
> repetition.
> 
Will do that. thanks for your tips.

thanks
xinhui
> Other than that, no objections :-)
> 


Re: [PATCH] powerpc: introduce {cmp}xchg for u8 and u16

2016-04-13 Thread Pan Xinhui
Hello Peter,

On 2016-04-12 22:30, Peter Zijlstra wrote:
> On Sun, Apr 10, 2016 at 10:17:28PM +0800, Pan Xinhui wrote:
>>
>> On 2016-04-08 15:47, Peter Zijlstra wrote:
>>> On Fri, Apr 08, 2016 at 02:41:46PM +0800, Pan Xinhui wrote:
>>>> From: pan xinhui 
>>>>
>>>> Implement xchg{u8,u16}{local,relaxed}, and
>>>> cmpxchg{u8,u16}{,local,acquire,relaxed}.
>>>>
>>>> Atomic operation on 8-bit and 16-bit data type is supported from power7
>>>
>>> And yes I see nothing P7 specific here, this implementation is for
>>> everything PPC64 afaict, no?
>>>
>> Hello Peter,
>>  No, it's not for every ppc. So yes, I need add #ifdef here. Thanks for 
>> pointing it out.
>> We might need a new config option and let it depend on POWER7/POWER8_CPU or 
>> even POWER9...
> 
> Right, I'm not sure if PPC has alternatives, but you could of course
> runtime patch the code from emulated with 32bit ll/sc to native 8/16bit
> ll/sc if present on the current CPU if you have infrastructure for these
> things.
> 
Seems interesting. I have no idea how to runtime-patch the code; I will try to learn that.
If so, do we need to change {cmp}xchg into out-of-line functions?

>>> Also, note that you don't need explicit 8/16 bit atomics to implement
>>> these. Its fine to use 32bit atomics and only modify half the word.
>>>
>> That is true. But I am a little worried about the performance. It will
>> forbid any other tasks to touch the other half word during the
>> load/reserve, right?
> 
> Well, not forbid, it would just make the LL/SC fail and try again. Other
> archs already implement them this way. See commit 3226aad81aa6 ("sh:
> support 1 and 2 byte xchg") for example.
> 
thanks for your explanation. :)

I wrote a similar patch, as you suggested.

I paste the new __xchg_u8's alpha implementation here; it needs a rewrite to
be understood more easily...
It does work, but some performance tests are needed later.

static __always_inline unsigned long
__xchg_u8_local(volatile void *p, unsigned char val)
{
	unsigned int prev, prev_mask, tmp, offset, _val, *_p;

	/* Align down to the containing 32-bit word. */
	_p = (unsigned int *)round_down((unsigned long)p, sizeof(int));
	_val = val;
	/* Bit offset of the target byte within that word... */
	offset = 8 * ((unsigned long)p - (unsigned long)_p);
#ifndef CONFIG_CPU_LITTLE_ENDIAN
	/* ...flipped on big-endian, where byte 0 is the most significant. */
	offset = 8 * (sizeof(int) - sizeof(__typeof__(val))) - offset;
#endif
	_val <<= offset;
	/* Mask that preserves every byte except the one being exchanged. */
	prev_mask = ~((unsigned int)(__typeof__(val))-1 << offset);

	__asm__ __volatile__(
"1:	lwarx	%0,0,%3\n"	/* load-reserve the whole word */
"	and	%1,%0,%5\n"	/* keep the untouched bytes */
"	or	%1,%1,%4\n"	/* merge in the shifted new value */
	PPC405_ERR77(0,%2)
"	stwcx.	%1,0,%3\n"	/* store only if the reservation held */
"	bne-	1b"
	: "=&r" (prev), "=&r" (tmp), "+m" (*(volatile unsigned int *)_p)
	: "r" (_p), "r" (_val), "r" (prev_mask)
	: "cc", "memory");

	return prev >> offset;
}
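
To make the offset arithmetic concrete, a worked example (assuming 32-bit
words): if p points at byte 1 of an aligned word, then off = 1 and
offset = 8 on little-endian; on big-endian the correction gives
offset = 8 * (4 - 1) - 8 = 16, because byte 0 sits in the most significant
bits there. prev_mask = ~(0xff << offset) then preserves the other three
bytes across the lwarx/stwcx. loop.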

>> I am working on the qspinlock implementation on PPC.
>> Your and Waiman's patches are so nice. :)
> 
> Thanks!, last time I looked at PPC spinlocks they could not use things
> like ticket locks because PPC might be a guest and fairness blows etc..
> 
> You're making the qspinlock-paravirt thing work on PPC, or doing
> qspinlock only for bare-metal PPC?
> 
I am making both work. :)
qspinlock works on PPC now. I am preparing the patches and will send them out
in the next weeks. :)

The paravirt work is a little hard.
Currently there are pv_wait() and pv_kick(), but only pv_kick() has a cpu
parameter (who will hold the lock as soon as the lock is unlocked).
We need a cpu parameter (who holds the lock now) in pv_wait() too; see the
sketch below.
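
A sketch of the extension being described (hypothetical signature, not an
existing API):

	/* Let the waiter pass the current lock holder to the hypervisor,
	 * so the yielded cycles can be directed at that vCPU. */
	static inline void pv_wait(u8 *ptr, u8 val, int lock_holder_cpu);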

thanks
xinhui


Re: [PATCH] powerpc: introduce {cmp}xchg for u8 and u16

2016-04-14 Thread Pan Xinhui
Hello, Waiman

On 2016-04-13 23:53, Waiman Long wrote:
> On 04/13/2016 07:15 AM, Pan Xinhui wrote:
>> Hello Peter,
>>
>> On 2016-04-12 22:30, Peter Zijlstra wrote:
>>
>>>> I am working on the qspinlock implementation on PPC.
>>>> Your and Waiman's patches are so nice. :)
>>> Thanks!, last time I looked at PPC spinlocks they could not use things
>>> like ticket locks because PPC might be a guest and fairness blows etc..
>>>
>>> You're making the qspinlock-paravirt thing work on PPC, or doing
>>> qspinlock only for bare-metal PPC?
>>>
>> I am making the both work. :)
>> qspinlock works on PPC now. I am preparing the patches and will send them 
>> out in next weeks :)
> 
> What of performance improvement are you seeing in PPC?
> 
Well, not good. I wrote one small benchmark which just increments an integer
with the spinlock held.
The overhead of the lock itself is high, but the fairness is good.
I only ran the tests in a guest OS, and there the qspinlock does not make use
of paravirt while the spinlock does, so the performance gap is currently a
little big.
Looks like I need to change the kernel config and re-test.

I have not measured the real-world system impact yet. Let's see the kernel
build times with the two different locks.
If possible, could you share how you do the performance test?

>> The paravirt work is a little hard.
>> currently, there are pv_wait() and pv_kick(). but only pv_kick has the 
>> parameter cpu(who will hold the lock as soon as the lock is unlocked).
>> We need parameter cpu(who holds the lock now) in pv_wait,too.
> 
> That can be doable to a certain extent. However, if the current lock holder 
> acquired the lock via the fastpath only. The CPU information is not logged 
> anywhere. For a contended lock, the information should be there.
> 
Yes. Maybe we could use a hashtable. We could put the lock and lock holder in
pv_node, too. :)
Just my thoughts.

thanks
xinhui

> Cheers, 
> Longman
> 


Re: [PATCH] powerpc: introduce {cmp}xchg for u8 and u16

2016-04-18 Thread Pan Xinhui


On 2016-04-17 03:43, Arnd Bergmann wrote:
> On Wednesday 13 April 2016 19:15:17 Pan Xinhui wrote:
>> Hello Peter,
>>
>> On 2016-04-12 22:30, Peter Zijlstra wrote:
>>> On Sun, Apr 10, 2016 at 10:17:28PM +0800, Pan Xinhui wrote:
>>>>
>>>> On 2016-04-08 15:47, Peter Zijlstra wrote:
>>>>> On Fri, Apr 08, 2016 at 02:41:46PM +0800, Pan Xinhui wrote:
>>>>>> From: pan xinhui 
>>>>>>
>>>>>> Implement xchg{u8,u16}{local,relaxed}, and
>>>>>> cmpxchg{u8,u16}{,local,acquire,relaxed}.
>>>>>>
>>>>>> Atomic operation on 8-bit and 16-bit data type is supported from power7
>>>>>
>>>>> And yes I see nothing P7 specific here, this implementation is for
>>>>> everything PPC64 afaict, no?
>>>>>
>>>> Hello Peter,
>>>>No, it's not for every ppc. So yes, I need add #ifdef here. Thanks for 
>>>> pointing it out.
>>>> We might need a new config option and let it depend on POWER7/POWER8_CPU 
>>>> or even POWER9...
>>>
>>> Right, I'm not sure if PPC has alternatives, but you could of course
>>> runtime patch the code from emulated with 32bit ll/sc to native 8/16bit
>>> ll/sc if present on the current CPU if you have infrastructure for these
>>> things.
>>>
>> seems interesting. I have no idea about how to runtime patch the code. I 
>> will try to learn that.
>> If so, we need change {cmp}xchg into uninline functions?
> 
> I think you don't need to, see do_feature_fixups()/patch_feature_section()
> 
Hello, Arnd
thanks for your tips :) I will take a look at them.
This time I will make generic functions for all ppc. But in the future, we can
runtime-patch the code.

> Note that an #ifdef by itself has to worry about any combination of
> architectures, so in a kernel that has both POWER6 and POWER7 enabled,
> you cannot call the POWER7-only function.
> 
Seems you are right.
Though I think it's not a good idea to enable several cpu types; just select
the minimum supported cpu in the real world. :)

thanks
xinhui
>   Arnd
> 


[PATCH V2] powerpc: Implement {cmp}xchg for u8 and u16

2016-04-18 Thread Pan Xinhui
From: Pan Xinhui 

Implement xchg{u8,u16}{local,relaxed}, and
cmpxchg{u8,u16}{,local,acquire,relaxed}.

It works on all ppc.

Suggested-by: Peter Zijlstra (Intel) 
Signed-off-by: Pan Xinhui 
---
change from V1:
rework totally.
---
 arch/powerpc/include/asm/cmpxchg.h | 83 ++
 1 file changed, 83 insertions(+)

diff --git a/arch/powerpc/include/asm/cmpxchg.h b/arch/powerpc/include/asm/cmpxchg.h
index 44efe73..79a1f45 100644
--- a/arch/powerpc/include/asm/cmpxchg.h
+++ b/arch/powerpc/include/asm/cmpxchg.h
@@ -7,6 +7,37 @@
 #include 
 #include 
 
+#ifdef __BIG_ENDIAN
+#define BITOFF_CAL(size, off)  ((sizeof(u32) - size - off) * BITS_PER_BYTE)
+#else
+#define BITOFF_CAL(size, off)  (off * BITS_PER_BYTE)
+#endif
+
+static __always_inline unsigned long
+__cmpxchg_u32_local(volatile unsigned int *p, unsigned long old,
+   unsigned long new);
+
+#define __XCHG_GEN(cmp, type, sfx, u32sfx, skip, v)\
+static __always_inline u32 \
+__##cmp##xchg_##type##sfx(v void *ptr, u32 old, u32 new)   \
+{  \
+   int size = sizeof (type);   \
+   int off = (unsigned long)ptr % sizeof(u32); \
+   volatile u32 *p = ptr - off;\
+   int bitoff = BITOFF_CAL(size, off); \
+   u32 bitmask = ((0x1 << size * BITS_PER_BYTE) - 1) << bitoff;\
+   u32 oldv, newv; \
+   u32 ret;\
+   do {\
+   oldv = READ_ONCE(*p);   \
+   ret = (oldv & bitmask) >> bitoff;   \
+   if (skip && ret != old) \
+   break;  \
+   newv = (oldv & ~bitmask) | (new << bitoff); \
+   } while (__cmpxchg_u32##u32sfx((v void*)p, oldv, newv) != oldv);\
+   return ret; \
+}
+
 /*
  * Atomic exchange
  *
@@ -14,6 +45,19 @@
  * the previous value stored there.
  */
 
+#define XCHG_GEN(type, sfx, v) \
+   __XCHG_GEN(_, type, sfx, _local, 0, v)  \
+static __always_inline u32 __xchg_##type##sfx(v void *p, u32 n)\
+{  \
+   return ___xchg_##type##sfx(p, 0, n);\
+}
+
+XCHG_GEN(u8, _local, volatile);
+XCHG_GEN(u8, _relaxed, );
+XCHG_GEN(u16, _local, volatile);
+XCHG_GEN(u16, _relaxed, );
+#undef XCHG_GEN
+
 static __always_inline unsigned long
 __xchg_u32_local(volatile void *p, unsigned long val)
 {
@@ -88,6 +132,10 @@ static __always_inline unsigned long
 __xchg_local(volatile void *ptr, unsigned long x, unsigned int size)
 {
switch (size) {
+   case 1:
+   return __xchg_u8_local(ptr, x);
+   case 2:
+   return __xchg_u16_local(ptr, x);
case 4:
return __xchg_u32_local(ptr, x);
 #ifdef CONFIG_PPC64
@@ -103,6 +151,10 @@ static __always_inline unsigned long
 __xchg_relaxed(void *ptr, unsigned long x, unsigned int size)
 {
switch (size) {
+   case 1:
+   return __xchg_u8_relaxed(ptr, x);
+   case 2:
+   return __xchg_u16_relaxed(ptr, x);
case 4:
return __xchg_u32_relaxed(ptr, x);
 #ifdef CONFIG_PPC64
@@ -226,6 +278,21 @@ __cmpxchg_u32_acquire(u32 *p, unsigned long old, unsigned 
long new)
return prev;
 }
 
+
+#define CMPXCHG_GEN(type, sfx, v)  \
+   __XCHG_GEN(cmp, type, sfx, sfx, 1, v)
+
+CMPXCHG_GEN(u8, , volatile);
+CMPXCHG_GEN(u8, _local, volatile);
+CMPXCHG_GEN(u8, _relaxed, );
+CMPXCHG_GEN(u8, _acquire, );
+CMPXCHG_GEN(u16, , volatile);
+CMPXCHG_GEN(u16, _local, volatile);
+CMPXCHG_GEN(u16, _relaxed, );
+CMPXCHG_GEN(u16, _acquire, );
+#undef CMPXCHG_GEN
+#undef __XCHG_GEN
+
 #ifdef CONFIG_PPC64
 static __always_inline unsigned long
 __cmpxchg_u64(volatile unsigned long *p, unsigned long old, unsigned long new)
@@ -316,6 +383,10 @@ __cmpxchg(volatile void *ptr, unsigned long old, unsigned long new,
  unsigned int size)
 {
switch (size) {
+   case 1:
+   return __cmpxchg_u8(ptr, old, new);
+   case 2:
+   return __cmpxchg_u16(ptr, old, new);
case 4:
return __cmpxchg_u32(ptr, old, new);
 #ifdef CONFIG_PPC64
@@ -332,6 +403,10 @@ __cmpxchg_local(volatile void *ptr, unsigned long old, unsigned long new,
  unsigned int size)
 {
switch (size) 

Re: [PATCH V2] powerpc: Implement {cmp}xchg for u8 and u16

2016-04-19 Thread Pan Xinhui

Hello, boqun

On 2016-04-19 17:18, Boqun Feng wrote:
> Hi Xinhui,
> 
> On Tue, Apr 19, 2016 at 02:29:34PM +0800, Pan Xinhui wrote:
>> From: Pan Xinhui 
>>
>> Implement xchg{u8,u16}{local,relaxed}, and
>> cmpxchg{u8,u16}{,local,acquire,relaxed}.
>>
>> It works on all ppc.
>>
> 
> Nice work!
> 
thank you.

> AFAICT, your work doesn't depend on anything that ppc-specific, right?
> So maybe we can use it as a general approach for a fallback
> implementation on the archs without u8/u16 atomics. ;-)
> 
>> Suggested-by: Peter Zijlstra (Intel) 
>> Signed-off-by: Pan Xinhui 
>> ---
>> change from V1:
>>  rework totally.
>> ---
>>  arch/powerpc/include/asm/cmpxchg.h | 83 
>> ++
>>  1 file changed, 83 insertions(+)
>>
>> diff --git a/arch/powerpc/include/asm/cmpxchg.h b/arch/powerpc/include/asm/cmpxchg.h
>> index 44efe73..79a1f45 100644
>> --- a/arch/powerpc/include/asm/cmpxchg.h
>> +++ b/arch/powerpc/include/asm/cmpxchg.h
>> @@ -7,6 +7,37 @@
>>  #include 
>>  #include 
>>  
>> +#ifdef __BIG_ENDIAN
>> +#define BITOFF_CAL(size, off)   ((sizeof(u32) - size - off) * BITS_PER_BYTE)
>> +#else
>> +#define BITOFF_CAL(size, off)   (off * BITS_PER_BYTE)
>> +#endif
>> +
>> +static __always_inline unsigned long
>> +__cmpxchg_u32_local(volatile unsigned int *p, unsigned long old,
>> +unsigned long new);
>> +
>> +#define __XCHG_GEN(cmp, type, sfx, u32sfx, skip, v) \
>> +static __always_inline u32  \
>> +__##cmp##xchg_##type##sfx(v void *ptr, u32 old, u32 new)\
>> +{   \
>> +int size = sizeof (type);   \
>> +int off = (unsigned long)ptr % sizeof(u32); \
>> +volatile u32 *p = ptr - off;\
>> +int bitoff = BITOFF_CAL(size, off); \
>> +u32 bitmask = ((0x1 << size * BITS_PER_BYTE) - 1) << bitoff;\
>> +u32 oldv, newv; \
>> +u32 ret;\
>> +do {\
>> +oldv = READ_ONCE(*p);   \
>> +ret = (oldv & bitmask) >> bitoff;   \
>> +if (skip && ret != old) \
>> +break;  \
>> +newv = (oldv & ~bitmask) | (new << bitoff); \
>> +} while (__cmpxchg_u32##u32sfx((v void*)p, oldv, newv) != oldv);\
> 
> Forgive me if this is too paranoid, but I think we can save the
> READ_ONCE() in the loop if we change the code into the following,
> because cmpxchg will return the "new" value, if the cmp part fails.
> 
>   newv = READ_ONCE(*p);
> 
>   do {
>   oldv = newv;
>   ret = (oldv & bitmask) >> bitoff;
>   if (skip && ret != old)
>   break;
>   newv = (oldv & ~bitmask) | (new << bitoff);
>   newv = __cmpxchg_u32##u32sfx((void *)p, oldv, newv);
>   } while(newv != oldv);
> 
>> +return ret; \
>> +}
A little optimization. Patch V3 will include your code, thanks.

>> +
>>  /*
>>   * Atomic exchange
>>   *
>> @@ -14,6 +45,19 @@
>>   * the previous value stored there.
>>   */
>>  
>> +#define XCHG_GEN(type, sfx, v)  
>> \
>> +__XCHG_GEN(_, type, sfx, _local, 0, v)  \
>  ^^^
> 
> This should be sfx, right? Otherwise, all the newly added xchg will
> call __cmpxchg_u32_local, this will result in wrong ordering guarantees.
> 
I meant that. But I will think about the ordering issue for a while. :)

>> +static __always_inline u32 __xchg_##type##sfx(v void *p, u32 n) \
>> +{   \
>> +return ___xchg_##type##sfx(p, 0, n);\
>> +}
>> +
>> +XCHG_GEN(u8, _local, volatile);
> 
> I don't think we need the "volatile" modifier here, because READ_ONCE()
> and _

[PATCH V3] powerpc: Implement {cmp}xchg for u8 and u16

2016-04-20 Thread Pan Xinhui
From: Pan Xinhui 

Implement xchg{u8,u16}{local,relaxed}, and
cmpxchg{u8,u16}{,local,acquire,relaxed}.

It works on all ppc.
The basic idea is from commit 3226aad81aa6 ("sh: support 1 and 2 byte xchg")

Suggested-by: Peter Zijlstra (Intel) 
Signed-off-by: Pan Xinhui 
---
change from v2:
in the do{}while(), we save one load and use the corresponding cmpxchg suffix.
Also add the corresponding __cmpxchg_u32 function declaration in the __XCHG_GEN
change from V1:
rework totally.
---
 arch/powerpc/include/asm/cmpxchg.h | 83 ++
 1 file changed, 83 insertions(+)

diff --git a/arch/powerpc/include/asm/cmpxchg.h b/arch/powerpc/include/asm/cmpxchg.h
index 44efe73..2aec04e 100644
--- a/arch/powerpc/include/asm/cmpxchg.h
+++ b/arch/powerpc/include/asm/cmpxchg.h
@@ -7,6 +7,38 @@
 #include 
 #include 
 
+#ifdef __BIG_ENDIAN
+#define BITOFF_CAL(size, off)  ((sizeof(u32) - size - off) * BITS_PER_BYTE)
+#else
+#define BITOFF_CAL(size, off)  (off * BITS_PER_BYTE)
+#endif
+
+#define __XCHG_GEN(cmp, type, sfx, skip, v)\
+static __always_inline unsigned long   \
+__cmpxchg_u32##sfx(v unsigned int *p, unsigned long old,   \
+unsigned long new);\
+static __always_inline u32 \
+__##cmp##xchg_##type##sfx(v void *ptr, u32 old, u32 new)   \
+{  \
+   int size = sizeof (type);   \
+   int off = (unsigned long)ptr % sizeof(u32); \
+   volatile u32 *p = ptr - off;\
+   int bitoff = BITOFF_CAL(size, off); \
+   u32 bitmask = ((0x1 << size * BITS_PER_BYTE) - 1) << bitoff;\
+   u32 oldv, newv, tmp;\
+   u32 ret;\
+   oldv = READ_ONCE(*p);   \
+   do {\
+   ret = (oldv & bitmask) >> bitoff;   \
+   if (skip && ret != old) \
+   break;  \
+   newv = (oldv & ~bitmask) | (new << bitoff); \
+   tmp = oldv; \
+   oldv = __cmpxchg_u32##sfx((v u32*)p, oldv, newv);   \
+   } while (tmp != oldv);  \
+   return ret; \
+}
+
 /*
  * Atomic exchange
  *
@@ -14,6 +46,19 @@
  * the previous value stored there.
  */
 
+#define XCHG_GEN(type, sfx, v) \
+   __XCHG_GEN(_, type, sfx, 0, v)  \
+static __always_inline u32 __xchg_##type##sfx(v void *p, u32 n)	\
+{  \
+   return ___xchg_##type##sfx(p, 0, n);\
+}
+
+XCHG_GEN(u8, _local, volatile);
+XCHG_GEN(u8, _relaxed, );
+XCHG_GEN(u16, _local, volatile);
+XCHG_GEN(u16, _relaxed, );
+#undef XCHG_GEN
+
 static __always_inline unsigned long
 __xchg_u32_local(volatile void *p, unsigned long val)
 {
@@ -88,6 +133,10 @@ static __always_inline unsigned long
 __xchg_local(volatile void *ptr, unsigned long x, unsigned int size)
 {
switch (size) {
+   case 1:
+   return __xchg_u8_local(ptr, x);
+   case 2:
+   return __xchg_u16_local(ptr, x);
case 4:
return __xchg_u32_local(ptr, x);
 #ifdef CONFIG_PPC64
@@ -103,6 +152,10 @@ static __always_inline unsigned long
 __xchg_relaxed(void *ptr, unsigned long x, unsigned int size)
 {
switch (size) {
+   case 1:
+   return __xchg_u8_relaxed(ptr, x);
+   case 2:
+   return __xchg_u16_relaxed(ptr, x);
case 4:
return __xchg_u32_relaxed(ptr, x);
 #ifdef CONFIG_PPC64
@@ -131,6 +184,20 @@ __xchg_relaxed(void *ptr, unsigned long x, unsigned int size)
  * and return the old value of *p.
  */
 
+#define CMPXCHG_GEN(type, sfx, v)  \
+   __XCHG_GEN(cmp, type, sfx, 1, v)
+
+CMPXCHG_GEN(u8, , volatile);
+CMPXCHG_GEN(u8, _local,volatile);
+CMPXCHG_GEN(u8, _relaxed, );
+CMPXCHG_GEN(u8, _acquire, );
+CMPXCHG_GEN(u16, , volatile);
+CMPXCHG_GEN(u16, _local, volatile);
+CMPXCHG_GEN(u16, _relaxed, );
+CMPXCHG_GEN(u16, _acquire, );
+#undef CMPXCHG_GEN
+#undef __XCHG_GEN
+
 static __always_inline unsigned long
 __cmpxchg_u32(volatile unsigned int *p, unsigned long old, unsigned long new)
 {
@@ -316,6 +383,10 @@ __cmpxc

Re: [PATCH V3] powerpc: Implement {cmp}xchg for u8 and u16

2016-04-21 Thread Pan Xinhui
On 2016-04-20 22:24, Peter Zijlstra wrote:
> On Wed, Apr 20, 2016 at 09:24:00PM +0800, Pan Xinhui wrote:
> 
>> +#define __XCHG_GEN(cmp, type, sfx, skip, v) \
>> +static __always_inline unsigned long
>> \
>> +__cmpxchg_u32##sfx(v unsigned int *p, unsigned long old,\
>> + unsigned long new);\
>> +static __always_inline u32  \
>> +__##cmp##xchg_##type##sfx(v void *ptr, u32 old, u32 new)\
>> +{   \
>> +int size = sizeof (type);   \
>> +int off = (unsigned long)ptr % sizeof(u32); \
>> +volatile u32 *p = ptr - off;\
>> +int bitoff = BITOFF_CAL(size, off); \
>> +u32 bitmask = ((0x1 << size * BITS_PER_BYTE) - 1) << bitoff;\
>> +u32 oldv, newv, tmp;\
>> +u32 ret;\
>> +oldv = READ_ONCE(*p);   \
>> +do {\
>> +ret = (oldv & bitmask) >> bitoff;   \
>> +if (skip && ret != old) \
>> +break;  \
>> +newv = (oldv & ~bitmask) | (new << bitoff); \
>> +tmp = oldv; \
>> +oldv = __cmpxchg_u32##sfx((v u32*)p, oldv, newv);   \
>> +} while (tmp != oldv);  \
>> +return ret; \
>> +}
> 
> So for an LL/SC based arch using cmpxchg() like that is sub-optimal.
> 
> Why did you choose to write it entirely in C?
> 
Yes, you are right, more loads/stores will be done in C code.
However, such xchg_u8/u16 is just used by qspinlock now, and I did not see any
performance regression.
So I just wrote it in C, for simplicity. :)

Of course I have done xchg tests.
We ran code just like xchg((u8 *)&v, j++); in several threads (sketched
below), and the result is:
[  768.374264] use time[1550072]ns in xchg_u8_asm
[  768.377102] use time[2826802]ns in xchg_u8_c

I think this is because there is one more load in C.
If possible, we can move such code into asm-generic/.
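
A minimal sketch of the kind of test described above (illustrative only; it
assumes a kernel-module context and the names are made up):

	static u8 v;

	static int xchg_worker(void *arg)
	{
		int j;

		for (j = 0; j < 1000000; j++)
			xchg(&v, (u8)j);	/* the 1-byte xchg under test */
		return 0;
	}

	/* launched once per thread with kthread_run(), elapsed time taken
	 * with ktime_get() before and after the workers complete */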

thanks
xinhui


Re: [PATCH V3] powerpc: Implement {cmp}xchg for u8 and u16

2016-04-21 Thread Pan Xinhui
On 2016-04-21 23:52, Boqun Feng wrote:
> On Thu, Apr 21, 2016 at 11:35:07PM +0800, Pan Xinhui wrote:
>> On 2016-04-20 22:24, Peter Zijlstra wrote:
>>> On Wed, Apr 20, 2016 at 09:24:00PM +0800, Pan Xinhui wrote:
>>>
>>>> +#define __XCHG_GEN(cmp, type, sfx, skip, v)   
>>>> \
>>>> +static __always_inline unsigned long  
>>>> \
>>>> +__cmpxchg_u32##sfx(v unsigned int *p, unsigned long old,  \
>>>> +   unsigned long new);\
>>>> +static __always_inline u32
>>>> \
>>>> +__##cmp##xchg_##type##sfx(v void *ptr, u32 old, u32 new)  \
>>>> +{ \
>>>> +  int size = sizeof (type);   \
>>>> +  int off = (unsigned long)ptr % sizeof(u32); \
>>>> +  volatile u32 *p = ptr - off;\
>>>> +  int bitoff = BITOFF_CAL(size, off); \
>>>> +  u32 bitmask = ((0x1 << size * BITS_PER_BYTE) - 1) << bitoff;\
>>>> +  u32 oldv, newv, tmp;\
>>>> +  u32 ret;\
>>>> +  oldv = READ_ONCE(*p);   \
>>>> +  do {\
>>>> +  ret = (oldv & bitmask) >> bitoff;   \
>>>> +  if (skip && ret != old) \
>>>> +  break;  \
>>>> +  newv = (oldv & ~bitmask) | (new << bitoff); \
>>>> +  tmp = oldv; \
>>>> +  oldv = __cmpxchg_u32##sfx((v u32*)p, oldv, newv);   \
>>>> +  } while (tmp != oldv);  \
>>>> +  return ret; \
>>>> +}
>>>
>>> So for an LL/SC based arch using cmpxchg() like that is sub-optimal.
>>>
>>> Why did you choose to write it entirely in C?
>>>
>> yes, you are right. more load/store will be done in C code.
>> However such xchg_u8/u16 is just used by qspinlock now. and I did not see 
>> any performance regression.
>> So just wrote in C, for simple. :)
>>
>> Of course I have done xchg tests.
>> we run code just like xchg((u8*)&v, j++); in several threads.
>> and the result is,
>> [  768.374264] use time[1550072]ns in xchg_u8_asm
> 
> How was xchg_u8_asm() implemented, using lbarx or using a 32bit ll/sc
> loop with shifting and masking in it?
> 
Yes, using 32-bit ll/sc loops.

It looks like:
__asm__ __volatile__(
"1: lwarx   %0,0,%3\n"
"   and %1,%0,%5\n"
"   or %1,%1,%4\n"
   PPC405_ERR77(0,%2)
"   stwcx.  %1,0,%3\n"
"   bne-1b"
: "=&r" (_oldv), "=&r" (tmp), "+m" (*(volatile unsigned int *)_p)
: "r" (_p), "r" (_newv), "r" (_oldv_mask)
: "cc", "memory");


> Regards,
> Boqun
> 
>> [  768.377102] use time[2826802]ns in xchg_u8_c
>>
>> I think this is because there is one more load in C.
>> If possible, we can move such code in asm-generic/.
>>
>> thanks
>> xinhui
>>


Re: [PATCH V3] powerpc: Implement {cmp}xchg for u8 and u16

2016-04-25 Thread Pan Xinhui

On 2016-04-22 00:13, Peter Zijlstra wrote:
> On Thu, Apr 21, 2016 at 11:35:07PM +0800, Pan Xinhui wrote:
>> yes, you are right. more load/store will be done in C code.
>> However such xchg_u8/u16 is just used by qspinlock now. and I did not see 
>> any performance regression.
>> So just wrote in C, for simple. :)
> 
> Which is fine; but worthy of a note in your Changelog.
> 
will do that.

>> Of course I have done xchg tests.
>> we run code just like xchg((u8*)&v, j++); in several threads.
>> and the result is,
>> [  768.374264] use time[1550072]ns in xchg_u8_asm
>> [  768.377102] use time[2826802]ns in xchg_u8_c
>>
>> I think this is because there is one more load in C.
>> If possible, we can move such code in asm-generic/.
> 
> So I'm not actually _that_ familiar with the PPC LL/SC implementation;
> but there are things a CPU can do to optimize these loops.
> 
> For example, a CPU might choose to not release the exclusive hold of the
> line for a number of cycles, except when it passes SC or an interrupt
> happens. This way there's a smaller chance the SC fails and inhibits
> forward progress.
I am not sure if there is such hardware optimization.

> 
> By doing the modification outside of the LL/SC you loose such
> advantages.
> 
> And yes, doing a !exclusive load prior to the exclusive load leads to an
> even bigger window where the data can get changed out from under you.
> 
You are right.
We have observed such data changes between the two loads.


Re: [PATCH V3] powerpc: Implement {cmp}xchg for u8 and u16

2016-04-26 Thread Pan Xinhui

On 2016-04-25 23:37, Peter Zijlstra wrote:
> On Mon, Apr 25, 2016 at 06:10:51PM +0800, Pan Xinhui wrote:
>>> So I'm not actually _that_ familiar with the PPC LL/SC implementation;
>>> but there are things a CPU can do to optimize these loops.
>>>
>>> For example, a CPU might choose to not release the exclusive hold of the
>>> line for a number of cycles, except when it passes SC or an interrupt
>>> happens. This way there's a smaller chance the SC fails and inhibits
>>> forward progress.
> 
>> I am not sure if there is such hardware optimization.
> 
> So I think the hardware must do _something_, otherwise competing cores
> doing load-exlusive could life-lock a system, each one endlessly
> breaking the exclusive ownership of the other and the store-conditional
> always failing.
> 
Seems there is no such optimization.

We have observed SC failing almost all the time in a contention test, then
getting stuck in the loop. :(
One thread modifies val with LL/SC, and the other threads just modify val
without any respect to LL/SC (see the sketch below).

So in the end, I chose to rewrite this patch in asm. :)

> Of course, there are such implementations, and they tend to have to put
> in explicit backoff loops; however, IIRC, PPC doesn't need that. (See
> ARC for an example that needs to do this.)
> 


[PATCH V4] powerpc: Implement {cmp}xchg for u8 and u16

2016-04-27 Thread Pan Xinhui
From: Pan Xinhui 

Implement xchg{u8,u16}{local,relaxed}, and
cmpxchg{u8,u16}{,local,acquire,relaxed}.

It works on all ppc.

remove volatile of first parameter in __cmpxchg_local and __cmpxchg

Suggested-by: Peter Zijlstra (Intel) 
Signed-off-by: Pan Xinhui 
---
change from v3:
rewrite in asm for the LL/SC.
remove volatile in __cmpxchg_local and __cmpxchg.
change from v2:
in the do{}while(), we save one load and use the corresponding cmpxchg suffix.
Also add the corresponding __cmpxchg_u32 function declaration in the __XCHG_GEN
change from V1:
rework totally.
---
 arch/powerpc/include/asm/cmpxchg.h | 109 -
 1 file changed, 106 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/cmpxchg.h b/arch/powerpc/include/asm/cmpxchg.h
index 44efe73..8a3735f 100644
--- a/arch/powerpc/include/asm/cmpxchg.h
+++ b/arch/powerpc/include/asm/cmpxchg.h
@@ -7,6 +7,71 @@
 #include 
 #include 
 
+#ifdef __BIG_ENDIAN
+#define BITOFF_CAL(size, off)  ((sizeof(u32) - size - off) * BITS_PER_BYTE)
+#else
+#define BITOFF_CAL(size, off)  (off * BITS_PER_BYTE)
+#endif
+
+#define XCHG_GEN(type, sfx, cl)\
+static inline u32 __xchg_##type##sfx(void *p, u32 val) \
+{  \
+   unsigned int prev, prev_mask, tmp, bitoff, off; \
+   \
+   off = (unsigned long)p % sizeof(u32);   \
+   bitoff = BITOFF_CAL(sizeof(type), off); \
+   p -= off;   \
+   val <<= bitoff; \
+   prev_mask = (u32)(type)-1 << bitoff;\
+   \
+   __asm__ __volatile__(   \
+"1:lwarx   %0,0,%3\n"  \
+"  andc%1,%0,%5\n" \
+"  or  %1,%1,%4\n" \
+   PPC405_ERR77(0,%3)  \
+"  stwcx.  %1,0,%3\n"  \
+"  bne-1b\n"   \
+   : "=&r" (prev), "=&r" (tmp), "+m" (*(u32*)p)\
+   : "r" (p), "r" (val), "r" (prev_mask)   \
+   : "cc", cl);\
+   \
+   return prev >> bitoff;  \
+}
+
+#define CMPXCHG_GEN(type, sfx, br, br2, cl)\
+static inline  \
+u32 __cmpxchg_##type##sfx(void *p, u32 old, u32 new)   \
+{  \
+   unsigned int prev, prev_mask, tmp, bitoff, off; \
+   \
+   off = (unsigned long)p % sizeof(u32);   \
+   bitoff = BITOFF_CAL(sizeof(type), off); \
+   p -= off;   \
+   old <<= bitoff; \
+   new <<= bitoff; \
+   prev_mask = (u32)(type)-1 << bitoff;\
+   \
+   __asm__ __volatile__(   \
+   br  \
+"1:lwarx   %0,0,%3\n"  \
+"  and %1,%0,%6\n" \
+"  cmpw0,%1,%4\n"  \
+"  bne-2f\n"   \
+"  andc%1,%0,%6\n" \
+"  or  %1,%1,%5\n" \
+   PPC405_ERR77(0,%3)  \
+"  stwcx.  %1,0,%3\n"  \
+"  bne-1b\n"   \
+   br2 \
+   "\n"\
+"2:"   \
+   : "=&r" (prev), "=&r" (tmp), "+m" (*(u32*)p)\
+   : "r" (p), "r" (old), "r" (new), "r" (prev_mask)\
+   : "cc", cl);   

Re: [PATCH V4] powerpc: Implement {cmp}xchg for u8 and u16

2016-04-28 Thread Pan Xinhui


On 2016-04-27 22:59, Boqun Feng wrote:
> On Wed, Apr 27, 2016 at 10:50:34PM +0800, Boqun Feng wrote:
>>
>> Sorry, my bad, we can't implement cmpxchg like this.. please ignore
>> this, I should really go to bed soon...
>>
>> But still, we can save the "tmp" for xchg() I think.
>>
> 
> No.. we can't. Sorry for all the noise.
> 
> This patch looks good to me.
> 
> FWIW, you can add
> 
> Acked-by: Boqun Feng 
> 
thanks!

> Regards,
> Boqun
> 


Re: [PATCH V4] powerpc: Implement {cmp}xchg for u8 and u16

2016-04-28 Thread Pan Xinhui


On 2016-04-28 15:59, Peter Zijlstra wrote:
> On Wed, Apr 27, 2016 at 05:16:45PM +0800, Pan Xinhui wrote:
>> From: Pan Xinhui 
>>
>> Implement xchg{u8,u16}{local,relaxed}, and
>> cmpxchg{u8,u16}{,local,acquire,relaxed}.
>>
>> It works on all ppc.
>>
>> remove volatile of first parameter in __cmpxchg_local and __cmpxchg
>>
>> Suggested-by: Peter Zijlstra (Intel) 
>> Signed-off-by: Pan Xinhui 
> 
> Generally has the right shape; and I trust others to double check the
> ppc-asm minutia.
> 
> Acked-by: Peter Zijlstra (Intel) 
> 
> 
thanks!


[PATCH] powerpc: enable qspinlock and its virtualization support

2016-04-28 Thread Pan Xinhui
From: Pan Xinhui 

This patch aims to enable qspinlock on PPC. On the pseries platform, it also
supports paravirt qspinlock.

Signed-off-by: Pan Xinhui 
---
 arch/powerpc/include/asm/qspinlock.h   | 37 +++
 arch/powerpc/include/asm/qspinlock_paravirt.h  | 36 +++
 .../powerpc/include/asm/qspinlock_paravirt_types.h | 13 ++
 arch/powerpc/include/asm/spinlock.h| 31 -
 arch/powerpc/include/asm/spinlock_types.h  |  4 ++
 arch/powerpc/kernel/paravirt.c | 52 ++
 arch/powerpc/lib/locks.c   | 32 +
 arch/powerpc/platforms/pseries/setup.c |  5 +++
 8 files changed, 198 insertions(+), 12 deletions(-)
 create mode 100644 arch/powerpc/include/asm/qspinlock.h
 create mode 100644 arch/powerpc/include/asm/qspinlock_paravirt.h
 create mode 100644 arch/powerpc/include/asm/qspinlock_paravirt_types.h
 create mode 100644 arch/powerpc/kernel/paravirt.c

diff --git a/arch/powerpc/include/asm/qspinlock.h b/arch/powerpc/include/asm/qspinlock.h
new file mode 100644
index 000..d4e4dc3
--- /dev/null
+++ b/arch/powerpc/include/asm/qspinlock.h
@@ -0,0 +1,37 @@
+#ifndef _ASM_POWERPC_QSPINLOCK_H
+#define _ASM_POWERPC_QSPINLOCK_H
+
+#include <asm-generic/qspinlock_types.h>
+
+#define SPIN_THRESHOLD (1 << 15)
+#define queued_spin_unlock queued_spin_unlock
+
+static inline void native_queued_spin_unlock(struct qspinlock *lock)
+{
+   /* no load/store can be across the unlock()*/
+   smp_store_release((u8 *)lock, 0);
+}
+
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+
+#include <asm/qspinlock_paravirt.h>
+
+static inline void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
+{
+   pv_queued_spin_lock(lock, val);
+}
+
+static inline void queued_spin_unlock(struct qspinlock *lock)
+{
+   pv_queued_spin_unlock(lock);
+}
+#else
+static inline void queued_spin_unlock(struct qspinlock *lock)
+{
+   native_queued_spin_unlock(lock);
+}
+#endif
+
+#include <asm-generic/qspinlock.h>
+
+#endif /* _ASM_POWERPC_QSPINLOCK_H */
diff --git a/arch/powerpc/include/asm/qspinlock_paravirt.h b/arch/powerpc/include/asm/qspinlock_paravirt.h
new file mode 100644
index 000..86e81c3
--- /dev/null
+++ b/arch/powerpc/include/asm/qspinlock_paravirt.h
@@ -0,0 +1,36 @@
+#ifndef CONFIG_PARAVIRT_SPINLOCKS
+#error "do not include this file"
+#endif
+
+#ifndef _ASM_QSPINLOCK_PARAVIRT_H
+#define _ASM_QSPINLOCK_PARAVIRT_H
+
+#include <asm/qspinlock_paravirt_types.h>
+
+extern void pv_lock_init(void);
+extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
+extern void __pv_init_lock_hash(void);
+extern void __pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
+extern void __pv_queued_spin_unlock(struct qspinlock *lock);
+
+static inline void pv_queued_spin_lock(struct qspinlock *lock, u32 val)
+{
+   pv_lock_op.lock(lock, val);
+}
+
+static inline void pv_queued_spin_unlock(struct qspinlock *lock)
+{
+   pv_lock_op.unlock(lock);
+}
+
+static inline void pv_wait(u8 *ptr, u8 val)
+{
+   pv_lock_op.wait(ptr, val, -1);
+}
+
+static inline void pv_kick(int cpu)
+{
+   pv_lock_op.kick(cpu);
+}
+
+#endif
diff --git a/arch/powerpc/include/asm/qspinlock_paravirt_types.h b/arch/powerpc/include/asm/qspinlock_paravirt_types.h
new file mode 100644
index 000..e1fdeb0
--- /dev/null
+++ b/arch/powerpc/include/asm/qspinlock_paravirt_types.h
@@ -0,0 +1,13 @@
+#ifndef _ASM_QSPINLOCK_PARAVIRT_TYPES_H
+#define _ASM_QSPINLOCK_PARAVIRT_TYPES_H
+
+struct pv_lock_ops {
+   void (*lock)(struct qspinlock *lock, u32 val);
+   void (*unlock)(struct qspinlock *lock);
+   void (*wait)(u8 *ptr, u8 val, int cpu);
+   void (*kick)(int cpu);
+};
+
+extern struct pv_lock_ops pv_lock_op;
+
+#endif
diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h
index 523673d..3b65372 100644
--- a/arch/powerpc/include/asm/spinlock.h
+++ b/arch/powerpc/include/asm/spinlock.h
@@ -52,6 +52,24 @@
 #define SYNC_IO
 #endif
 
+#if defined(CONFIG_PPC_SPLPAR)
+/* We only yield to the hypervisor if we are in shared processor mode */
+#define SHARED_PROCESSOR (lppaca_shared_proc(local_paca->lppaca_ptr))
+extern void __spin_yield(arch_spinlock_t *lock);
+extern void __spin_yield_cpu(int cpu);
+extern void __spin_wake_cpu(int cpu);
+extern void __rw_yield(arch_rwlock_t *lock);
+#else /* SPLPAR */
+#define __spin_yield(x)	barrier()
+#define __spin_yield_cpu(x) barrier()
+#define __spin_wake_cpu(x) barrier()
+#define __rw_yield(x)  barrier()
+#define SHARED_PROCESSOR   0
+#endif
+
+#ifdef CONFIG_QUEUED_SPINLOCKS
+#include 
+#else
 static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock)
 {
return lock.slock == 0;
@@ -106,18 +124,6 @@ static inline int arch_spin_trylock(arch_spinlock_t *lock)
  * held.  Conveniently, we have a word in the paca that holds this
  * value.
  */
-
-#if defined(CONFIG_PPC_SPLPAR)
-/* We only yield to the hypervisor if we are in shared processor mode */
-#defi

[PATCH resend] powerpc: enable qspinlock and its virtualization support

2016-04-28 Thread Pan Xinhui
From: Pan Xinhui 

This patch aims to enable qspinlock on PPC. On the pseries platform, it also
supports paravirt qspinlock.

Signed-off-by: Pan Xinhui 
---
 arch/powerpc/include/asm/qspinlock.h   | 37 +++
 arch/powerpc/include/asm/qspinlock_paravirt.h  | 36 +++
 .../powerpc/include/asm/qspinlock_paravirt_types.h | 13 ++
 arch/powerpc/include/asm/spinlock.h| 31 -
 arch/powerpc/include/asm/spinlock_types.h  |  4 ++
 arch/powerpc/kernel/paravirt.c | 52 ++
 arch/powerpc/lib/locks.c   | 32 +
 arch/powerpc/platforms/pseries/setup.c |  5 +++
 8 files changed, 198 insertions(+), 12 deletions(-)
 create mode 100644 arch/powerpc/include/asm/qspinlock.h
 create mode 100644 arch/powerpc/include/asm/qspinlock_paravirt.h
 create mode 100644 arch/powerpc/include/asm/qspinlock_paravirt_types.h
 create mode 100644 arch/powerpc/kernel/paravirt.c

diff --git a/arch/powerpc/include/asm/qspinlock.h b/arch/powerpc/include/asm/qspinlock.h
new file mode 100644
index 000..d4e4dc3
--- /dev/null
+++ b/arch/powerpc/include/asm/qspinlock.h
@@ -0,0 +1,37 @@
+#ifndef _ASM_POWERPC_QSPINLOCK_H
+#define _ASM_POWERPC_QSPINLOCK_H
+
+#include <asm-generic/qspinlock_types.h>
+
+#define SPIN_THRESHOLD (1 << 15)
+#define queued_spin_unlock queued_spin_unlock
+
+static inline void native_queued_spin_unlock(struct qspinlock *lock)
+{
+   /* no load/store can be across the unlock()*/
+   smp_store_release((u8 *)lock, 0);
+}
+
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+
+#include <asm/qspinlock_paravirt.h>
+
+static inline void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
+{
+   pv_queued_spin_lock(lock, val);
+}
+
+static inline void queued_spin_unlock(struct qspinlock *lock)
+{
+   pv_queued_spin_unlock(lock);
+}
+#else
+static inline void queued_spin_unlock(struct qspinlock *lock)
+{
+   native_queued_spin_unlock(lock);
+}
+#endif
+
+#include <asm-generic/qspinlock.h>
+
+#endif /* _ASM_POWERPC_QSPINLOCK_H */
diff --git a/arch/powerpc/include/asm/qspinlock_paravirt.h b/arch/powerpc/include/asm/qspinlock_paravirt.h
new file mode 100644
index 000..86e81c3
--- /dev/null
+++ b/arch/powerpc/include/asm/qspinlock_paravirt.h
@@ -0,0 +1,36 @@
+#ifndef CONFIG_PARAVIRT_SPINLOCKS
+#error "do not include this file"
+#endif
+
+#ifndef _ASM_QSPINLOCK_PARAVIRT_H
+#define _ASM_QSPINLOCK_PARAVIRT_H
+
+#include <asm/qspinlock_paravirt_types.h>
+
+extern void pv_lock_init(void);
+extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
+extern void __pv_init_lock_hash(void);
+extern void __pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
+extern void __pv_queued_spin_unlock(struct qspinlock *lock);
+
+static inline void pv_queued_spin_lock(struct qspinlock *lock, u32 val)
+{
+   pv_lock_op.lock(lock, val);
+}
+
+static inline void pv_queued_spin_unlock(struct qspinlock *lock)
+{
+   pv_lock_op.unlock(lock);
+}
+
+static inline void pv_wait(u8 *ptr, u8 val)
+{
+   pv_lock_op.wait(ptr, val, -1);
+}
+
+static inline void pv_kick(int cpu)
+{
+   pv_lock_op.kick(cpu);
+}
+
+#endif
diff --git a/arch/powerpc/include/asm/qspinlock_paravirt_types.h b/arch/powerpc/include/asm/qspinlock_paravirt_types.h
new file mode 100644
index 000..e1fdeb0
--- /dev/null
+++ b/arch/powerpc/include/asm/qspinlock_paravirt_types.h
@@ -0,0 +1,13 @@
+#ifndef _ASM_QSPINLOCK_PARAVIRT_TYPES_H
+#define _ASM_QSPINLOCK_PARAVIRT_TYPES_H
+
+struct pv_lock_ops {
+   void (*lock)(struct qspinlock *lock, u32 val);
+   void (*unlock)(struct qspinlock *lock);
+   void (*wait)(u8 *ptr, u8 val, int cpu);
+   void (*kick)(int cpu);
+};
+
+extern struct pv_lock_ops pv_lock_op;
+
+#endif
diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h
index 523673d..3b65372 100644
--- a/arch/powerpc/include/asm/spinlock.h
+++ b/arch/powerpc/include/asm/spinlock.h
@@ -52,6 +52,24 @@
 #define SYNC_IO
 #endif
 
+#if defined(CONFIG_PPC_SPLPAR)
+/* We only yield to the hypervisor if we are in shared processor mode */
+#define SHARED_PROCESSOR (lppaca_shared_proc(local_paca->lppaca_ptr))
+extern void __spin_yield(arch_spinlock_t *lock);
+extern void __spin_yield_cpu(int cpu);
+extern void __spin_wake_cpu(int cpu);
+extern void __rw_yield(arch_rwlock_t *lock);
+#else /* SPLPAR */
+#define __spin_yield(x)	barrier()
+#define __spin_yield_cpu(x) barrier()
+#define __spin_wake_cpu(x) barrier()
+#define __rw_yield(x)  barrier()
+#define SHARED_PROCESSOR   0
+#endif
+
+#ifdef CONFIG_QUEUED_SPINLOCKS
+#include 
+#else
 static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock)
 {
return lock.slock == 0;
@@ -106,18 +124,6 @@ static inline int arch_spin_trylock(arch_spinlock_t *lock)
  * held.  Conveniently, we have a word in the paca that holds this
  * value.
  */
-
-#if defined(CONFIG_PPC_SPLPAR)
-/* We only yield to the hypervisor if we are in shared processor mode */
-#defi

Re: [PATCH resend] powerpc: enable qspinlock and its virtualization support

2016-04-28 Thread Pan Xinhui

On 2016-04-29 05:07, Waiman Long wrote:
> On 04/28/2016 06:55 AM, Pan Xinhui wrote:
>> From: Pan Xinhui
>>
>> This patch aims to enable qspinlock on PPC. And on pseries platform, it also 
>> support
>> paravirt qspinlock.
>>
>> Signed-off-by: Pan Xinhui
>> ---
>>   arch/powerpc/include/asm/qspinlock.h   | 37 +++
>>   arch/powerpc/include/asm/qspinlock_paravirt.h  | 36 +++
>>   .../powerpc/include/asm/qspinlock_paravirt_types.h | 13 ++
>>   arch/powerpc/include/asm/spinlock.h| 31 -
>>   arch/powerpc/include/asm/spinlock_types.h  |  4 ++
>>   arch/powerpc/kernel/paravirt.c | 52 
>> ++
>>   arch/powerpc/lib/locks.c   | 32 +
>>   arch/powerpc/platforms/pseries/setup.c |  5 +++
>>   8 files changed, 198 insertions(+), 12 deletions(-)
>>   create mode 100644 arch/powerpc/include/asm/qspinlock.h
>>   create mode 100644 arch/powerpc/include/asm/qspinlock_paravirt.h
>>   create mode 100644 arch/powerpc/include/asm/qspinlock_paravirt_types.h
>>   create mode 100644 arch/powerpc/kernel/paravirt.c
>>
>>
> 
> This is just an enablement patch. You will also need a patch to activate 
> qspinlock for, at least, some PPC configs. Right?
> 
Yep, I plan to enable the config and Makefile changes last.
It just looks like:

diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 2da380f..ae7c2f1 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -50,6 +50,7 @@ obj-$(CONFIG_PPC_970_NAP) += idle_power4.o
 obj-$(CONFIG_PPC_P7_NAP)   += idle_power7.o
 procfs-y   := proc_powerpc.o
 obj-$(CONFIG_PROC_FS)  += $(procfs-y)
+obj-$(CONFIG_PARAVIRT_SPINLOCKS)   += paravirt.o
 rtaspci-$(CONFIG_PPC64)-$(CONFIG_PCI)  := rtas_pci.o
 obj-$(CONFIG_PPC_RTAS) += rtas.o rtas-rtc.o $(rtaspci-y-y)
 obj-$(CONFIG_PPC_RTAS_DAEMON)  += rtasd.o
diff --git a/arch/powerpc/platforms/pseries/Kconfig 
b/arch/powerpc/platforms/pseries/Kconfig
index bec90fb..46632e4 100644
--- a/arch/powerpc/platforms/pseries/Kconfig
+++ b/arch/powerpc/platforms/pseries/Kconfig
@@ -21,6 +21,7 @@ config PPC_PSERIES
select HOTPLUG_CPU if SMP
select ARCH_RANDOM
select PPC_DOORBELL
+   select ARCH_USE_QUEUED_SPINLOCKS
default y
 
 config PPC_SPLPAR
@@ -127,3 +128,11 @@ config HV_PERF_CTRS
  systems. 24x7 is available on Power 8 systems.
 
   If unsure, select Y.
+
+config PARAVIRT_SPINLOCKS
+   bool "Paravirtialization support for qspinlock"
+   depends on PPC_SPLPAR && QUEUED_SPINLOCKS
+   default y
+   help
+ If platform supports virtualization, for example PowerVM, this option
+ can let the guest have better performance.
-- 
2.4.3
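
For context, the select chain works because the generic option is derived
from the arch opt-in; at the time, kernel/Kconfig.locks read roughly:

	config QUEUED_SPINLOCKS
		def_bool y if ARCH_USE_QUEUED_SPINLOCKS
		depends on SMP

so selecting ARCH_USE_QUEUED_SPINLOCKS from PPC_PSERIES is enough to switch
SMP builds over to qspinlock.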

> It has dependency on the pv_wait() patch that I sent out extend the parameter 
> list. Some performance data on how PPC system will perform with and without 
> qspinlock will also be helpful data points.
> 
For now, pv_wait() as defined on ppc is: static inline void pv_wait(u8 *ptr, u8 val).
My plan is to wait for your patch to go into the kernel tree first, then send
out another patch to extend the parameter list.

Yes, I need to include some performance data in my patch's comments.

thanks
xinhui
> Cheers,
> Longman
> 


Re: [PATCH] powerpc: cputime: fix a compile warning

2016-12-01 Thread Pan Xinhui



On 2016/12/2 12:35, yjin wrote:


On 2016-12-02 12:22, Balbir Singh wrote:

On Fri, Dec 2, 2016 at 3:15 PM, Michael Ellerman  wrote:

yanjiang@windriver.com writes:


diff --git a/arch/powerpc/include/asm/cputime.h b/arch/powerpc/include/asm/cputime.h
index 4f60db0..4423e97 100644
--- a/arch/powerpc/include/asm/cputime.h
+++ b/arch/powerpc/include/asm/cputime.h
@@ -228,7 +228,8 @@ static inline cputime_t clock_t_to_cputime(const unsigned long clk)
   return (__force cputime_t) ct;
  }

-#define cputime64_to_clock_t(ct) cputime_to_clock_t((cputime_t)(ct))
+#define cputime64_to_clock_t(ct) \
+ (__force u64)(cputime_to_clock_t((cputime_t)(ct)))

Given the name of the function is "cputime64 to clock_t", surely we
should be returning a clock_t ?

Please fix it in cpuacct.c  Also check out git commit
527b0a76f41d062381adbb55c8eb61e32cb0bfc9
sched/cpuacct: Avoid %lld seq_printf warning


Hi Balbir,

Where can I find this commit?


hello,
it is in the linux-next tree. :)

commit 527b0a76f41d062381adbb55c8eb61e32cb0bfc9
Author: Martin Schwidefsky 
Date:   Fri Nov 11 15:27:49 2016 +0100

sched/cpuacct: Avoid %lld seq_printf warning

For s390 kernel builds I keep getting this warning:

 kernel/sched/cpuacct.c: In function 'cpuacct_stats_show':

 kernel/sched/cpuacct.c:298:25: warning: format '%lld' expects argument of 
type 'long long int', but argument 4 has type 'clock_t {aka long int}' 
[-Wformat=]
   seq_printf(sf, "%s %lld\n",

Silence the warning by adding an explicit cast.

Signed-off-by: Martin Schwidefsky 

Signed-off-by: Peter Zijlstra (Intel) 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Link: http://lkml.kernel.org/r/2016142749.6545-1-schwidef...@de.ibm.com
Signed-off-by: Ingo Molnar 

diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index bc0b309c..9add206 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -297,7 +297,7 @@ static int cpuacct_stats_show(struct seq_file *sf, void *v)
for (stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) {
seq_printf(sf, "%s %lld\n",
   cpuacct_stat_desc[stat],
-  cputime64_to_clock_t(val[stat]));
+  (long long)cputime64_to_clock_t(val[stat]));
}
 
return 0;



Thanks!
Yanjiang


Balbir
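
For readers unfamiliar with this warning class, a minimal user-space
reproduction (illustrative only; on many 64-bit ABIs clock_t is 'long', so
it does not match %lld without a cast):

        #include <stdio.h>
        #include <time.h>                       /* clock_t */

        int main(void)
        {
                clock_t t = 100;

                /* printf("%lld\n", t);         triggers -Wformat here */
                printf("%lld\n", (long long)t); /* explicit cast, no warning */
                return 0;
        }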






[PATCH v8 0/6] Implement qspinlock/pv-qspinlock on ppc

2016-12-05 Thread Pan Xinhui
Pipe-based Context Switching      529.8    578.1    564.2
Process Creation                  408.4    421.6    287.6
Shell Scripts (1 concurrent)     1201.8   1215.3   1185.8
Shell Scripts (8 concurrent)     3758.4   3799.3   3878.9
System Call Overhead             1008.3   1122.6   1134.2
                                 ======   ======   ======
System Benchmarks Index Score    1072.0   1108.9   1050.6
------------------------------------------------------------

Pan Xinhui (6):
  powerpc/qspinlock: powerpc support qspinlock
  powerpc: pSeries/Kconfig: Add qspinlock build config
  powerpc: lib/locks.c: Add cpu yield/wake helper function
  powerpc/pv-qspinlock: powerpc support pv-qspinlock
  powerpc: pSeries: Add pv-qspinlock build config/make
  powerpc/pv-qspinlock: Optimise native unlock path

 arch/powerpc/include/asm/qspinlock.h   |  93 
 arch/powerpc/include/asm/qspinlock_paravirt.h  |  52 +++
 .../powerpc/include/asm/qspinlock_paravirt_types.h |  13 ++
 arch/powerpc/include/asm/spinlock.h|  35 +++--
 arch/powerpc/include/asm/spinlock_types.h  |   4 +
 arch/powerpc/kernel/Makefile   |   1 +
 arch/powerpc/kernel/paravirt.c | 157 +
 arch/powerpc/lib/locks.c   | 122 
 arch/powerpc/platforms/pseries/Kconfig |  16 +++
 arch/powerpc/platforms/pseries/setup.c |   5 +
 10 files changed, 485 insertions(+), 13 deletions(-)
 create mode 100644 arch/powerpc/include/asm/qspinlock.h
 create mode 100644 arch/powerpc/include/asm/qspinlock_paravirt.h
 create mode 100644 arch/powerpc/include/asm/qspinlock_paravirt_types.h
 create mode 100644 arch/powerpc/kernel/paravirt.c

-- 
2.4.11



[PATCH v8 1/6] powerpc/qspinlock: powerpc support qspinlock

2016-12-05 Thread Pan Xinhui
This patch adds basic code to enable qspinlock on powerpc. qspinlock is
one kind of fair-lock implementation, and some performance improvement has
been seen under some scenarios.

queued_spin_unlock() releases the lock with just one write of 0 to the
::locked field, which sits at a different place on big- and little-endian
systems.

We override some arch_spin_XXX functions, as powerpc has io_sync handling
which makes sure the io operations are protected by the lock correctly.

There is another special case, see commit
2c610022711 ("locking/qspinlock: Fix spin_unlock_wait() some more")

Signed-off-by: Pan Xinhui 
---
 arch/powerpc/include/asm/qspinlock.h  | 66 +++
 arch/powerpc/include/asm/spinlock.h   | 31 +--
 arch/powerpc/include/asm/spinlock_types.h |  4 ++
 arch/powerpc/lib/locks.c  | 59 +++
 4 files changed, 147 insertions(+), 13 deletions(-)
 create mode 100644 arch/powerpc/include/asm/qspinlock.h

diff --git a/arch/powerpc/include/asm/qspinlock.h 
b/arch/powerpc/include/asm/qspinlock.h
new file mode 100644
index 000..4c89256
--- /dev/null
+++ b/arch/powerpc/include/asm/qspinlock.h
@@ -0,0 +1,66 @@
+#ifndef _ASM_POWERPC_QSPINLOCK_H
+#define _ASM_POWERPC_QSPINLOCK_H
+
+#include 
+
+#define SPIN_THRESHOLD (1 << 15)
+#define queued_spin_unlock queued_spin_unlock
+#define queued_spin_is_locked queued_spin_is_locked
+#define queued_spin_unlock_wait queued_spin_unlock_wait
+
+extern void queued_spin_unlock_wait(struct qspinlock *lock);
+
+static inline u8 *__qspinlock_lock_byte(struct qspinlock *lock)
+{
+   return (u8 *)lock + 3 * IS_BUILTIN(CONFIG_CPU_BIG_ENDIAN);
+}
+
+static inline void queued_spin_unlock(struct qspinlock *lock)
+{
+   /* release semantics is required */
+   smp_store_release(__qspinlock_lock_byte(lock), 0);
+}
+
+static inline int queued_spin_is_locked(struct qspinlock *lock)
+{
+   smp_mb();
+   return atomic_read(&lock->val);
+}
+
+#include 
+
+/* we need to override these as ppc has io_sync handling */
+#undef arch_spin_trylock
+#undef arch_spin_lock
+#undef arch_spin_lock_flags
+#undef arch_spin_unlock
+#define arch_spin_trylock arch_spin_trylock
+#define arch_spin_lock arch_spin_lock
+#define arch_spin_lock_flags arch_spin_lock_flags
+#define arch_spin_unlock arch_spin_unlock
+
+static inline int arch_spin_trylock(arch_spinlock_t *lock)
+{
+   CLEAR_IO_SYNC;
+   return queued_spin_trylock(lock);
+}
+
+static inline void arch_spin_lock(arch_spinlock_t *lock)
+{
+   CLEAR_IO_SYNC;
+   queued_spin_lock(lock);
+}
+
+static inline
+void arch_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags)
+{
+   CLEAR_IO_SYNC;
+   queued_spin_lock(lock);
+}
+
+static inline void arch_spin_unlock(arch_spinlock_t *lock)
+{
+   SYNC_IO;
+   queued_spin_unlock(lock);
+}
+#endif /* _ASM_POWERPC_QSPINLOCK_H */
diff --git a/arch/powerpc/include/asm/spinlock.h 
b/arch/powerpc/include/asm/spinlock.h
index 8c1b913..954099e 100644
--- a/arch/powerpc/include/asm/spinlock.h
+++ b/arch/powerpc/include/asm/spinlock.h
@@ -60,6 +60,23 @@ static inline bool vcpu_is_preempted(int cpu)
 }
 #endif
 
+#if defined(CONFIG_PPC_SPLPAR)
+/* We only yield to the hypervisor if we are in shared processor mode */
+#define SHARED_PROCESSOR (lppaca_shared_proc(local_paca->lppaca_ptr))
+extern void __spin_yield(arch_spinlock_t *lock);
+extern void __rw_yield(arch_rwlock_t *lock);
+#else /* SPLPAR */
+#define __spin_yield(x)barrier()
+#define __rw_yield(x)  barrier()
+#define SHARED_PROCESSOR   0
+#endif
+
+#ifdef CONFIG_QUEUED_SPINLOCKS
+#include 
+#else
+
+#define arch_spin_relax(lock)  __spin_yield(lock)
+
 static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock)
 {
return lock.slock == 0;
@@ -114,18 +131,6 @@ static inline int arch_spin_trylock(arch_spinlock_t *lock)
  * held.  Conveniently, we have a word in the paca that holds this
  * value.
  */
-
-#if defined(CONFIG_PPC_SPLPAR)
-/* We only yield to the hypervisor if we are in shared processor mode */
-#define SHARED_PROCESSOR (lppaca_shared_proc(local_paca->lppaca_ptr))
-extern void __spin_yield(arch_spinlock_t *lock);
-extern void __rw_yield(arch_rwlock_t *lock);
-#else /* SPLPAR */
-#define __spin_yield(x)barrier()
-#define __rw_yield(x)  barrier()
-#define SHARED_PROCESSOR   0
-#endif
-
 static inline void arch_spin_lock(arch_spinlock_t *lock)
 {
CLEAR_IO_SYNC;
@@ -203,6 +208,7 @@ static inline void arch_spin_unlock_wait(arch_spinlock_t 
*lock)
smp_mb();
 }
 
+#endif /* !CONFIG_QUEUED_SPINLOCKS */
 /*
  * Read-write spinlocks, allowing multiple readers
  * but only one writer.
@@ -338,7 +344,6 @@ static inline void arch_write_unlock(arch_rwlock_t *rw)
 #define arch_read_lock_flags(lock, flags) arch_read_lock(lock)
 #define arch_write_lock_flags(lock, flags) arch_write_lock(lock)
 
-#define arch_spin_relax(lock)  __spin_yield(lock)
 #define arch_read_relax(loc
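
As an editorial aside, the endianness detail above can be checked with a
small self-contained program that mirrors __qspinlock_lock_byte() (this is
an illustration, not part of the posted patch):

        #include <stdint.h>
        #include <stdio.h>

        struct qspinlock { uint32_t val; };     /* low 8 bits are ::locked */

        static uint8_t *lock_byte(struct qspinlock *lock, int big_endian)
        {
                /* the LSB of val lives at byte 3 on BE, byte 0 on LE */
                return (uint8_t *)lock + 3 * big_endian;
        }

        int main(void)
        {
                struct qspinlock l = { .val = 1 };              /* locked */
                union { uint32_t u; uint8_t b[4]; } p = { .u = 1 };
                int big_endian = (p.b[3] == 1);                 /* runtime check */

                *lock_byte(&l, big_endian) = 0;                 /* one-byte unlock store */
                printf("val after unlock store: %u\n", l.val);  /* prints 0 */
                return 0;
        }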

[PATCH v8 2/6] powerpc: pSeries/Kconfig: Add qspinlock build config

2016-12-05 Thread Pan Xinhui
pSeries/powerNV will use qspinlock from now on.

Signed-off-by: Pan Xinhui 
---
 arch/powerpc/platforms/pseries/Kconfig | 8 
 1 file changed, 8 insertions(+)

diff --git a/arch/powerpc/platforms/pseries/Kconfig 
b/arch/powerpc/platforms/pseries/Kconfig
index bec90fb..8a87d06 100644
--- a/arch/powerpc/platforms/pseries/Kconfig
+++ b/arch/powerpc/platforms/pseries/Kconfig
@@ -23,6 +23,14 @@ config PPC_PSERIES
select PPC_DOORBELL
default y
 
+config ARCH_USE_QUEUED_SPINLOCKS
+   default y
+   bool "Enable qspinlock"
+   help
+ Enabling this option will let the kernel use qspinlock, which is a kind
+ of fair lock.  It has shown a good performance improvement on x86 and
+ also ppc, especially in high contention cases.
+
 config PPC_SPLPAR
depends on PPC_PSERIES
bool "Support for shared-processor logical partitions"
-- 
2.4.11
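
Assuming the generic wiring in kernel/Kconfig.locks (which turns
ARCH_USE_QUEUED_SPINLOCKS into QUEUED_SPINLOCKS on SMP builds), the result
can be verified after configuring; an illustrative session:

        $ grep QUEUED_SPINLOCKS .config
        CONFIG_ARCH_USE_QUEUED_SPINLOCKS=y
        CONFIG_QUEUED_SPINLOCKS=y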



[PATCH v8 3/6] powerpc: lib/locks.c: Add cpu yield/wake helper function

2016-12-05 Thread Pan Xinhui
Add two corresponding helper functions to support pv-qspinlock.

For normal use, __spin_yield_cpu() confers the current vcpu's slices to the
target vcpu (say, a lock holder). If the target vcpu is not specified, or it
is in the running state, whether such conferring to the lpar happens depends
on the second parameter, *confer*.

Because the hcall itself introduces latency and a little overhead, we do NOT
want to suffer any latency in some cases, e.g. in an interrupt handler. The
second parameter *confer* can indicate such a case.

__spin_wake_cpu() is simpler; it wakes up one vcpu regardless of that vcpu's
current state.

Signed-off-by: Pan Xinhui 
---
 arch/powerpc/include/asm/spinlock.h |  4 +++
 arch/powerpc/lib/locks.c| 59 +
 2 files changed, 63 insertions(+)

diff --git a/arch/powerpc/include/asm/spinlock.h 
b/arch/powerpc/include/asm/spinlock.h
index 954099e..6426bd5 100644
--- a/arch/powerpc/include/asm/spinlock.h
+++ b/arch/powerpc/include/asm/spinlock.h
@@ -64,9 +64,13 @@ static inline bool vcpu_is_preempted(int cpu)
 /* We only yield to the hypervisor if we are in shared processor mode */
 #define SHARED_PROCESSOR (lppaca_shared_proc(local_paca->lppaca_ptr))
 extern void __spin_yield(arch_spinlock_t *lock);
+extern void __spin_yield_cpu(int cpu, int confer);
+extern void __spin_wake_cpu(int cpu);
 extern void __rw_yield(arch_rwlock_t *lock);
 #else /* SPLPAR */
 #define __spin_yield(x)barrier()
+#define __spin_yield_cpu(x, y) barrier()
+#define __spin_wake_cpu(x) barrier()
 #define __rw_yield(x)  barrier()
 #define SHARED_PROCESSOR   0
 #endif
diff --git a/arch/powerpc/lib/locks.c b/arch/powerpc/lib/locks.c
index 6574626..bd872c9 100644
--- a/arch/powerpc/lib/locks.c
+++ b/arch/powerpc/lib/locks.c
@@ -23,6 +23,65 @@
 #include 
 #include 
 
+/*
+ * Confer our slices to the specified cpu and return. If that cpu is in the
+ * running state, or cpu is -1, then we check *confer*: if confer is zero we
+ * return, otherwise we confer our slices to the lpar.
+ */
+void __spin_yield_cpu(int cpu, int confer)
+{
+   unsigned int holder_cpu = cpu, yield_count;
+
+   if (cpu == -1)
+   goto yield_to_lpar;
+
+   BUG_ON(holder_cpu >= nr_cpu_ids);
+   yield_count = be32_to_cpu(lppaca_of(holder_cpu).yield_count);
+
+   /* if the cpu is running, confer slices to the lpar conditionally */
+   if ((yield_count & 1) == 0)
+   goto yield_to_lpar;
+
+   plpar_hcall_norets(H_CONFER,
+   get_hard_smp_processor_id(holder_cpu), yield_count);
+   return;
+
+yield_to_lpar:
+   if (confer)
+   plpar_hcall_norets(H_CONFER, -1, 0);
+}
+EXPORT_SYMBOL_GPL(__spin_yield_cpu);
+
+void __spin_wake_cpu(int cpu)
+{
+   unsigned int holder_cpu = cpu;
+
+   BUG_ON(holder_cpu >= nr_cpu_ids);
+   /*
+* NOTE: we should always do this hcall regardless of
+* the yield_count of the holder_cpu.
+* as there might be a case like below:
+*  CPU 1   CPU 2
+*  yielded = true
+* if (yielded)
+*  __spin_wake_cpu()
+*  __spin_yield_cpu()
+*
+* So we might lose a wake if we check the yield_count and
+* return directly if the holder_cpu is running.
+* IOW. do NOT code like below.
+*  yield_count = be32_to_cpu(lppaca_of(holder_cpu).yield_count);
+*  if ((yield_count & 1) == 0)
+*  return;
+*
+* a PROD hcall marks the target_cpu prodded, which causes the next cede
+* or confer called on the target_cpu to be invalid.
+*/
+   plpar_hcall_norets(H_PROD,
+   get_hard_smp_processor_id(holder_cpu));
+}
+EXPORT_SYMBOL_GPL(__spin_wake_cpu);
+
 #ifndef CONFIG_QUEUED_SPINLOCKS
 void __spin_yield(arch_spinlock_t *lock)
 {
-- 
2.4.11
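
The callers of these helpers live in arch/powerpc/kernel/paravirt.c, which
this archive truncates. A minimal sketch of how they could plausibly be
wired up (names and shapes are hypothetical, inferred from the commit
messages; spin_lock_holder() arrives with the next patch):

        /* hypothetical: 'spinning_on' is the per-cpu lock record that
         * patch 4 describes ("store the lock in a per_cpu struct") */
        static void __pv_wait(u8 *ptr, u8 val)
        {
                void *lock = this_cpu_read(spinning_on);
                int holder = spin_lock_holder(lock);    /* -1 if unknown */

                if (READ_ONCE(*ptr) != val)
                        return;                 /* state changed, don't yield */

                /* confer to the likely holder; the second argument allows
                 * the unconditional confer-to-lpar fallback */
                __spin_yield_cpu(holder, 1);
        }

        static void __pv_kick(int cpu)
        {
                /* unconditional H_PROD; see the comment in __spin_wake_cpu() */
                __spin_wake_cpu(cpu);
        }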



[PATCH v8 4/6] powerpc/pv-qspinlock: powerpc support pv-qspinlock

2016-12-05 Thread Pan Xinhui
The default pv-qspinlock uses the native version of qspinlock.
pv_lock initialization should be done at boot stage with irqs disabled.
And if we run as a guest in powerKVM/pHyp shared_processor mode, restore
the pv_lock_ops callbacks to the pv version of pv-qspinlock, which makes
full use of virtualization.

There is a hash table; we store the cpu number into it, keyed by the lock.
So every time, pv_wait can learn who the lock holder is by looking up the
lock. We also store the lock in a per_cpu struct, and remove it once we own
the lock, so pv_wait can know which lock we are spinning on. But the cpu in
the hash table might not be the correct lock holder: for performance
reasons, we do not take care of hash conflicts.

Also introduce spin_lock_holder(), which tells who owns the lock now.
Currently the only user is spin_unlock_wait().

Signed-off-by: Pan Xinhui 
---
 arch/powerpc/include/asm/qspinlock.h   |  29 +++-
 arch/powerpc/include/asm/qspinlock_paravirt.h  |  36 +
 .../powerpc/include/asm/qspinlock_paravirt_types.h |  13 ++
 arch/powerpc/kernel/paravirt.c | 153 +
 arch/powerpc/lib/locks.c   |   8 +-
 arch/powerpc/platforms/pseries/setup.c |   5 +
 6 files changed, 241 insertions(+), 3 deletions(-)
 create mode 100644 arch/powerpc/include/asm/qspinlock_paravirt.h
 create mode 100644 arch/powerpc/include/asm/qspinlock_paravirt_types.h
 create mode 100644 arch/powerpc/kernel/paravirt.c

diff --git a/arch/powerpc/include/asm/qspinlock.h 
b/arch/powerpc/include/asm/qspinlock.h
index 4c89256..8fd6349 100644
--- a/arch/powerpc/include/asm/qspinlock.h
+++ b/arch/powerpc/include/asm/qspinlock.h
@@ -15,7 +15,7 @@ static inline u8 *__qspinlock_lock_byte(struct qspinlock 
*lock)
return (u8 *)lock + 3 * IS_BUILTIN(CONFIG_CPU_BIG_ENDIAN);
 }
 
-static inline void queued_spin_unlock(struct qspinlock *lock)
+static inline void native_queued_spin_unlock(struct qspinlock *lock)
 {
/* release semantics is required */
smp_store_release(__qspinlock_lock_byte(lock), 0);
@@ -27,6 +27,33 @@ static inline int queued_spin_is_locked(struct qspinlock 
*lock)
return atomic_read(&lock->val);
 }
 
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+#include 
+/*
+ * Try to find out who the lock holder is; the answer is not always accurate.
+ * Return:
+ * -1, we do not know the lock holder.
+ * any other value, likely the lock holder.
+ */
+extern int spin_lock_holder(void *lock);
+
+static inline void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
+{
+   pv_queued_spin_lock(lock, val);
+}
+
+static inline void queued_spin_unlock(struct qspinlock *lock)
+{
+   pv_queued_spin_unlock(lock);
+}
+#else
+#define spin_lock_holder(l) (-1)
+static inline void queued_spin_unlock(struct qspinlock *lock)
+{
+   native_queued_spin_unlock(lock);
+}
+#endif
+
 #include 
 
 /* we need to override these as ppc has io_sync handling */
diff --git a/arch/powerpc/include/asm/qspinlock_paravirt.h 
b/arch/powerpc/include/asm/qspinlock_paravirt.h
new file mode 100644
index 000..d87cda0
--- /dev/null
+++ b/arch/powerpc/include/asm/qspinlock_paravirt.h
@@ -0,0 +1,36 @@
+#ifndef CONFIG_PARAVIRT_SPINLOCKS
+#error "do not include this file"
+#endif
+
+#ifndef _ASM_QSPINLOCK_PARAVIRT_H
+#define _ASM_QSPINLOCK_PARAVIRT_H
+
+#include  
+
+extern void pv_lock_init(void);
+extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
+extern void __pv_init_lock_hash(void);
+extern void __pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
+extern void __pv_queued_spin_unlock(struct qspinlock *lock);
+
+static inline void pv_queued_spin_lock(struct qspinlock *lock, u32 val)
+{
+   pv_lock_op.lock(lock, val);
+}
+
+static inline void pv_queued_spin_unlock(struct qspinlock *lock)
+{
+   pv_lock_op.unlock(lock);
+}
+
+static inline void pv_wait(u8 *ptr, u8 val)
+{
+   pv_lock_op.wait(ptr, val);
+}
+
+static inline void pv_kick(int cpu)
+{
+   pv_lock_op.kick(cpu);
+}
+
+#endif
diff --git a/arch/powerpc/include/asm/qspinlock_paravirt_types.h 
b/arch/powerpc/include/asm/qspinlock_paravirt_types.h
new file mode 100644
index 000..83611ed
--- /dev/null
+++ b/arch/powerpc/include/asm/qspinlock_paravirt_types.h
@@ -0,0 +1,13 @@
+#ifndef _ASM_QSPINLOCK_PARAVIRT_TYPES_H
+#define _ASM_QSPINLOCK_PARAVIRT_TYPES_H
+
+struct pv_lock_ops {
+   void (*lock)(struct qspinlock *lock, u32 val);
+   void (*unlock)(struct qspinlock *lock);
+   void (*wait)(u8 *ptr, u8 val);
+   void (*kick)(int cpu);
+};
+
+extern struct pv_lock_ops pv_lock_op;
+
+#endif
diff --git a/arch/powerpc/kernel/paravirt.c b/arch/powerpc/kernel/paravirt.c
new file mode 100644
index 000..e697b17
--- /dev/null
+++ b/arch/powerpc/kernel/paravirt.c
@@ -0,0 +1,153 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * 
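
The rest of paravirt.c is truncated in this archive. For the hash table the
commit message describes, an editorial sketch of the idea (every name here
is hypothetical; collisions are tolerated by design, so a lookup is only
"likely" correct, exactly as the commit message warns):

        #define HOLDER_HASH_BITS        8

        static int holder_hash[1 << HOLDER_HASH_BITS];

        static unsigned int hash_lock_ptr(void *lock)
        {
                return ((unsigned long)lock >> 4) & ((1 << HOLDER_HASH_BITS) - 1);
        }

        /* after we take the lock: remember "this cpu holds this lock" */
        static void record_holder(void *lock, int cpu)
        {
                holder_hash[hash_lock_ptr(lock)] = cpu;
        }

        /* best effort: a colliding lock may have overwritten the slot */
        static int likely_holder(void *lock)
        {
                return holder_hash[hash_lock_ptr(lock)];
        }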

[PATCH v8 5/6] powerpc: pSeries: Add pv-qspinlock build config/make

2016-12-05 Thread Pan Xinhui
pSeries runs as a guest and might need pv-qspinlock.

Signed-off-by: Pan Xinhui 
---
 arch/powerpc/kernel/Makefile   | 1 +
 arch/powerpc/platforms/pseries/Kconfig | 8 
 2 files changed, 9 insertions(+)

diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 1925341..4780415 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -53,6 +53,7 @@ obj-$(CONFIG_PPC_970_NAP) += idle_power4.o
 obj-$(CONFIG_PPC_P7_NAP)   += idle_book3s.o
 procfs-y   := proc_powerpc.o
 obj-$(CONFIG_PROC_FS)  += $(procfs-y)
+obj-$(CONFIG_PARAVIRT_SPINLOCKS)   += paravirt.o
 rtaspci-$(CONFIG_PPC64)-$(CONFIG_PCI)  := rtas_pci.o
 obj-$(CONFIG_PPC_RTAS) += rtas.o rtas-rtc.o $(rtaspci-y-y)
 obj-$(CONFIG_PPC_RTAS_DAEMON)  += rtasd.o
diff --git a/arch/powerpc/platforms/pseries/Kconfig 
b/arch/powerpc/platforms/pseries/Kconfig
index 8a87d06..0288c78 100644
--- a/arch/powerpc/platforms/pseries/Kconfig
+++ b/arch/powerpc/platforms/pseries/Kconfig
@@ -31,6 +31,14 @@ config ARCH_USE_QUEUED_SPINLOCKS
  fairlock.  It has shown a good performance improvement on x86 and 
also ppc
  especially in high contention cases.
 
+config PARAVIRT_SPINLOCKS
+   bool "Paravirtialization support for qspinlock"
+   depends on PPC_SPLPAR && QUEUED_SPINLOCKS
+   default y
+   help
+ If kernel need run as a guest then enable this option.
+ Generally it can let kernel have a better performace.
+
 config PPC_SPLPAR
depends on PPC_PSERIES
bool "Support for shared-processor logical partitions"
-- 
2.4.11



[PATCH v8 6/6] powerpc/pv-qspinlock: Optimise native unlock path

2016-12-05 Thread Pan Xinhui
Avoid a function call in the native version of qspinlock. On powerNV,
before applying this patch, every unlock was expensive. This small
optimization enhances the performance.

We use a static_key with jump_label, which removes unnecessary loads of
the lppaca and related data.

Signed-off-by: Pan Xinhui 
---
 arch/powerpc/include/asm/qspinlock_paravirt.h | 18 +-
 arch/powerpc/kernel/paravirt.c|  4 
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/qspinlock_paravirt.h 
b/arch/powerpc/include/asm/qspinlock_paravirt.h
index d87cda0..8d39446 100644
--- a/arch/powerpc/include/asm/qspinlock_paravirt.h
+++ b/arch/powerpc/include/asm/qspinlock_paravirt.h
@@ -6,12 +6,14 @@
 #define _ASM_QSPINLOCK_PARAVIRT_H
 
 #include  
+#include  
 
 extern void pv_lock_init(void);
 extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
 extern void __pv_init_lock_hash(void);
 extern void __pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
 extern void __pv_queued_spin_unlock(struct qspinlock *lock);
+extern struct static_key_true sharedprocessor_key;
 
 static inline void pv_queued_spin_lock(struct qspinlock *lock, u32 val)
 {
@@ -20,7 +22,21 @@ static inline void pv_queued_spin_lock(struct qspinlock 
*lock, u32 val)
 
 static inline void pv_queued_spin_unlock(struct qspinlock *lock)
 {
-   pv_lock_op.unlock(lock);
+   /*
+* on powerNV and pSeries with jump_label, the generated code will be
+*  PowerNV:        pSeries:
+*  nop;            b 2f;
+*  native unlock   2:
+*                  pv unlock;
+* In this way, we can unlock quickly in the native case.
+*
+* If jump_label is not enabled, we fall back to the
+* if condition, IOW, ld && cmp && bne.
+*/
+   if (static_branch_likely(&sharedprocessor_key))
+   native_queued_spin_unlock(lock);
+   else
+   pv_lock_op.unlock(lock);
 }
 
 static inline void pv_wait(u8 *ptr, u8 val)
diff --git a/arch/powerpc/kernel/paravirt.c b/arch/powerpc/kernel/paravirt.c
index e697b17..a0a000e 100644
--- a/arch/powerpc/kernel/paravirt.c
+++ b/arch/powerpc/kernel/paravirt.c
@@ -140,6 +140,9 @@ struct pv_lock_ops pv_lock_op = {
 };
 EXPORT_SYMBOL(pv_lock_op);
 
+struct static_key_true sharedprocessor_key = STATIC_KEY_TRUE_INIT;
+EXPORT_SYMBOL(sharedprocessor_key);
+
 void __init pv_lock_init(void)
 {
if (SHARED_PROCESSOR) {
@@ -149,5 +152,6 @@ void __init pv_lock_init(void)
pv_lock_op.unlock = __pv_queued_spin_unlock;
pv_lock_op.wait = __pv_wait;
pv_lock_op.kick = __pv_kick;
+   static_branch_disable(&sharedprocessor_key);
}
 }
-- 
2.4.11
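
A note for readers: despite its name, sharedprocessor_key defaults to true
and is disabled in pv_lock_init() only when SHARED_PROCESSOR holds, so "key
is true" selects the native unlock fast path. A condensed view of the
resulting flow (editorial restatement, not from the patch):

        /* boot, shared-processor guest only */
        static_branch_disable(&sharedprocessor_key);

        /* every unlock */
        if (static_branch_likely(&sharedprocessor_key))
                native_queued_spin_unlock(lock);  /* native: patched nop */
        else
                pv_lock_op.unlock(lock);          /* shared-processor guest */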



Re: [PATCH v8 1/6] powerpc/qspinlock: powerpc support qspinlock

2016-12-05 Thread Pan Xinhui

correct Waiman's address.

On 2016/12/6 08:47, Boqun Feng wrote:

On Mon, Dec 05, 2016 at 10:19:21AM -0500, Pan Xinhui wrote:

This patch adds basic code to enable qspinlock on powerpc. qspinlock is
one kind of fair-lock implementation, and some performance improvement has
been seen under some scenarios.

queued_spin_unlock() releases the lock with just one write of 0 to the
::locked field, which sits at a different place on big- and little-endian
systems.

We override some arch_spin_XXX functions, as powerpc has io_sync handling
which makes sure the io operations are protected by the lock correctly.

There is another special case, see commit
2c610022711 ("locking/qspinlock: Fix spin_unlock_wait() some more")

Signed-off-by: Pan Xinhui 
---
 arch/powerpc/include/asm/qspinlock.h  | 66 +++
 arch/powerpc/include/asm/spinlock.h   | 31 +--
 arch/powerpc/include/asm/spinlock_types.h |  4 ++
 arch/powerpc/lib/locks.c  | 59 +++
 4 files changed, 147 insertions(+), 13 deletions(-)
 create mode 100644 arch/powerpc/include/asm/qspinlock.h

diff --git a/arch/powerpc/include/asm/qspinlock.h 
b/arch/powerpc/include/asm/qspinlock.h
new file mode 100644
index 000..4c89256
--- /dev/null
+++ b/arch/powerpc/include/asm/qspinlock.h
@@ -0,0 +1,66 @@
+#ifndef _ASM_POWERPC_QSPINLOCK_H
+#define _ASM_POWERPC_QSPINLOCK_H
+
+#include 
+
+#define SPIN_THRESHOLD (1 << 15)
+#define queued_spin_unlock queued_spin_unlock
+#define queued_spin_is_locked queued_spin_is_locked
+#define queued_spin_unlock_wait queued_spin_unlock_wait
+
+extern void queued_spin_unlock_wait(struct qspinlock *lock);
+
+static inline u8 *__qspinlock_lock_byte(struct qspinlock *lock)
+{
+   return (u8 *)lock + 3 * IS_BUILTIN(CONFIG_CPU_BIG_ENDIAN);
+}
+
+static inline void queued_spin_unlock(struct qspinlock *lock)
+{
+   /* release semantics is required */
+   smp_store_release(__qspinlock_lock_byte(lock), 0);
+}
+
+static inline int queued_spin_is_locked(struct qspinlock *lock)
+{
+   smp_mb();
+   return atomic_read(&lock->val);
+}
+
+#include 
+
+/* we need to override these as ppc has io_sync handling */
+#undef arch_spin_trylock
+#undef arch_spin_lock
+#undef arch_spin_lock_flags
+#undef arch_spin_unlock
+#define arch_spin_trylock arch_spin_trylock
+#define arch_spin_lock arch_spin_lock
+#define arch_spin_lock_flags arch_spin_lock_flags
+#define arch_spin_unlock arch_spin_unlock
+
+static inline int arch_spin_trylock(arch_spinlock_t *lock)
+{
+   CLEAR_IO_SYNC;
+   return queued_spin_trylock(lock);
+}
+
+static inline void arch_spin_lock(arch_spinlock_t *lock)
+{
+   CLEAR_IO_SYNC;
+   queued_spin_lock(lock);
+}
+
+static inline
+void arch_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags)
+{
+   CLEAR_IO_SYNC;
+   queued_spin_lock(lock);
+}
+
+static inline void arch_spin_unlock(arch_spinlock_t *lock)
+{
+   SYNC_IO;
+   queued_spin_unlock(lock);
+}
+#endif /* _ASM_POWERPC_QSPINLOCK_H */
diff --git a/arch/powerpc/include/asm/spinlock.h 
b/arch/powerpc/include/asm/spinlock.h
index 8c1b913..954099e 100644
--- a/arch/powerpc/include/asm/spinlock.h
+++ b/arch/powerpc/include/asm/spinlock.h
@@ -60,6 +60,23 @@ static inline bool vcpu_is_preempted(int cpu)
 }
 #endif

+#if defined(CONFIG_PPC_SPLPAR)
+/* We only yield to the hypervisor if we are in shared processor mode */
+#define SHARED_PROCESSOR (lppaca_shared_proc(local_paca->lppaca_ptr))
+extern void __spin_yield(arch_spinlock_t *lock);
+extern void __rw_yield(arch_rwlock_t *lock);
+#else /* SPLPAR */
+#define __spin_yield(x)barrier()
+#define __rw_yield(x)  barrier()
+#define SHARED_PROCESSOR   0
+#endif
+
+#ifdef CONFIG_QUEUED_SPINLOCKS
+#include 
+#else
+
+#define arch_spin_relax(lock)  __spin_yield(lock)
+
 static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock)
 {
return lock.slock == 0;
@@ -114,18 +131,6 @@ static inline int arch_spin_trylock(arch_spinlock_t *lock)
  * held.  Conveniently, we have a word in the paca that holds this
  * value.
  */
-
-#if defined(CONFIG_PPC_SPLPAR)
-/* We only yield to the hypervisor if we are in shared processor mode */
-#define SHARED_PROCESSOR (lppaca_shared_proc(local_paca->lppaca_ptr))
-extern void __spin_yield(arch_spinlock_t *lock);
-extern void __rw_yield(arch_rwlock_t *lock);
-#else /* SPLPAR */
-#define __spin_yield(x)barrier()
-#define __rw_yield(x)  barrier()
-#define SHARED_PROCESSOR   0
-#endif
-
 static inline void arch_spin_lock(arch_spinlock_t *lock)
 {
CLEAR_IO_SYNC;
@@ -203,6 +208,7 @@ static inline void arch_spin_unlock_wait(arch_spinlock_t 
*lock)
smp_mb();
 }

+#endif /* !CONFIG_QUEUED_SPINLOCKS */
 /*
  * Read-write spinlocks, allowing multiple readers
  * but only one writer.
@@ -338,7 +344,6 @@ static inline void arch_write_unlock(arch_rwlock_t *rw)
 #define arch_read_lock_flags(lock, flags) arch_read_lock(lock)
 #define arch_write_lock_

Re: [PATCH v8 2/6] powerpc: pSeries/Kconfig: Add qspinlock build config

2016-12-05 Thread Pan Xinhui



On 2016/12/6 08:58, Boqun Feng wrote:

On Mon, Dec 05, 2016 at 10:19:22AM -0500, Pan Xinhui wrote:

pSeries/powerNV will use qspinlock from now on.

Signed-off-by: Pan Xinhui 
---
 arch/powerpc/platforms/pseries/Kconfig | 8 
 1 file changed, 8 insertions(+)

diff --git a/arch/powerpc/platforms/pseries/Kconfig 
b/arch/powerpc/platforms/pseries/Kconfig
index bec90fb..8a87d06 100644
--- a/arch/powerpc/platforms/pseries/Kconfig
+++ b/arch/powerpc/platforms/pseries/Kconfig


Why here? Not arch/powerpc/platforms/Kconfig?


@@ -23,6 +23,14 @@ config PPC_PSERIES
select PPC_DOORBELL
default y

+config ARCH_USE_QUEUED_SPINLOCKS
+   default y
+   bool "Enable qspinlock"


I think you just enable qspinlock by default for all PPC platforms. I
guess you need to put

depends on PPC_PSERIES || PPC_POWERNV

here to achieve what you mean in you commit message.


yes, that's another good way.
I prefer to put it in pseries/Kconfig, the same as the pv-qspinlock config.
When we build for powerNV, it still includes pSeries's Kconfig anyway.

thanks
xinhui


Regards,
Boqun


+   help
+ Enabling this option will let the kernel use qspinlock, which is a kind
+ of fair lock.  It has shown a good performance improvement on x86 and
+ also ppc, especially in high contention cases.
+
 config PPC_SPLPAR
depends on PPC_PSERIES
bool "Support for shared-processor logical partitions"
--
2.4.11





Re: [PATCH v8 3/6] powerpc: lib/locks.c: Add cpu yield/wake helper function

2016-12-05 Thread Pan Xinhui



On 2016/12/6 09:23, Boqun Feng wrote:

On Mon, Dec 05, 2016 at 10:19:23AM -0500, Pan Xinhui wrote:

Add two corresponding helper functions to support pv-qspinlock.

For normal use, __spin_yield_cpu() confers the current vcpu's slices to the
target vcpu (say, a lock holder). If the target vcpu is not specified, or it
is in the running state, whether such conferring to the lpar happens depends
on the second parameter, *confer*.

Because the hcall itself introduces latency and a little overhead, we do NOT
want to suffer any latency in some cases, e.g. in an interrupt handler. The
second parameter *confer* can indicate such a case.

__spin_wake_cpu() is simpler; it wakes up one vcpu regardless of that vcpu's
current state.

Signed-off-by: Pan Xinhui 
---
 arch/powerpc/include/asm/spinlock.h |  4 +++
 arch/powerpc/lib/locks.c| 59 +
 2 files changed, 63 insertions(+)

diff --git a/arch/powerpc/include/asm/spinlock.h 
b/arch/powerpc/include/asm/spinlock.h
index 954099e..6426bd5 100644
--- a/arch/powerpc/include/asm/spinlock.h
+++ b/arch/powerpc/include/asm/spinlock.h
@@ -64,9 +64,13 @@ static inline bool vcpu_is_preempted(int cpu)
 /* We only yield to the hypervisor if we are in shared processor mode */
 #define SHARED_PROCESSOR (lppaca_shared_proc(local_paca->lppaca_ptr))
 extern void __spin_yield(arch_spinlock_t *lock);
+extern void __spin_yield_cpu(int cpu, int confer);
+extern void __spin_wake_cpu(int cpu);
 extern void __rw_yield(arch_rwlock_t *lock);
 #else /* SPLPAR */
 #define __spin_yield(x)barrier()
+#define __spin_yield_cpu(x, y) barrier()
+#define __spin_wake_cpu(x) barrier()
 #define __rw_yield(x)  barrier()
 #define SHARED_PROCESSOR   0
 #endif
diff --git a/arch/powerpc/lib/locks.c b/arch/powerpc/lib/locks.c
index 6574626..bd872c9 100644
--- a/arch/powerpc/lib/locks.c
+++ b/arch/powerpc/lib/locks.c
@@ -23,6 +23,65 @@
 #include 
 #include 

+/*
+ * Confer our slices to the specified cpu and return. If that cpu is in the
+ * running state, or cpu is -1, then we check *confer*: if confer is zero we
+ * return, otherwise we confer our slices to the lpar.
+ */
+void __spin_yield_cpu(int cpu, int confer)
+{
+   unsigned int holder_cpu = cpu, yield_count;


As I said at:

https://marc.info/?l=linux-kernel&m=147455748619343&w=2

@holder_cpu is not necessary and doesn't help anything.


+
+   if (cpu == -1)
+   goto yield_to_lpar;
+
+   BUG_ON(holder_cpu >= nr_cpu_ids);
+   yield_count = be32_to_cpu(lppaca_of(holder_cpu).yield_count);
+
+   /* if the cpu is running, confer slices to the lpar conditionally */
+   if ((yield_count & 1) == 0)
+   goto yield_to_lpar;
+
+   plpar_hcall_norets(H_CONFER,
+   get_hard_smp_processor_id(holder_cpu), yield_count);
+   return;
+
+yield_to_lpar:
+   if (confer)
+   plpar_hcall_norets(H_CONFER, -1, 0);
+}
+EXPORT_SYMBOL_GPL(__spin_yield_cpu);
+
+void __spin_wake_cpu(int cpu)
+{
+   unsigned int holder_cpu = cpu;


And it's even wrong to call the parameter of _wake_cpu() a holder_cpu,
because it's not the current lock holder.


oh, its name is really misleading.

thanks


Regards,
Boqun


+
+   BUG_ON(holder_cpu >= nr_cpu_ids);
+   /*
+* NOTE: we should always do this hcall regardless of
+* the yield_count of the holder_cpu.
+* as there might be a case like below:
+*  CPU 1   CPU 2
+*  yielded = true
+* if (yielded)
+*  __spin_wake_cpu()
+*  __spin_yield_cpu()
+*
+* So we might lose a wake if we check the yield_count and
+* return directly if the holder_cpu is running.
+* IOW. do NOT code like below.
+*  yield_count = be32_to_cpu(lppaca_of(holder_cpu).yield_count);
+*  if ((yield_count & 1) == 0)
+*  return;
+*
+* a PROD hcall marks the target_cpu prodded, which causes the next cede
+* or confer called on the target_cpu to be invalid.
+*/
+   plpar_hcall_norets(H_PROD,
+   get_hard_smp_processor_id(holder_cpu));
+}
+EXPORT_SYMBOL_GPL(__spin_wake_cpu);
+
 #ifndef CONFIG_QUEUED_SPINLOCKS
 void __spin_yield(arch_spinlock_t *lock)
 {
--
2.4.11





Re: [PATCH v8 2/6] powerpc: pSeries/Kconfig: Add qspinlock build config

2016-12-05 Thread Pan Xinhui



On 2016/12/6 09:24, Pan Xinhui wrote:



On 2016/12/6 08:58, Boqun Feng wrote:

On Mon, Dec 05, 2016 at 10:19:22AM -0500, Pan Xinhui wrote:

pSeries/powerNV will use qspinlock from now on.

Signed-off-by: Pan Xinhui 
---
 arch/powerpc/platforms/pseries/Kconfig | 8 
 1 file changed, 8 insertions(+)

diff --git a/arch/powerpc/platforms/pseries/Kconfig 
b/arch/powerpc/platforms/pseries/Kconfig
index bec90fb..8a87d06 100644
--- a/arch/powerpc/platforms/pseries/Kconfig
+++ b/arch/powerpc/platforms/pseries/Kconfig


Why here? Not arch/powerpc/platforms/Kconfig?


@@ -23,6 +23,14 @@ config PPC_PSERIES
 select PPC_DOORBELL
 default y

+config ARCH_USE_QUEUED_SPINLOCKS
+default y
+bool "Enable qspinlock"


I think you just enable qspinlock by default for all PPC platforms. I
guess you need to put

depends on PPC_PSERIES || PPC_POWERNV

here to achieve what you mean in you commit message.


oh, yes, it needs a depends on PPC_PSERIES || PPC_POWERNV.


yes, that's another good way.
I prefer to put it in pseries/Kconfig, the same as the pv-qspinlock config.
When we build for powerNV, it still includes pSeries's Kconfig anyway.

thanks
xinhui


Regards,
Boqun


+help
+  Enabling this option will let the kernel use qspinlock, which is a kind
+  of fair lock.  It has shown a good performance improvement on x86 and
+  also ppc, especially in high contention cases.
+
 config PPC_SPLPAR
 depends on PPC_PSERIES
 bool "Support for shared-processor logical partitions"
--
2.4.11





[PATCH v9 0/6] Implement qspinlock/pv-qspinlock on ppc

2016-12-06 Thread Pan Xinhui
bufsize 8000 maxblocks           2851.7   2838.1   2785.5
Pipe Throughput                  1221.9   1265.3   1250.4
Pipe-based Context Switching      529.8    578.1    564.2
Process Creation                  408.4    421.6    287.6
Shell Scripts (1 concurrent)     1201.8   1215.3   1185.8
Shell Scripts (8 concurrent)     3758.4   3799.3   3878.9
System Call Overhead             1008.3   1122.6   1134.2
                                 ======   ======   ======
System Benchmarks Index Score    1072.0   1108.9   1050.6
------------------------------------------------------------



Pan Xinhui (6):
  powerpc/qspinlock: powerpc support qspinlock
  powerpc: platforms/Kconfig: Add qspinlock build config
  powerpc: lib/locks.c: Add cpu yield/wake helper function
  powerpc/pv-qspinlock: powerpc support pv-qspinlock
  powerpc: pSeries: Add pv-qspinlock build config/make
  powerpc/pv-qspinlock: Optimise native unlock path

 arch/powerpc/include/asm/qspinlock.h   |  93 
 arch/powerpc/include/asm/qspinlock_paravirt.h  |  52 +++
 .../powerpc/include/asm/qspinlock_paravirt_types.h |  13 ++
 arch/powerpc/include/asm/spinlock.h|  35 +++--
 arch/powerpc/include/asm/spinlock_types.h  |   4 +
 arch/powerpc/kernel/Makefile   |   1 +
 arch/powerpc/kernel/paravirt.c | 157 +
 arch/powerpc/lib/locks.c   | 123 
 arch/powerpc/platforms/Kconfig |   9 ++
 arch/powerpc/platforms/pseries/Kconfig |   8 ++
 arch/powerpc/platforms/pseries/setup.c |   5 +
 11 files changed, 487 insertions(+), 13 deletions(-)
 create mode 100644 arch/powerpc/include/asm/qspinlock.h
 create mode 100644 arch/powerpc/include/asm/qspinlock_paravirt.h
 create mode 100644 arch/powerpc/include/asm/qspinlock_paravirt_types.h
 create mode 100644 arch/powerpc/kernel/paravirt.c

-- 
2.4.11



[PATCH v9 1/6] powerpc/qspinlock: powerpc support qspinlock

2016-12-06 Thread Pan Xinhui
This patch adds basic code to enable qspinlock on powerpc. qspinlock is
one kind of fair-lock implementation, and some performance improvement has
been seen under some scenarios.

queued_spin_unlock() releases the lock with just one write of 0 to the
::locked field, which sits at a different place on big- and little-endian
systems.

We override some arch_spin_XXX functions, as powerpc has io_sync handling
which makes sure the io operations are protected by the lock correctly.

There is another special case, see commit
2c610022711 ("locking/qspinlock: Fix spin_unlock_wait() some more")

Signed-off-by: Pan Xinhui 
---
 arch/powerpc/include/asm/qspinlock.h  | 66 +++
 arch/powerpc/include/asm/spinlock.h   | 31 +--
 arch/powerpc/include/asm/spinlock_types.h |  4 ++
 arch/powerpc/lib/locks.c  | 62 +
 4 files changed, 150 insertions(+), 13 deletions(-)
 create mode 100644 arch/powerpc/include/asm/qspinlock.h

diff --git a/arch/powerpc/include/asm/qspinlock.h 
b/arch/powerpc/include/asm/qspinlock.h
new file mode 100644
index 000..4c89256
--- /dev/null
+++ b/arch/powerpc/include/asm/qspinlock.h
@@ -0,0 +1,66 @@
+#ifndef _ASM_POWERPC_QSPINLOCK_H
+#define _ASM_POWERPC_QSPINLOCK_H
+
+#include 
+
+#define SPIN_THRESHOLD (1 << 15)
+#define queued_spin_unlock queued_spin_unlock
+#define queued_spin_is_locked queued_spin_is_locked
+#define queued_spin_unlock_wait queued_spin_unlock_wait
+
+extern void queued_spin_unlock_wait(struct qspinlock *lock);
+
+static inline u8 *__qspinlock_lock_byte(struct qspinlock *lock)
+{
+   return (u8 *)lock + 3 * IS_BUILTIN(CONFIG_CPU_BIG_ENDIAN);
+}
+
+static inline void queued_spin_unlock(struct qspinlock *lock)
+{
+   /* release semantics is required */
+   smp_store_release(__qspinlock_lock_byte(lock), 0);
+}
+
+static inline int queued_spin_is_locked(struct qspinlock *lock)
+{
+   smp_mb();
+   return atomic_read(&lock->val);
+}
+
+#include 
+
+/* we need to override these as ppc has io_sync handling */
+#undef arch_spin_trylock
+#undef arch_spin_lock
+#undef arch_spin_lock_flags
+#undef arch_spin_unlock
+#define arch_spin_trylock arch_spin_trylock
+#define arch_spin_lock arch_spin_lock
+#define arch_spin_lock_flags arch_spin_lock_flags
+#define arch_spin_unlock arch_spin_unlock
+
+static inline int arch_spin_trylock(arch_spinlock_t *lock)
+{
+   CLEAR_IO_SYNC;
+   return queued_spin_trylock(lock);
+}
+
+static inline void arch_spin_lock(arch_spinlock_t *lock)
+{
+   CLEAR_IO_SYNC;
+   queued_spin_lock(lock);
+}
+
+static inline
+void arch_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags)
+{
+   CLEAR_IO_SYNC;
+   queued_spin_lock(lock);
+}
+
+static inline void arch_spin_unlock(arch_spinlock_t *lock)
+{
+   SYNC_IO;
+   queued_spin_unlock(lock);
+}
+#endif /* _ASM_POWERPC_QSPINLOCK_H */
diff --git a/arch/powerpc/include/asm/spinlock.h 
b/arch/powerpc/include/asm/spinlock.h
index 8c1b913..954099e 100644
--- a/arch/powerpc/include/asm/spinlock.h
+++ b/arch/powerpc/include/asm/spinlock.h
@@ -60,6 +60,23 @@ static inline bool vcpu_is_preempted(int cpu)
 }
 #endif
 
+#if defined(CONFIG_PPC_SPLPAR)
+/* We only yield to the hypervisor if we are in shared processor mode */
+#define SHARED_PROCESSOR (lppaca_shared_proc(local_paca->lppaca_ptr))
+extern void __spin_yield(arch_spinlock_t *lock);
+extern void __rw_yield(arch_rwlock_t *lock);
+#else /* SPLPAR */
+#define __spin_yield(x)barrier()
+#define __rw_yield(x)  barrier()
+#define SHARED_PROCESSOR   0
+#endif
+
+#ifdef CONFIG_QUEUED_SPINLOCKS
+#include 
+#else
+
+#define arch_spin_relax(lock)  __spin_yield(lock)
+
 static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock)
 {
return lock.slock == 0;
@@ -114,18 +131,6 @@ static inline int arch_spin_trylock(arch_spinlock_t *lock)
  * held.  Conveniently, we have a word in the paca that holds this
  * value.
  */
-
-#if defined(CONFIG_PPC_SPLPAR)
-/* We only yield to the hypervisor if we are in shared processor mode */
-#define SHARED_PROCESSOR (lppaca_shared_proc(local_paca->lppaca_ptr))
-extern void __spin_yield(arch_spinlock_t *lock);
-extern void __rw_yield(arch_rwlock_t *lock);
-#else /* SPLPAR */
-#define __spin_yield(x)barrier()
-#define __rw_yield(x)  barrier()
-#define SHARED_PROCESSOR   0
-#endif
-
 static inline void arch_spin_lock(arch_spinlock_t *lock)
 {
CLEAR_IO_SYNC;
@@ -203,6 +208,7 @@ static inline void arch_spin_unlock_wait(arch_spinlock_t 
*lock)
smp_mb();
 }
 
+#endif /* !CONFIG_QUEUED_SPINLOCKS */
 /*
  * Read-write spinlocks, allowing multiple readers
  * but only one writer.
@@ -338,7 +344,6 @@ static inline void arch_write_unlock(arch_rwlock_t *rw)
 #define arch_read_lock_flags(lock, flags) arch_read_lock(lock)
 #define arch_write_lock_flags(lock, flags) arch_write_lock(lock)
 
-#define arch_spin_relax(lock)  __spin_yield(lock)
 #define arch_read_relax(loc
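
For context on the "io_sync stuff" these overrides preserve: at the time,
the ppc64 spinlock header defined the two macros roughly as below (quoted
here for reference, since the archive truncates the diff). The idea is that
an unlock must order any MMIO accesses made under the lock with a full sync
before releasing it:

        #define CLEAR_IO_SYNC   (get_paca()->io_sync = 0)
        #define SYNC_IO         do {                                         \
                                        if (unlikely(get_paca()->io_sync)) { \
                                                mb();                        \
                                                get_paca()->io_sync = 0;     \
                                        }                                    \
                                } while (0)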

[PATCH v9 2/6] powerpc: platforms/Kconfig: Add qspinlock build config

2016-12-06 Thread Pan Xinhui
pSeries/powerNV will use qspinlock from now on.

Signed-off-by: Pan Xinhui 
---
 arch/powerpc/platforms/Kconfig | 9 +
 1 file changed, 9 insertions(+)

diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig
index fbdae83..3559bbf 100644
--- a/arch/powerpc/platforms/Kconfig
+++ b/arch/powerpc/platforms/Kconfig
@@ -20,6 +20,15 @@ source "arch/powerpc/platforms/44x/Kconfig"
 source "arch/powerpc/platforms/40x/Kconfig"
 source "arch/powerpc/platforms/amigaone/Kconfig"
 
+config ARCH_USE_QUEUED_SPINLOCKS
+depends on PPC_PSERIES || PPC_POWERNV
+bool "Enable qspinlock"
+default y
+help
+ Enabling this option will let the kernel use qspinlock, which is a kind
+ of fair lock.  It has shown a good performance improvement on x86 and
+ also ppc, especially in high contention cases.
+
 config KVM_GUEST
bool "KVM Guest support"
default n
-- 
2.4.11



[PATCH v9 3/6] powerpc: lib/locks.c: Add cpu yield/wake helper function

2016-12-06 Thread Pan Xinhui
Add two corresponding helper functions to support pv-qspinlock.

For normal use, __spin_yield_cpu() confers the current vcpu's slices to the
target vcpu (say, a lock holder). If the target vcpu is not specified, or it
is in the running state, whether such conferring to the lpar happens depends
on the second parameter, *confer*.

Because the hcall itself introduces latency and a little overhead, we do NOT
want to suffer any latency in some cases, e.g. in an interrupt handler. The
second parameter *confer* can indicate such a case.

__spin_wake_cpu() is simpler; it wakes up one vcpu regardless of that vcpu's
current state.

Signed-off-by: Pan Xinhui 
---
 arch/powerpc/include/asm/spinlock.h |  4 +++
 arch/powerpc/lib/locks.c| 57 +
 2 files changed, 61 insertions(+)

diff --git a/arch/powerpc/include/asm/spinlock.h 
b/arch/powerpc/include/asm/spinlock.h
index 954099e..6426bd5 100644
--- a/arch/powerpc/include/asm/spinlock.h
+++ b/arch/powerpc/include/asm/spinlock.h
@@ -64,9 +64,13 @@ static inline bool vcpu_is_preempted(int cpu)
 /* We only yield to the hypervisor if we are in shared processor mode */
 #define SHARED_PROCESSOR (lppaca_shared_proc(local_paca->lppaca_ptr))
 extern void __spin_yield(arch_spinlock_t *lock);
+extern void __spin_yield_cpu(int cpu, int confer);
+extern void __spin_wake_cpu(int cpu);
 extern void __rw_yield(arch_rwlock_t *lock);
 #else /* SPLPAR */
 #define __spin_yield(x)barrier()
+#define __spin_yield_cpu(x, y) barrier()
+#define __spin_wake_cpu(x) barrier()
 #define __rw_yield(x)  barrier()
 #define SHARED_PROCESSOR   0
 #endif
diff --git a/arch/powerpc/lib/locks.c b/arch/powerpc/lib/locks.c
index 8f6dbb0..dff0bfa 100644
--- a/arch/powerpc/lib/locks.c
+++ b/arch/powerpc/lib/locks.c
@@ -23,6 +23,63 @@
 #include 
 #include 
 
+/*
+ * Confer our slices to the specified cpu and return. If that cpu is in the
+ * running state, or cpu is -1, then we check *confer*: if confer is zero we
+ * return, otherwise we confer our slices to the lpar.
+ */
+void __spin_yield_cpu(int cpu, int confer)
+{
+   unsigned int yield_count;
+
+   if (cpu == -1)
+   goto yield_to_lpar;
+
+   BUG_ON(cpu >= nr_cpu_ids);
+   yield_count = be32_to_cpu(lppaca_of(cpu).yield_count);
+
+   /* if the cpu is running, confer slices to the lpar conditionally */
+   if ((yield_count & 1) == 0)
+   goto yield_to_lpar;
+
+   plpar_hcall_norets(H_CONFER,
+   get_hard_smp_processor_id(cpu), yield_count);
+   return;
+
+yield_to_lpar:
+   if (confer)
+   plpar_hcall_norets(H_CONFER, -1, 0);
+}
+EXPORT_SYMBOL_GPL(__spin_yield_cpu);
+
+void __spin_wake_cpu(int cpu)
+{
+   BUG_ON(cpu >= nr_cpu_ids);
+   /*
+* NOTE: we should always do this hcall regardless of
+* the yield_count of the holder_cpu.
+* as there might be a case like below:
+*  CPU 1   CPU 2
+*  yielded = true
+* if (yielded)
+*  __spin_wake_cpu()
+*  __spin_yield_cpu()
+*
+* So we might lose a wake if we check the yield_count and
+* return directly if the holder_cpu is running.
+* IOW. do NOT code like below.
+*  yield_count = be32_to_cpu(lppaca_of(cpu).yield_count);
+*  if ((yield_count & 1) == 0)
+*  return;
+*
+* a PROD hcall marks the target_cpu prodded, which causes the next cede
+* or confer called on the target_cpu to be invalid.
+*/
+   plpar_hcall_norets(H_PROD,
+   get_hard_smp_processor_id(cpu));
+}
+EXPORT_SYMBOL_GPL(__spin_wake_cpu);
+
 #ifndef CONFIG_QUEUED_SPINLOCKS
 void __spin_yield(arch_spinlock_t *lock)
 {
-- 
2.4.11



[PATCH v9 4/6] powerpc/pv-qspinlock: powerpc support pv-qspinlock

2016-12-06 Thread Pan Xinhui
The default pv-qspinlock uses the native version of qspinlock.
pv_lock initialization should be done at boot stage with irqs disabled.
And if we run as a guest in powerKVM/pHyp shared_processor mode, restore
the pv_lock_ops callbacks to the pv version of pv-qspinlock, which makes
full use of virtualization.

There is a hash table; we store the cpu number into it, keyed by the lock.
So every time, pv_wait can learn who the lock holder is by looking up the
lock. We also store the lock in a per_cpu struct, and remove it once we own
the lock, so pv_wait can know which lock we are spinning on. But the cpu in
the hash table might not be the correct lock holder: for performance
reasons, we do not take care of hash conflicts.

Also introduce spin_lock_holder(), which tells who owns the lock now.
Currently the only user is spin_unlock_wait().

Signed-off-by: Pan Xinhui 
---
 arch/powerpc/include/asm/qspinlock.h   |  29 +++-
 arch/powerpc/include/asm/qspinlock_paravirt.h  |  36 +
 .../powerpc/include/asm/qspinlock_paravirt_types.h |  13 ++
 arch/powerpc/kernel/paravirt.c | 153 +
 arch/powerpc/lib/locks.c   |   8 +-
 arch/powerpc/platforms/pseries/setup.c |   5 +
 6 files changed, 241 insertions(+), 3 deletions(-)
 create mode 100644 arch/powerpc/include/asm/qspinlock_paravirt.h
 create mode 100644 arch/powerpc/include/asm/qspinlock_paravirt_types.h
 create mode 100644 arch/powerpc/kernel/paravirt.c

diff --git a/arch/powerpc/include/asm/qspinlock.h 
b/arch/powerpc/include/asm/qspinlock.h
index 4c89256..8fd6349 100644
--- a/arch/powerpc/include/asm/qspinlock.h
+++ b/arch/powerpc/include/asm/qspinlock.h
@@ -15,7 +15,7 @@ static inline u8 *__qspinlock_lock_byte(struct qspinlock 
*lock)
return (u8 *)lock + 3 * IS_BUILTIN(CONFIG_CPU_BIG_ENDIAN);
 }
 
-static inline void queued_spin_unlock(struct qspinlock *lock)
+static inline void native_queued_spin_unlock(struct qspinlock *lock)
 {
/* release semantics is required */
smp_store_release(__qspinlock_lock_byte(lock), 0);
@@ -27,6 +27,33 @@ static inline int queued_spin_is_locked(struct qspinlock 
*lock)
return atomic_read(&lock->val);
 }
 
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+#include 
+/*
+ * Try to find out who the lock holder is; the answer is not always accurate.
+ * Return:
+ * -1, we do not know the lock holder.
+ * any other value, likely the lock holder.
+ */
+extern int spin_lock_holder(void *lock);
+
+static inline void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
+{
+   pv_queued_spin_lock(lock, val);
+}
+
+static inline void queued_spin_unlock(struct qspinlock *lock)
+{
+   pv_queued_spin_unlock(lock);
+}
+#else
+#define spin_lock_holder(l) (-1)
+static inline void queued_spin_unlock(struct qspinlock *lock)
+{
+   native_queued_spin_unlock(lock);
+}
+#endif
+
 #include 
 
 /* we need to override these as ppc has io_sync handling */
diff --git a/arch/powerpc/include/asm/qspinlock_paravirt.h 
b/arch/powerpc/include/asm/qspinlock_paravirt.h
new file mode 100644
index 000..d87cda0
--- /dev/null
+++ b/arch/powerpc/include/asm/qspinlock_paravirt.h
@@ -0,0 +1,36 @@
+#ifndef CONFIG_PARAVIRT_SPINLOCKS
+#error "do not include this file"
+#endif
+
+#ifndef _ASM_QSPINLOCK_PARAVIRT_H
+#define _ASM_QSPINLOCK_PARAVIRT_H
+
+#include  
+
+extern void pv_lock_init(void);
+extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
+extern void __pv_init_lock_hash(void);
+extern void __pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
+extern void __pv_queued_spin_unlock(struct qspinlock *lock);
+
+static inline void pv_queued_spin_lock(struct qspinlock *lock, u32 val)
+{
+   pv_lock_op.lock(lock, val);
+}
+
+static inline void pv_queued_spin_unlock(struct qspinlock *lock)
+{
+   pv_lock_op.unlock(lock);
+}
+
+static inline void pv_wait(u8 *ptr, u8 val)
+{
+   pv_lock_op.wait(ptr, val);
+}
+
+static inline void pv_kick(int cpu)
+{
+   pv_lock_op.kick(cpu);
+}
+
+#endif
diff --git a/arch/powerpc/include/asm/qspinlock_paravirt_types.h 
b/arch/powerpc/include/asm/qspinlock_paravirt_types.h
new file mode 100644
index 000..83611ed
--- /dev/null
+++ b/arch/powerpc/include/asm/qspinlock_paravirt_types.h
@@ -0,0 +1,13 @@
+#ifndef _ASM_QSPINLOCK_PARAVIRT_TYPES_H
+#define _ASM_QSPINLOCK_PARAVIRT_TYPES_H
+
+struct pv_lock_ops {
+   void (*lock)(struct qspinlock *lock, u32 val);
+   void (*unlock)(struct qspinlock *lock);
+   void (*wait)(u8 *ptr, u8 val);
+   void (*kick)(int cpu);
+};
+
+extern struct pv_lock_ops pv_lock_op;
+
+#endif
diff --git a/arch/powerpc/kernel/paravirt.c b/arch/powerpc/kernel/paravirt.c
new file mode 100644
index 000..e697b17
--- /dev/null
+++ b/arch/powerpc/kernel/paravirt.c
@@ -0,0 +1,153 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * 

[PATCH v9 5/6] powerpc: pSeries: Add pv-qspinlock build config/make

2016-12-06 Thread Pan Xinhui
pSeries runs as a guest and might need pv-qspinlock.

Signed-off-by: Pan Xinhui 
---
 arch/powerpc/kernel/Makefile   | 1 +
 arch/powerpc/platforms/pseries/Kconfig | 8 
 2 files changed, 9 insertions(+)

diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 1925341..4780415 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -53,6 +53,7 @@ obj-$(CONFIG_PPC_970_NAP) += idle_power4.o
 obj-$(CONFIG_PPC_P7_NAP)   += idle_book3s.o
 procfs-y   := proc_powerpc.o
 obj-$(CONFIG_PROC_FS)  += $(procfs-y)
+obj-$(CONFIG_PARAVIRT_SPINLOCKS)   += paravirt.o
 rtaspci-$(CONFIG_PPC64)-$(CONFIG_PCI)  := rtas_pci.o
 obj-$(CONFIG_PPC_RTAS) += rtas.o rtas-rtc.o $(rtaspci-y-y)
 obj-$(CONFIG_PPC_RTAS_DAEMON)  += rtasd.o
diff --git a/arch/powerpc/platforms/pseries/Kconfig 
b/arch/powerpc/platforms/pseries/Kconfig
index bec90fb..c9cc064 100644
--- a/arch/powerpc/platforms/pseries/Kconfig
+++ b/arch/powerpc/platforms/pseries/Kconfig
@@ -33,6 +33,14 @@ config PPC_SPLPAR
  processors, that is, which share physical processors between
  two or more partitions.
 
+config PARAVIRT_SPINLOCKS
+   bool "Paravirtialization support for qspinlock"
+   depends on PPC_SPLPAR && QUEUED_SPINLOCKS
+   default y
+   help
+ If kernel need run as a guest then enable this option.
+ Generally it can let kernel have a better performace.
+
 config DTL
bool "Dispatch Trace Log"
depends on PPC_SPLPAR && DEBUG_FS
-- 
2.4.11



[PATCH v9 6/6] powerpc/pv-qspinlock: Optimise native unlock path

2016-12-06 Thread Pan Xinhui
Avoid a function call in the native version of qspinlock. On powerNV,
before applying this patch, every unlock was expensive. This small
optimization enhances the performance.

We use a static_key with jump_label, which removes unnecessary loads of
the lppaca and related data.

Signed-off-by: Pan Xinhui 
---
 arch/powerpc/include/asm/qspinlock_paravirt.h | 18 +-
 arch/powerpc/kernel/paravirt.c|  4 
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/qspinlock_paravirt.h 
b/arch/powerpc/include/asm/qspinlock_paravirt.h
index d87cda0..8d39446 100644
--- a/arch/powerpc/include/asm/qspinlock_paravirt.h
+++ b/arch/powerpc/include/asm/qspinlock_paravirt.h
@@ -6,12 +6,14 @@
 #define _ASM_QSPINLOCK_PARAVIRT_H
 
 #include  
+#include  
 
 extern void pv_lock_init(void);
 extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
 extern void __pv_init_lock_hash(void);
 extern void __pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
 extern void __pv_queued_spin_unlock(struct qspinlock *lock);
+extern struct static_key_true sharedprocessor_key;
 
 static inline void pv_queued_spin_lock(struct qspinlock *lock, u32 val)
 {
@@ -20,7 +22,21 @@ static inline void pv_queued_spin_lock(struct qspinlock 
*lock, u32 val)
 
 static inline void pv_queued_spin_unlock(struct qspinlock *lock)
 {
-   pv_lock_op.unlock(lock);
+   /*
+* on powerNV and pSeries with jump_label, the generated code will be
+*  PowerNV:        pSeries:
+*  nop;            b 2f;
+*  native unlock   2:
+*                  pv unlock;
+* In this way, we can unlock quickly in the native case.
+*
+* If jump_label is not enabled, we fall back to the
+* if condition, IOW, ld && cmp && bne.
+*/
+   if (static_branch_likely(&sharedprocessor_key))
+   native_queued_spin_unlock(lock);
+   else
+   pv_lock_op.unlock(lock);
 }
 
 static inline void pv_wait(u8 *ptr, u8 val)
diff --git a/arch/powerpc/kernel/paravirt.c b/arch/powerpc/kernel/paravirt.c
index e697b17..a0a000e 100644
--- a/arch/powerpc/kernel/paravirt.c
+++ b/arch/powerpc/kernel/paravirt.c
@@ -140,6 +140,9 @@ struct pv_lock_ops pv_lock_op = {
 };
 EXPORT_SYMBOL(pv_lock_op);
 
+struct static_key_true sharedprocessor_key = STATIC_KEY_TRUE_INIT;
+EXPORT_SYMBOL(sharedprocessor_key);
+
 void __init pv_lock_init(void)
 {
if (SHARED_PROCESSOR) {
@@ -149,5 +152,6 @@ void __init pv_lock_init(void)
pv_lock_op.unlock = __pv_queued_spin_unlock;
pv_lock_op.wait = __pv_wait;
pv_lock_op.kick = __pv_kick;
+   static_branch_disable(&sharedprocessor_key);
}
 }
-- 
2.4.11



[PATCH] powerpc/xmon: add turn off xmon option

2017-02-12 Thread Pan Xinhui
Once xmon is triggered, there is no interface to turn it off again.
However, disable/enable xmon code flows already exist. And more important,
a system reset interrupt on powerVM will fire an oops to make a dump; at
that time, xmon should not be triggered.

So add a 'z' option after the current 'x|X' exit commands. Turn xmon off
if 'z' follows.

Signed-off-by: Pan Xinhui 
---
 arch/powerpc/xmon/xmon.c | 12 +---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 9c0e17c..2f4e7b1 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -76,6 +76,7 @@ static int xmon_gate;
 #endif /* CONFIG_SMP */
 
 static unsigned long in_xmon __read_mostly = 0;
+static int xmon_off = 0;
 
 static unsigned long adrs;
 static int size = 1;
@@ -255,8 +256,8 @@ Commands:\n\
   Sr # read SPR #\n\
   Sw #v write v to SPR #\n\
   tprint backtrace\n\
-  xexit monitor and recover\n\
-  Xexit monitor and don't recover\n"
+  x[z] exit monitor and recover, turn off xmon with 'z'\n\
+  X[z] exit monitor and don't recover, turn off xmon with 'z'\n"
 #if defined(CONFIG_PPC64) && !defined(CONFIG_PPC_BOOK3E)
 "  u   dump segment table or SLB\n"
 #elif defined(CONFIG_PPC_STD_MMU_32)
@@ -952,6 +953,8 @@ cmds(struct pt_regs *excp)
break;
case 'x':
case 'X':
+   if (inchar() == 'z')
+   xmon_off = 1;
return cmd;
case EOF:
printf(" \n");
@@ -3248,8 +3251,11 @@ static void xmon_init(int enable)
 static void sysrq_handle_xmon(int key)
 {
/* ensure xmon is enabled */
+   xmon_off = 0;
xmon_init(1);
debugger(get_irq_regs());
+   if (xmon_off)
+   xmon_init(0);
 }
 
 static struct sysrq_key_op sysrq_xmon_op = {
@@ -3266,7 +3272,7 @@ static int __init setup_xmon_sysrq(void)
 __initcall(setup_xmon_sysrq);
 #endif /* CONFIG_MAGIC_SYSRQ */
 
-static int __initdata xmon_early, xmon_off;
+static int __initdata xmon_early;
 
 static int __init early_parse_xmon(char *p)
 {
-- 
2.4.11
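
Usage, for illustration: 'x' alone still exits and recovers as before, while
'xz' exits and also turns xmon off, so that a later system reset interrupt
takes the oops/dump path instead of re-entering the debugger:

        mon> xz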



Re: [PATCH] powerpc/xmon: add debugfs entry for xmon

2017-02-13 Thread Pan Xinhui



On 2017/2/14 10:35, Nicholas Piggin wrote:

On Mon, 13 Feb 2017 19:00:42 -0200
"Guilherme G. Piccoli"  wrote:


Currently the xmon debugger is set only via kernel boot command-line.
It's disabled by default, and can be enabled with "xmon=on" on the
command-line. Also, xmon may be accessed via sysrq mechanism, but once
we enter xmon via sysrq, it's kept enabled until the system is rebooted,
even if we exit the debugger. A kernel crash will then lead to an xmon
instance, instead of triggering a kdump procedure (if configured), for
example.

This patch introduces a debugfs entry for xmon, allowing user to query
its current state and change it if desired. Basically, the "xmon" file
to read from/write to is under the debugfs mount point, on powerpc
directory. Reading this file will provide the current state of the
debugger, one of the following: "on", "off", "early" or "nobt". Writing
one of these states to the file will take immediate effect on the debugger.

Signed-off-by: Guilherme G. Piccoli 
---
* I had this patch partially done for some time, and after a discussion
on the kernel Slack channel last week, I decided to rebase and fix
some remaining bugs. I'd change 'x' option to always disable the debugger,
since with this patch we can always re-enable xmon, but today I noticed
Pan's patch on the mailing list, so perhaps his approach of adding a flag
to 'x' option is preferable. I can change this in a V2, if requested.
Thanks in advance!


xmon state changing after the first sysrq+x violates principle of least
astonishment, so I think that should be fixed.


hi, Nick
yes, as long as xmon is disabled during boot, it should still be disabled after 
exiting xmon.
My patch does not fix that, as it needs people to add one more char 'z' following 
'x'.
I will provide a new patch to fix that.


Then the question is, is it worth making it runtime configurable with xmon
command or debugfs tunables?


They are options for people to turn xmon features on or off. Maybe people 
don't need this.
However I am not a fan of debugfs this time as I am used to using xmon cmds. :)

Hi, Guilherme
So in the end, my thought is that: 1) cmd x|X will exit xmon and keep xmon in 
the original state (indicated by the var xmon_off).
2) Then add options to turn some features on/off. And debugfs may not be a good fit 
for this. But I am also wondering, at the same time, do people need this?

thanks
xinhui


Thanks,
Nick





[PATCH] powerpc/xmon: Fix an unexpected xmon onoff state change

2017-02-14 Thread Pan Xinhui
Once xmon is triggered by sysrq-x, it is always enabled afterwards even
if it was disabled during boot. This will cause a system reset interrupt
to fail to dump. So keep xmon in its original state after exit.

Signed-off-by: Pan Xinhui 
---
 arch/powerpc/xmon/xmon.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 9c0e17c..721212f 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -76,6 +76,7 @@ static int xmon_gate;
 #endif /* CONFIG_SMP */
 
 static unsigned long in_xmon __read_mostly = 0;
+static int xmon_off = 0;
 
 static unsigned long adrs;
 static int size = 1;
@@ -3250,6 +3251,8 @@ static void sysrq_handle_xmon(int key)
/* ensure xmon is enabled */
xmon_init(1);
debugger(get_irq_regs());
+   if (xmon_off)
+   xmon_init(0);
 }
 
 static struct sysrq_key_op sysrq_xmon_op = {
@@ -3266,7 +3269,7 @@ static int __init setup_xmon_sysrq(void)
 __initcall(setup_xmon_sysrq);
 #endif /* CONFIG_MAGIC_SYSRQ */
 
-static int __initdata xmon_early, xmon_off;
+static int __initdata xmon_early;
 
 static int __init early_parse_xmon(char *p)
 {
-- 
2.4.11



Re: [PATCH] powerpc/xmon: Fix an unexpected xmon onoff state change

2017-02-16 Thread Pan Xinhui



On 2017/2/16 18:57, Guilherme G. Piccoli wrote:

On 16/02/2017 03:09, Michael Ellerman wrote:

Pan Xinhui  writes:


Once xmon is triggered by sysrq-x, it is enabled always afterwards even
if it is disabled during boot. This will cause a system reset interrut
fail to dump. So keep xmon in its original state after exit.

Signed-off-by: Pan Xinhui 
---
 arch/powerpc/xmon/xmon.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 9c0e17c..721212f 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -76,6 +76,7 @@ static int xmon_gate;
 #endif /* CONFIG_SMP */

 static unsigned long in_xmon __read_mostly = 0;
+static int xmon_off = 0;

 static unsigned long adrs;
 static int size = 1;
@@ -3250,6 +3251,8 @@ static void sysrq_handle_xmon(int key)
/* ensure xmon is enabled */
xmon_init(1);
debugger(get_irq_regs());
+   if (xmon_off)
+   xmon_init(0);
 }


I don't think this is right.

xmon_off is only true if you boot with xmon=off on the command line.

So if you boot with CONFIG_XMON_DEFAULT=n, and nothing on the command
line, then enter xmon via sysrq, then exit, xmon will still be enabled.



Agreed, noticed it after some work in V2 of my patch. I'm addressing it
there, so maybe no harm in keeping this way here..


Hi, mpe
I cooked a new patch. And as Paul mentioned in slack, we need to keep xmon 
on too if xmon=early is set in the cmdline.

hi, Guilherme
feel free to include it in your new patchset. :)

thanks
xinhui

patch-
powerpc/xmon: Fix an unexpected xmon onoff state change

Once xmon is triggered by sysrq-x, it is always enabled afterwards even
if it was disabled during boot. This will cause a system reset interrupt
to fail to dump. So keep xmon in its original state after exit.

We have several ways to set xmon on or off.
1) by a build config, CONFIG_XMON_DEFAULT.
2) by a boot cmdline with xmon, xmon=early or xmon=on to enable xmon,
and xmon=off to disable xmon. This value will override that in step 1.
3) by a debugfs interface. We need someone to implement it in the future.
And this value can override those in steps 1 and 2.

Signed-off-by: Pan Xinhui 
---
 arch/powerpc/xmon/xmon.c | 16 +---
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 9c0e17c..f6e5c3d 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -76,6 +76,7 @@ static int xmon_gate;
 #endif /* CONFIG_SMP */
 
 static unsigned long in_xmon __read_mostly = 0;

+static int xmon_off = !IS_ENABLED(CONFIG_XMON_DEFAULT);
 
 static unsigned long adrs;

 static int size = 1;
@@ -3250,6 +3251,8 @@ static void sysrq_handle_xmon(int key)
/* ensure xmon is enabled */
xmon_init(1);
debugger(get_irq_regs());
+   if (xmon_off)
+   xmon_init(0);
 }
 
 static struct sysrq_key_op sysrq_xmon_op = {

@@ -3266,16 +3269,16 @@ static int __init setup_xmon_sysrq(void)
 __initcall(setup_xmon_sysrq);
 #endif /* CONFIG_MAGIC_SYSRQ */
 
-static int __initdata xmon_early, xmon_off;

+static int __initdata xmon_early;
 
 static int __init early_parse_xmon(char *p)

 {
if (!p || strncmp(p, "early", 5) == 0) {
/* just "xmon" is equivalent to "xmon=early" */
-   xmon_init(1);
xmon_early = 1;
+   xmon_off = 0;
} else if (strncmp(p, "on", 2) == 0)
-   xmon_init(1);
+   xmon_off = 0;
else if (strncmp(p, "off", 3) == 0)
xmon_off = 1;
else if (strncmp(p, "nobt", 4) == 0)
@@ -3289,10 +3292,9 @@ early_param("xmon", early_parse_xmon);
 
 void __init xmon_setup(void)

 {
-#ifdef CONFIG_XMON_DEFAULT
-   if (!xmon_off)
-   xmon_init(1);
-#endif
+   if (xmon_off)
+   return;
+   xmon_init(1);
if (xmon_early)
debugger(NULL);
 }
--
2.9.3




Thanks,


Guilherme




cheers







Re: [PATCH] powerpc/xmon: Fix an unexpected xmon onoff state change

2017-02-17 Thread Pan Xinhui



On 2017/2/17 14:05, Michael Ellerman wrote:

Pan Xinhui  writes:

diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 9c0e17c..f6e5c3d 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -76,6 +76,7 @@ static int xmon_gate;
  #endif /* CONFIG_SMP */

  static unsigned long in_xmon __read_mostly = 0;
+static int xmon_off = !IS_ENABLED(CONFIG_XMON_DEFAULT);


I think the logic would probably clearer if we invert this to become
xmon_on.


yep, makes sense.


@@ -3266,16 +3269,16 @@ static int __init setup_xmon_sysrq(void)
  __initcall(setup_xmon_sysrq);
  #endif /* CONFIG_MAGIC_SYSRQ */

-static int __initdata xmon_early, xmon_off;
+static int __initdata xmon_early;

  static int __init early_parse_xmon(char *p)
  {
if (!p || strncmp(p, "early", 5) == 0) {
/* just "xmon" is equivalent to "xmon=early" */
-   xmon_init(1);
xmon_early = 1;
+   xmon_off = 0;
} else if (strncmp(p, "on", 2) == 0)
-   xmon_init(1);
+   xmon_off = 0;


You've just changed the timing of when xmon gets enabled for the above
two cases, from here which is called very early, to xmon_setup() which
is called much later in boot.

That effectively disables xmon for most of the boot, which we do not
want to do.


Although it is not often that the kernel gets stuck during boot. Yes, the behavior 
changed anyway. Will fix that in v3.


cheers





Re: [PATCH v3 3/3] powerpc/xmon: add debugfs entry for xmon

2017-02-20 Thread Pan Xinhui



On 2017/2/21 09:58, Guilherme G. Piccoli wrote:

Currently the xmon debugger is set only via kernel boot command-line.
It's disabled by default, and can be enabled with "xmon=on" on the
command-line. Also, xmon may be accessed via sysrq mechanism.
But we cannot enable/disable xmon in runtime, it needs kernel reload.

This patch introduces a debugfs entry for xmon, allowing user to query
its current state and change it if desired. Basically, the "xmon" file
to read from/write to is under the debugfs mount point, on powerpc
directory. It's a simple attribute, value 0 meaning xmon is disabled
and value 1 the opposite. Writing these states to the file will take
immediate effect in the debugger.

Signed-off-by: Guilherme G. Piccoli 
---
v3: logic improved based in the changes made on patch 1.

v2: dropped the custom parser by using simple attributes [mpe suggestion].


 arch/powerpc/xmon/xmon.c | 31 +++
 1 file changed, 31 insertions(+)

diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index f1fcfa8..764ea62 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -29,6 +29,10 @@
 #include 
 #include 

+#ifdef CONFIG_DEBUG_FS
+#include 
+#endif
+
 #include 
 #include 
 #include 
@@ -3275,6 +3279,33 @@ static int __init setup_xmon_sysrq(void)
 device_initcall(setup_xmon_sysrq);
 #endif /* CONFIG_MAGIC_SYSRQ */

+#ifdef CONFIG_DEBUG_FS
+static int xmon_dbgfs_set(void *data, u64 val)
+{
+   xmon_off = !val;

xmon_off is really 'off' now (the variable no longer exists) :)
There is xmon_on instead


+   xmon_init(!xmon_off);
+
+   return 0;
+}
+
+static int xmon_dbgfs_get(void *data, u64 *val)
+{
+   *val = !xmon_off;
+   return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(xmon_dbgfs_ops, xmon_dbgfs_get,
+   xmon_dbgfs_set, "%llu\n");
+
+static int __init setup_xmon_dbgfs(void)
+{
+   debugfs_create_file("xmon", 0600, powerpc_debugfs_root, NULL,
+   &xmon_dbgfs_ops);
+   return 0;
+}
+device_initcall(setup_xmon_dbgfs);
+#endif /* CONFIG_DEBUG_FS */
+
 static int xmon_early __initdata;

 static int __init early_parse_xmon(char *p)





[PATCH v7 0/6] Implement qspinlock/pv-qspinlock on ppc

2016-09-18 Thread Pan Xinhui
                                           1221.9   1265.3   1250.4
Pipe-based Context Switching                529.8    578.1    564.2
Process Creation                            408.4    421.6    287.6
Shell Scripts (1 concurrent)               1201.8   1215.3   1185.8
Shell Scripts (8 concurrent)               3758.4   3799.3   3878.9
System Call Overhead                       1008.3   1122.6   1134.2
                                          ===========================
System Benchmarks Index Score              1072.0   1108.9   1050.6
--------


Pan Xinhui (6):
  pv-qspinlock: use cmpxchg_release in __pv_queued_spin_unlock
  powerpc/qspinlock: powerpc support qspinlock
  powerpc: pseries/Kconfig: Add qspinlock build config
  powerpc: lib/locks.c: Add cpu yield/wake helper function
  powerpc/pv-qspinlock: powerpc support pv-qspinlock
  powerpc: pSeries: Add pv-qspinlock build config/make

 arch/powerpc/include/asm/qspinlock.h   |  93 +
 arch/powerpc/include/asm/qspinlock_paravirt.h  |  36 +
 .../powerpc/include/asm/qspinlock_paravirt_types.h |  13 ++
 arch/powerpc/include/asm/spinlock.h|  35 +++--
 arch/powerpc/include/asm/spinlock_types.h  |   4 +
 arch/powerpc/kernel/Makefile   |   1 +
 arch/powerpc/kernel/paravirt.c | 153 +
 arch/powerpc/lib/locks.c   | 122 
 arch/powerpc/platforms/pseries/Kconfig |   9 ++
 arch/powerpc/platforms/pseries/setup.c |   5 +
 kernel/locking/qspinlock_paravirt.h|   2 +-
 11 files changed, 459 insertions(+), 14 deletions(-)
 create mode 100644 arch/powerpc/include/asm/qspinlock.h
 create mode 100644 arch/powerpc/include/asm/qspinlock_paravirt.h
 create mode 100644 arch/powerpc/include/asm/qspinlock_paravirt_types.h
 create mode 100644 arch/powerpc/kernel/paravirt.c

-- 
2.4.11



[PATCH v7 1/6] pv-qspinlock: use cmpxchg_release in __pv_queued_spin_unlock

2016-09-18 Thread Pan Xinhui
cmpxchg_release() is more lightweight than cmpxchg() on some archs (e.g.
PPC). Moreover, in __pv_queued_spin_unlock() we only need a RELEASE in
the fast path (pairing with *_try_lock() or *_lock()). And the slow path
has smp_store_release too. So it's safe to use cmpxchg_release here.

Suggested-by:  Boqun Feng 
Signed-off-by: Pan Xinhui 
---
 kernel/locking/qspinlock_paravirt.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/locking/qspinlock_paravirt.h 
b/kernel/locking/qspinlock_paravirt.h
index 8a99abf..ce655aa 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -544,7 +544,7 @@ __visible void __pv_queued_spin_unlock(struct qspinlock 
*lock)
 * unhash. Otherwise it would be possible to have multiple @lock
 * entries, which would be BAD.
 */
-   locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0);
+   locked = cmpxchg_release(&l->locked, _Q_LOCKED_VAL, 0);
if (likely(locked == _Q_LOCKED_VAL))
return;
 
-- 
2.4.11
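
To make the RELEASE/ACQUIRE pairing argument above concrete, here is a minimal
userspace sketch using C11 atomics and a trivial test-and-set lock. It is not
the kernel's cmpxchg_release()/qspinlock code, only an illustration of why a
release-only cmpxchg in the unlock path is sufficient when every acquisition
path uses an acquire operation.

#include <stdatomic.h>
#include <stdbool.h>

static atomic_uchar locked;

static bool try_lock(void)
{
	unsigned char expected = 0;

	/* ACQUIRE on success: the critical section cannot float above this. */
	return atomic_compare_exchange_strong_explicit(&locked, &expected, 1,
						       memory_order_acquire,
						       memory_order_relaxed);
}

static void unlock(void)
{
	unsigned char expected = 1;

	/*
	 * RELEASE on success is enough: it pairs with the ACQUIRE in
	 * try_lock(), so all stores made inside the critical section are
	 * visible to the next owner.  A full-barrier cmpxchg would be
	 * stronger than needed, which is the point of the patch above.
	 */
	atomic_compare_exchange_strong_explicit(&locked, &expected, 0,
						memory_order_release,
						memory_order_relaxed);
}

int main(void)
{
	if (try_lock())
		unlock();
	return 0;
}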



[PATCH v7 2/6] powerpc/qspinlock: powerpc support qspinlock

2016-09-18 Thread Pan Xinhui
This patch adds basic code to enable qspinlock on powerpc. qspinlock is
one kind of fair-lock implementation, and we have seen some performance
improvement under some scenarios.

queued_spin_unlock() releases the lock with just one write of 0 to the
->locked field, which sits at different offsets depending on the
endianness of the system.

We override some arch_spin_xxx helpers as powerpc has io_sync stuff which
makes sure the io operations are protected by the lock correctly.

There is another special case, see commit
2c610022711 ("locking/qspinlock: Fix spin_unlock_wait() some more")

Signed-off-by: Pan Xinhui 
---
 arch/powerpc/include/asm/qspinlock.h  | 66 +++
 arch/powerpc/include/asm/spinlock.h   | 31 +--
 arch/powerpc/include/asm/spinlock_types.h |  4 ++
 arch/powerpc/lib/locks.c  | 59 +++
 4 files changed, 147 insertions(+), 13 deletions(-)
 create mode 100644 arch/powerpc/include/asm/qspinlock.h

diff --git a/arch/powerpc/include/asm/qspinlock.h 
b/arch/powerpc/include/asm/qspinlock.h
new file mode 100644
index 000..881a186
--- /dev/null
+++ b/arch/powerpc/include/asm/qspinlock.h
@@ -0,0 +1,66 @@
+#ifndef _ASM_POWERPC_QSPINLOCK_H
+#define _ASM_POWERPC_QSPINLOCK_H
+
+#include 
+
+#define SPIN_THRESHOLD (1 << 15)
+#define queued_spin_unlock queued_spin_unlock
+#define queued_spin_is_locked queued_spin_is_locked
+#define queued_spin_unlock_wait queued_spin_unlock_wait
+
+extern void queued_spin_unlock_wait(struct qspinlock *lock);
+
+static inline u8 * __qspinlock_lock_byte(struct qspinlock *lock)
+{
+   return (u8 *)lock + 3 * IS_BUILTIN(CONFIG_CPU_BIG_ENDIAN);
+}
+
+static inline void queued_spin_unlock(struct qspinlock *lock)
+{
+   /* release semantics is required */
+   smp_store_release(__qspinlock_lock_byte(lock), 0);
+}
+
+static inline int queued_spin_is_locked(struct qspinlock *lock)
+{
+   smp_mb();
+   return atomic_read(&lock->val);
+}
+
+#include 
+
+/* we need override it as ppc has io_sync stuff */
+#undef arch_spin_trylock
+#undef arch_spin_lock
+#undef arch_spin_lock_flags
+#undef arch_spin_unlock
+#define arch_spin_trylock arch_spin_trylock
+#define arch_spin_lock arch_spin_lock
+#define arch_spin_lock_flags arch_spin_lock_flags
+#define arch_spin_unlock arch_spin_unlock
+
+static inline int arch_spin_trylock(arch_spinlock_t *lock)
+{
+   CLEAR_IO_SYNC;
+   return queued_spin_trylock(lock);
+}
+
+static inline void arch_spin_lock(arch_spinlock_t *lock)
+{
+   CLEAR_IO_SYNC;
+   queued_spin_lock(lock);
+}
+
+static inline
+void arch_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags)
+{
+   CLEAR_IO_SYNC;
+   queued_spin_lock(lock);
+}
+
+static inline void arch_spin_unlock(arch_spinlock_t *lock)
+{
+   SYNC_IO;
+   queued_spin_unlock(lock);
+}
+#endif /* _ASM_POWERPC_QSPINLOCK_H */
diff --git a/arch/powerpc/include/asm/spinlock.h 
b/arch/powerpc/include/asm/spinlock.h
index fa37fe9..6aef8dd 100644
--- a/arch/powerpc/include/asm/spinlock.h
+++ b/arch/powerpc/include/asm/spinlock.h
@@ -52,6 +52,23 @@
 #define SYNC_IO
 #endif
 
+#if defined(CONFIG_PPC_SPLPAR)
+/* We only yield to the hypervisor if we are in shared processor mode */
+#define SHARED_PROCESSOR (lppaca_shared_proc(local_paca->lppaca_ptr))
+extern void __spin_yield(arch_spinlock_t *lock);
+extern void __rw_yield(arch_rwlock_t *lock);
+#else /* SPLPAR */
+#define __spin_yield(x)barrier()
+#define __rw_yield(x)  barrier()
+#define SHARED_PROCESSOR   0
+#endif
+
+#ifdef CONFIG_QUEUED_SPINLOCKS
+#include 
+#else
+
+#define arch_spin_relax(lock)  __spin_yield(lock)
+
 static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock)
 {
return lock.slock == 0;
@@ -106,18 +123,6 @@ static inline int arch_spin_trylock(arch_spinlock_t *lock)
  * held.  Conveniently, we have a word in the paca that holds this
  * value.
  */
-
-#if defined(CONFIG_PPC_SPLPAR)
-/* We only yield to the hypervisor if we are in shared processor mode */
-#define SHARED_PROCESSOR (lppaca_shared_proc(local_paca->lppaca_ptr))
-extern void __spin_yield(arch_spinlock_t *lock);
-extern void __rw_yield(arch_rwlock_t *lock);
-#else /* SPLPAR */
-#define __spin_yield(x)barrier()
-#define __rw_yield(x)  barrier()
-#define SHARED_PROCESSOR   0
-#endif
-
 static inline void arch_spin_lock(arch_spinlock_t *lock)
 {
CLEAR_IO_SYNC;
@@ -195,6 +200,7 @@ out:
smp_mb();
 }
 
+#endif /* !CONFIG_QUEUED_SPINLOCKS */
 /*
  * Read-write spinlocks, allowing multiple readers
  * but only one writer.
@@ -330,7 +336,6 @@ static inline void arch_write_unlock(arch_rwlock_t *rw)
 #define arch_read_lock_flags(lock, flags) arch_read_lock(lock)
 #define arch_write_lock_flags(lock, flags) arch_write_lock(lock)
 
-#define arch_spin_relax(lock)  __spin_yield(lock)
 #define arch_read_relax(lock)  __rw_yield(lock)
 #define arch_write_relax(lock) __rw_yield(lock)
 
diff --git a/arch/powerpc
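
As a side note on the endianness remark in the changelog of this patch, the
sketch below (plain standalone C, not kernel code) shows why the locked byte
lives at offset 0 on little-endian and offset 3 on big-endian, assuming the
generic qspinlock layout where the locked byte occupies the low 8 bits of a
32-bit lock word; that offset is what __qspinlock_lock_byte() computes.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t val = 0x000000ff;	/* locked byte set in the low 8 bits */
	uint8_t *p = (uint8_t *)&val;

	/* Little-endian stores the low byte first (p[0]); big-endian last (p[3]). */
	int offset = (p[0] == 0xff) ? 0 : 3;

	printf("locked byte lives at byte offset %d on this machine\n", offset);
	return 0;
}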

[PATCH v7 3/6] powerpc: pseries/Kconfig: Add qspinlock build config

2016-09-18 Thread Pan Xinhui
pseries will use qspinlock by default.

Signed-off-by: Pan Xinhui 
---
 arch/powerpc/platforms/pseries/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/platforms/pseries/Kconfig 
b/arch/powerpc/platforms/pseries/Kconfig
index bec90fb..f669323 100644
--- a/arch/powerpc/platforms/pseries/Kconfig
+++ b/arch/powerpc/platforms/pseries/Kconfig
@@ -21,6 +21,7 @@ config PPC_PSERIES
select HOTPLUG_CPU if SMP
select ARCH_RANDOM
select PPC_DOORBELL
+   select ARCH_USE_QUEUED_SPINLOCKS
default y
 
 config PPC_SPLPAR
-- 
2.4.11



[PATCH v7 4/6] powerpc: lib/locks.c: Add cpu yield/wake helper function

2016-09-18 Thread Pan Xinhui
Add two corresponding helper functions to support pv-qspinlock.

For normal use, __spin_yield_cpu will confer the current vcpu's slices to the
target vcpu (say, a lock holder). If the target vcpu is not specified or it
is in the running state, whether such conferring to the lpar happens depends
on the *confer* parameter.

This is because the hcall itself will introduce latency and a little overhead,
and we do NOT want to suffer any latency in some cases, e.g. in an interrupt
handler. The second parameter *confer* indicates such a case.

__spin_wake_cpu is simpler; it will wake up one vcpu regardless of its
current state.

Signed-off-by: Pan Xinhui 
---
 arch/powerpc/include/asm/spinlock.h |  4 +++
 arch/powerpc/lib/locks.c| 59 +
 2 files changed, 63 insertions(+)

diff --git a/arch/powerpc/include/asm/spinlock.h 
b/arch/powerpc/include/asm/spinlock.h
index 6aef8dd..abb6b0f 100644
--- a/arch/powerpc/include/asm/spinlock.h
+++ b/arch/powerpc/include/asm/spinlock.h
@@ -56,9 +56,13 @@
 /* We only yield to the hypervisor if we are in shared processor mode */
 #define SHARED_PROCESSOR (lppaca_shared_proc(local_paca->lppaca_ptr))
 extern void __spin_yield(arch_spinlock_t *lock);
+extern void __spin_yield_cpu(int cpu, int confer);
+extern void __spin_wake_cpu(int cpu);
 extern void __rw_yield(arch_rwlock_t *lock);
 #else /* SPLPAR */
 #define __spin_yield(x)barrier()
+#define __spin_yield_cpu(x,y) barrier()
+#define __spin_wake_cpu(x) barrier()
 #define __rw_yield(x)  barrier()
 #define SHARED_PROCESSOR   0
 #endif
diff --git a/arch/powerpc/lib/locks.c b/arch/powerpc/lib/locks.c
index 6574626..892df7d 100644
--- a/arch/powerpc/lib/locks.c
+++ b/arch/powerpc/lib/locks.c
@@ -23,6 +23,65 @@
 #include 
 #include 
 
+/*
+ * confer our slices to a specified cpu and return. If it is already running or
+ * cpu is -1, then we will check confer. If confer is NULL, we will return
+ * otherwise we confer our slices to lpar.
+ */
+void __spin_yield_cpu(int cpu, int confer)
+{
+   unsigned int holder_cpu = cpu, yield_count;
+
+   if (cpu == -1)
+   goto yield_to_lpar;
+
+   BUG_ON(holder_cpu >= nr_cpu_ids);
+   yield_count = be32_to_cpu(lppaca_of(holder_cpu).yield_count);
+
+   /* if cpu is running, confer slices to lpar conditionally*/
+   if ((yield_count & 1) == 0)
+   goto yield_to_lpar;
+
+   plpar_hcall_norets(H_CONFER,
+   get_hard_smp_processor_id(holder_cpu), yield_count);
+   return;
+
+yield_to_lpar:
+   if (confer)
+   plpar_hcall_norets(H_CONFER, -1, 0);
+}
+EXPORT_SYMBOL_GPL(__spin_yield_cpu);
+
+void __spin_wake_cpu(int cpu)
+{
+   unsigned int holder_cpu = cpu;
+
+   BUG_ON(holder_cpu >= nr_cpu_ids);
+   /*
+* NOTE: we should always do this hcall regardless of
+* the yield_count of the holder_cpu.
+* as thers might be a case like below;
+* CPU  1   2
+*  yielded = true
+*  if (yielded)
+*  __spin_wake_cpu()
+*  __spin_yield_cpu()
+*
+* So we might lose a wake if we check the yield_count and
+* return directly if the holder_cpu is running.
+* IOW. do NOT code like below.
+*  yield_count = be32_to_cpu(lppaca_of(holder_cpu).yield_count);
+*  if ((yield_count & 1) == 0)
+*  return;
+*
+* a PROD hcall marks the target_cpu proded, which cause the next cede 
or confer
+* called on the target_cpu invalid.
+*/
+   plpar_hcall_norets(H_PROD,
+   get_hard_smp_processor_id(holder_cpu));
+}
+EXPORT_SYMBOL_GPL(__spin_wake_cpu);
+
 #ifndef CONFIG_QUEUED_SPINLOCKS
 void __spin_yield(arch_spinlock_t *lock)
 {
-- 
2.4.11



[PATCH v7 5/6] powerpc/pv-qspinlock: powerpc support pv-qspinlock

2016-09-18 Thread Pan Xinhui
The default pv-qspinlock uses qspinlock (the native version of pv-qspinlock).
pv_lock initialization should be done at boot stage with irqs disabled.
And if we run as a guest in powerKVM/pHyp shared_processor mode, restore
the pv_lock_ops callbacks to pv-qspinlock (the pv version), which makes
full use of virtualization.

There is a hash table; we store the cpu number into it and the key is the lock.
So pv_wait can always find out who the lock holder is by searching for the
lock. Also store the lock in a per_cpu struct, and remove it when we own
the lock. Then pv_wait can know which lock we are spinning on. But the
cpu in the hash table might not be the correct lock holder, as for
performance reasons we do not take care of hash conflicts.

Also introduce spin_lock_holder, which tells who owns the lock now.
Currently the only user is spin_unlock_wait.

Signed-off-by: Pan Xinhui 
---
 arch/powerpc/include/asm/qspinlock.h   |  29 +++-
 arch/powerpc/include/asm/qspinlock_paravirt.h  |  36 +
 .../powerpc/include/asm/qspinlock_paravirt_types.h |  13 ++
 arch/powerpc/kernel/paravirt.c | 153 +
 arch/powerpc/lib/locks.c   |   8 +-
 arch/powerpc/platforms/pseries/setup.c |   5 +
 6 files changed, 241 insertions(+), 3 deletions(-)
 create mode 100644 arch/powerpc/include/asm/qspinlock_paravirt.h
 create mode 100644 arch/powerpc/include/asm/qspinlock_paravirt_types.h
 create mode 100644 arch/powerpc/kernel/paravirt.c

diff --git a/arch/powerpc/include/asm/qspinlock.h 
b/arch/powerpc/include/asm/qspinlock.h
index 881a186..23459fb 100644
--- a/arch/powerpc/include/asm/qspinlock.h
+++ b/arch/powerpc/include/asm/qspinlock.h
@@ -15,7 +15,7 @@ static inline u8 * __qspinlock_lock_byte(struct qspinlock 
*lock)
return (u8 *)lock + 3 * IS_BUILTIN(CONFIG_CPU_BIG_ENDIAN);
 }
 
-static inline void queued_spin_unlock(struct qspinlock *lock)
+static inline void native_queued_spin_unlock(struct qspinlock *lock)
 {
/* release semantics is required */
smp_store_release(__qspinlock_lock_byte(lock), 0);
@@ -27,6 +27,33 @@ static inline int queued_spin_is_locked(struct qspinlock 
*lock)
return atomic_read(&lock->val);
 }
 
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+#include 
+/*
+ * try to know who is the lock holder, however it is not always true
+ * Return:
+ * -1, we did not know the lock holder.
+ * other value, likely is the lock holder.
+ */
+extern int spin_lock_holder(void *lock);
+
+static inline void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
+{
+   pv_queued_spin_lock(lock, val);
+}
+
+static inline void queued_spin_unlock(struct qspinlock *lock)
+{
+   pv_queued_spin_unlock(lock);
+}
+#else
+#define spin_lock_holder(l) (-1)
+static inline void queued_spin_unlock(struct qspinlock *lock)
+{
+   native_queued_spin_unlock(lock);
+}
+#endif
+
 #include 
 
 /* we need override it as ppc has io_sync stuff */
diff --git a/arch/powerpc/include/asm/qspinlock_paravirt.h 
b/arch/powerpc/include/asm/qspinlock_paravirt.h
new file mode 100644
index 000..d87cda0
--- /dev/null
+++ b/arch/powerpc/include/asm/qspinlock_paravirt.h
@@ -0,0 +1,36 @@
+#ifndef CONFIG_PARAVIRT_SPINLOCKS
+#error "do not include this file"
+#endif
+
+#ifndef _ASM_QSPINLOCK_PARAVIRT_H
+#define _ASM_QSPINLOCK_PARAVIRT_H
+
+#include  
+
+extern void pv_lock_init(void);
+extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
+extern void __pv_init_lock_hash(void);
+extern void __pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
+extern void __pv_queued_spin_unlock(struct qspinlock *lock);
+
+static inline void pv_queued_spin_lock(struct qspinlock *lock, u32 val)
+{
+   pv_lock_op.lock(lock, val);
+}
+
+static inline void pv_queued_spin_unlock(struct qspinlock *lock)
+{
+   pv_lock_op.unlock(lock);
+}
+
+static inline void pv_wait(u8 *ptr, u8 val)
+{
+   pv_lock_op.wait(ptr, val);
+}
+
+static inline void pv_kick(int cpu)
+{
+   pv_lock_op.kick(cpu);
+}
+
+#endif
diff --git a/arch/powerpc/include/asm/qspinlock_paravirt_types.h 
b/arch/powerpc/include/asm/qspinlock_paravirt_types.h
new file mode 100644
index 000..83611ed
--- /dev/null
+++ b/arch/powerpc/include/asm/qspinlock_paravirt_types.h
@@ -0,0 +1,13 @@
+#ifndef _ASM_QSPINLOCK_PARAVIRT_TYPES_H
+#define _ASM_QSPINLOCK_PARAVIRT_TYPES_H
+
+struct pv_lock_ops {
+   void (*lock)(struct qspinlock *lock, u32 val);
+   void (*unlock)(struct qspinlock *lock);
+   void (*wait)(u8 *ptr, u8 val);
+   void (*kick)(int cpu);
+};
+
+extern struct pv_lock_ops pv_lock_op;
+
+#endif
diff --git a/arch/powerpc/kernel/paravirt.c b/arch/powerpc/kernel/paravirt.c
new file mode 100644
index 000..e697b17
--- /dev/null
+++ b/arch/powerpc/kernel/paravirt.c
@@ -0,0 +1,153 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * 
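
The lock-holder tracking described in the changelog above (a hash keyed by the
lock address holding a cpu number, plus a per-cpu record of the lock being
spun on) can be sketched roughly as below. This is an illustration only, with
hypothetical names and sizes rather than the actual paravirt.c code; as the
changelog notes, hash conflicts are tolerated, so the reported holder is only
a likely guess.

#include <stdint.h>

#define HOLDER_HASH_BITS	8
#define HOLDER_HASH_SIZE	(1u << HOLDER_HASH_BITS)
#define NR_CPUS_SKETCH		64

struct holder_entry {
	void	*lock;
	int	cpu;
};

static struct holder_entry holder_hash[HOLDER_HASH_SIZE];
static void *spinning_on[NR_CPUS_SKETCH];	/* per-cpu data in the real code */

static unsigned int hash_lock(void *lock)
{
	return (unsigned int)(((uintptr_t)lock >> 4) & (HOLDER_HASH_SIZE - 1));
}

/* Record "this cpu is taking/holding this lock"; collisions simply overwrite. */
static void record_holder(void *lock, int cpu)
{
	unsigned int h = hash_lock(lock);

	holder_hash[h].lock = lock;
	holder_hash[h].cpu = cpu;
	spinning_on[cpu] = lock;
}

/* Once the lock is owned, clear the per-cpu "which lock am I spinning on" slot. */
static void own_lock(int cpu)
{
	spinning_on[cpu] = NULL;
}

/* Best-effort spin_lock_holder(): -1 means unknown, anything else is only likely. */
static int lock_holder(void *lock)
{
	unsigned int h = hash_lock(lock);

	return holder_hash[h].lock == lock ? holder_hash[h].cpu : -1;
}

int main(void)
{
	int lock;	/* stand-in for a qspinlock */

	record_holder(&lock, 2);
	own_lock(2);
	return lock_holder(&lock) == 2 ? 0 : 1;
}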

[PATCH v7 6/6] powerpc: pSeries: Add pv-qspinlock build config/make

2016-09-18 Thread Pan Xinhui
pSeries runs as a guest and might need pv-qspinlock.

Signed-off-by: Pan Xinhui 
---
 arch/powerpc/kernel/Makefile   | 1 +
 arch/powerpc/platforms/pseries/Kconfig | 8 
 2 files changed, 9 insertions(+)

diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index fe4c075..efd2f3d 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -49,6 +49,7 @@ obj-$(CONFIG_PPC_970_NAP) += idle_power4.o
 obj-$(CONFIG_PPC_P7_NAP)   += idle_book3s.o
 procfs-y   := proc_powerpc.o
 obj-$(CONFIG_PROC_FS)  += $(procfs-y)
+obj-$(CONFIG_PARAVIRT_SPINLOCKS)   += paravirt.o
 rtaspci-$(CONFIG_PPC64)-$(CONFIG_PCI)  := rtas_pci.o
 obj-$(CONFIG_PPC_RTAS) += rtas.o rtas-rtc.o $(rtaspci-y-y)
 obj-$(CONFIG_PPC_RTAS_DAEMON)  += rtasd.o
diff --git a/arch/powerpc/platforms/pseries/Kconfig 
b/arch/powerpc/platforms/pseries/Kconfig
index f669323..46632e4 100644
--- a/arch/powerpc/platforms/pseries/Kconfig
+++ b/arch/powerpc/platforms/pseries/Kconfig
@@ -128,3 +128,11 @@ config HV_PERF_CTRS
  systems. 24x7 is available on Power 8 systems.
 
   If unsure, select Y.
+
+config PARAVIRT_SPINLOCKS
+   bool "Paravirtialization support for qspinlock"
+   depends on PPC_SPLPAR && QUEUED_SPINLOCKS
+   default y
+   help
+ If platform supports virtualization, for example PowerVM, this option
+ can let guest have a better performace.
-- 
2.4.11



Re: [PATCH v3 0/4] implement vcpu preempted check

2016-09-29 Thread Pan Xinhui



On 2016/9/29 18:10, Peter Zijlstra wrote:

On Thu, Jul 21, 2016 at 07:45:10AM -0400, Pan Xinhui wrote:

change from v2:
no code change, fix typos, update some comments

change from v1:
a simplier definition of default vcpu_is_preempted
skip mahcine type check on ppc, and add config. remove dedicated macro.
add one patch to drop overload of rwsem_spin_on_owner and 
mutex_spin_on_owner.
add more comments
thanks boqun and Peter's suggestion.

This patch set aims to fix lock holder preemption issues.


So I really like the concept, but I would also really like to see
support for more hypervisors included before we can move forward with
this.

Please consider s390 and (x86/arm) KVM. Once we have a few, more can
follow later, but I think its important to not only have PPC support for
this.


hi, Peter
right, we need to consider more arches.
I will try to work out an RFC patch on x86 anyway. There are already some 
discussions of the solution on x86 :)

thanks
xinhui



Re: [PATCH v3 0/4] implement vcpu preempted check

2016-09-29 Thread Pan Xinhui



On 2016/9/29 18:31, Peter Zijlstra wrote:

On Thu, Sep 29, 2016 at 12:23:19PM +0200, Christian Borntraeger wrote:

On 09/29/2016 12:10 PM, Peter Zijlstra wrote:

On Thu, Jul 21, 2016 at 07:45:10AM -0400, Pan Xinhui wrote:

change from v2:
no code change, fix typos, update some comments

change from v1:
a simplier definition of default vcpu_is_preempted
skip mahcine type check on ppc, and add config. remove dedicated macro.
add one patch to drop overload of rwsem_spin_on_owner and 
mutex_spin_on_owner.
add more comments
thanks boqun and Peter's suggestion.

This patch set aims to fix lock holder preemption issues.


So I really like the concept, but I would also really like to see
support for more hypervisors included before we can move forward with
this.

Please consider s390 and (x86/arm) KVM. Once we have a few, more can
follow later, but I think its important to not only have PPC support for
this.


Actually the s390 preemted check via sigp sense running  is available for
all hypervisors (z/VM, LPAR and KVM) which implies everywhere as you can no
longer buy s390 systems without LPAR.

As Heiko already pointed out we could simply use a small inline function
that calls cpu_is_preempted from arch/s390/lib/spinlock (or smp_vcpu_scheduled 
from smp.c)


Sure, and I had vague memories of Heiko's email. This patch set however
completely fails to do that trivial hooking up.



sorry for that.
I will try to work it out on x86.

Hi, Will
I would appreciate it if you or some other arm guys could help on it. :)



Re: [PATCH v3 0/4] implement vcpu preempted check

2016-09-30 Thread Pan Xinhui


hi, Paolo
thanks for your reply.

On 2016/9/30 14:58, Paolo Bonzini wrote:

Please consider s390 and (x86/arm) KVM. Once we have a few, more can
follow later, but I think its important to not only have PPC support for
this.


Actually the s390 preemted check via sigp sense running  is available for
all hypervisors (z/VM, LPAR and KVM) which implies everywhere as you can
no longer buy s390 systems without LPAR.

As Heiko already pointed out we could simply use a small inline function
that calls cpu_is_preempted from arch/s390/lib/spinlock (or
smp_vcpu_scheduled from smp.c)


Sure, and I had vague memories of Heiko's email. This patch set however
completely fails to do that trivial hooking up.


sorry for that.
I will try to work it out on x86.


x86 has no hypervisor support, and I'd like to understand the desired
semantics first, so I don't think it should block this series.  In


Once a guest does a hypercall or something similar, IOW, there is a 
kvm_guest_exit, we consider this to be lock holder preemption.
And PPC implements it in this way.


particular, there are at least the following choices:

1) exit to userspace (5-10.000 clock cycles best case) counts as
lock holder preemption


just to avoid any misunderstanding.
You are saying that the guest does an IO operation, for example, and then exits to 
QEMU, right?
Yes, in this scenario it's hard to guarantee that such an IO operation or something 
like that could be finished in time.

 

2) any time the vCPU thread not running counts as lock holder
preemption

To implement the latter you'd need a hypercall or MSR (at least as
a slow path), because the KVM preempt notifier is only active
during the KVM_RUN ioctl.


seems a little expensive. :(
How many clock cycles might it cost?

I am still looking for a shared struct between kvm and the guest kernel on x86,
and every time kvm_guest_exit/enter is called, we store some info in it. So the guest 
kernel can quickly check whether one vcpu is running or not.

thanks
xinhui


Paolo





Re: [PATCH v3 0/4] implement vcpu preempted check

2016-09-30 Thread Pan Xinhui



On 2016/9/30 17:08, Paolo Bonzini wrote:



On 30/09/2016 10:52, Pan Xinhui wrote:

x86 has no hypervisor support, and I'd like to understand the desired
semantics first, so I don't think it should block this series.  In


Once a guest do a hypercall or something similar, IOW, there is a
kvm_guest_exit. we think this is a lock holder preemption.
Adn PPC implement it in this way.


Ok, good.


particular, there are at least the following choices:

1) exit to userspace (5-10.000 clock cycles best case) counts as
lock holder preemption

2) any time the vCPU thread not running counts as lock holder
preemption

To implement the latter you'd need a hypercall or MSR (at least as
a slow path), because the KVM preempt notifier is only active
during the KVM_RUN ioctl.


seems a little expensive. :(
How many clock cycles it might cost.


An MSR read is about 1500 clock cycles, but it need not be the fast path
(e.g. use a bit to check if the CPU is running, if not use the MSR to
check if the CPU is in userspace but the CPU thread is scheduled).  But
it's not necessary if you are just matching PPC semantics.

Then the simplest thing is to use the kvm_steal_time struct, and add a
new field to it that replaces pad[0].  You can write a 0 to the flag in
record_steal_time (not preempted) and a 1 in kvm_arch_vcpu_put
(preempted).  record_steal_time is called before the VM starts running,
immediately after KVM_RUN and also after every sched_in.

If KVM doesn't implement the flag, it won't touch that field at all.  So
the kernel can write a 0, meaning "not preempted", and not care if the
hypervisor implements the flag or not: the answer will always be safe.

The pointer to the flag can be placed in a per-cpu u32*, and again if
the u32* is NULL that means "not preempted".


really nice suggestion!  That's what I want :)

thanks
xinhui


Paolo



I am still looking for one shared struct between kvm and guest kernel on
x86.
and every time kvm_guest_exit/enter called, we store some info in it. So
guest kernel can check one vcpu is running or not quickly.

thanks
xinhui


Paolo









[PATCH v4 1/5] kernel/sched: introduce vcpu preempted check interface

2016-10-18 Thread Pan Xinhui
This patch adds support for fixing lock holder preemption issues.

For kernel users, we could use bool vcpu_is_preempted(int cpu) to detect if
one vcpu is preempted or not.

The default implementation is a macro defined as false, so the compiler can
optimize it out if the arch does not support such a vcpu preempted check.

Suggested-by: Peter Zijlstra (Intel) 
Signed-off-by: Pan Xinhui 
---
 include/linux/sched.h | 12 
 1 file changed, 12 insertions(+)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 348f51b..44c1ce7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -3506,6 +3506,18 @@ static inline void set_task_cpu(struct task_struct *p, 
unsigned int cpu)
 
 #endif /* CONFIG_SMP */
 
+/*
+ * In order to deal with a various lock holder preemption issues provide an
+ * interface to see if a vCPU is currently running or not.
+ *
+ * This allows us to terminate optimistic spin loops and block, analogous to
+ * the native optimistic spin heuristic of testing if the lock owner task is
+ * running or not.
+ */
+#ifndef vcpu_is_preempted
+#define vcpu_is_preempted(cpu) false
+#endif
+
 extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
 extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
 
-- 
2.4.11



[PATCH v4 0/5] implement vcpu preempted check

2016-10-18 Thread Pan Xinhui
change from v3:
add x86 vcpu preempted check patch
change from v2:
no code change, fix typos, update some comments
change from v1:
a simpler definition of the default vcpu_is_preempted
skip machine type check on ppc, and add config. remove dedicated macro.
add one patch to drop overload of rwsem_spin_on_owner and 
mutex_spin_on_owner.
add more comments
thanks to boqun and Peter's suggestions.

This patch set aims to fix lock holder preemption issues.

test-case:
perf record -a perf bench sched messaging -g 400 -p && perf report

18.09%  sched-messaging  [kernel.vmlinux]  [k] osq_lock
12.28%  sched-messaging  [kernel.vmlinux]  [k] rwsem_spin_on_owner
 5.27%  sched-messaging  [kernel.vmlinux]  [k] mutex_unlock
 3.89%  sched-messaging  [kernel.vmlinux]  [k] wait_consider_task
 3.64%  sched-messaging  [kernel.vmlinux]  [k] _raw_write_lock_irq
 3.41%  sched-messaging  [kernel.vmlinux]  [k] mutex_spin_on_owner.is
 2.49%  sched-messaging  [kernel.vmlinux]  [k] system_call

We introduce the interface bool vcpu_is_preempted(int cpu) and use it in some spin
loops of osq_lock, rwsem_spin_on_owner and mutex_spin_on_owner.
These spin_on_owner variants also cause rcu stalls before we apply this patch set.

We have also observed some performance improvements.

PPC test result:

1 copy - 0.94%
2 copy - 7.17%
4 copy - 11.9%
8 copy -  3.04%
16 copy - 15.11%

details below:
Without patch:

1 copy - File Write 4096 bufsize 8000 maxblocks  2188223.0 KBps  (30.0 s, 1 
samples)
2 copy - File Write 4096 bufsize 8000 maxblocks  1804433.0 KBps  (30.0 s, 1 
samples)
4 copy - File Write 4096 bufsize 8000 maxblocks  1237257.0 KBps  (30.0 s, 1 
samples)
8 copy - File Write 4096 bufsize 8000 maxblocks  1032658.0 KBps  (30.0 s, 1 
samples)
16 copy - File Write 4096 bufsize 8000 maxblocks   768000.0 KBps  (30.1 s, 
1 samples)

With patch: 

1 copy - File Write 4096 bufsize 8000 maxblocks  2209189.0 KBps  (30.0 s, 1 
samples)
2 copy - File Write 4096 bufsize 8000 maxblocks  1943816.0 KBps  (30.0 s, 1 
samples)
4 copy - File Write 4096 bufsize 8000 maxblocks  1405591.0 KBps  (30.0 s, 1 
samples)
8 copy - File Write 4096 bufsize 8000 maxblocks  1065080.0 KBps  (30.0 s, 1 
samples)
16 copy - File Write 4096 bufsize 8000 maxblocks   904762.0 KBps  (30.0 s, 
1 samples)

X86 test result:
test-case   after-patch   before-patch
Execl Throughput   |18307.9 lps  |11701.6 lps 
File Copy 1024 bufsize 2000 maxblocks  |  1352407.3 KBps |   790418.9 KBps
File Copy 256 bufsize 500 maxblocks|   367555.6 KBps |   222867.7 KBps
File Copy 4096 bufsize 8000 maxblocks  |  3675649.7 KBps |  1780614.4 KBps
Pipe Throughput| 11872208.7 lps  | 11855628.9 lps 
Pipe-based Context Switching   |  1495126.5 lps  |  1490533.9 lps 
Process Creation   |29881.2 lps  |28572.8 lps 
Shell Scripts (1 concurrent)   |23224.3 lpm  |22607.4 lpm 
Shell Scripts (8 concurrent)   | 3531.4 lpm  | 3211.9 lpm 
System Call Overhead   | 10385653.0 lps  | 10419979.0 lps 

Pan Xinhui (5):
  kernel/sched: introduce vcpu preempted check interface
  locking/osq: Drop the overload of osq_lock()
  kernel/locking: Drop the overload of {mutex,rwsem}_spin_on_owner
  powerpc/spinlock: support vcpu preempted check
  x86, kvm: support vcpu preempted check

 arch/powerpc/include/asm/spinlock.h   |  8 
 arch/x86/include/asm/paravirt_types.h |  6 ++
 arch/x86/include/asm/spinlock.h   |  8 
 arch/x86/include/uapi/asm/kvm_para.h  |  3 ++-
 arch/x86/kernel/kvm.c | 11 +++
 arch/x86/kernel/paravirt.c| 11 +++
 arch/x86/kvm/x86.c| 12 
 include/linux/sched.h | 12 
 kernel/locking/mutex.c| 15 +--
 kernel/locking/osq_lock.c | 10 +-
 kernel/locking/rwsem-xadd.c   | 16 +---
 11 files changed, 105 insertions(+), 7 deletions(-)

-- 
2.4.11



[PATCH v4 2/5] locking/osq: Drop the overload of osq_lock()

2016-10-18 Thread Pan Xinhui
An over-committed guest with more vCPUs than pCPUs has a heavy overload in
osq_lock().

This is because vCPU A holds the osq lock and yields out, while vCPU B waits for the
per-cpu node->locked to be set. IOW, vCPU B waits for vCPU A to run and unlock the
osq lock.

The kernel has an interface, bool vcpu_is_preempted(int cpu), to see if a vCPU is
currently running or not. So break the spin loops when it returns true.

test case:
perf record -a perf bench sched messaging -g 400 -p && perf report

before patch:
18.09%  sched-messaging  [kernel.vmlinux]  [k] osq_lock
12.28%  sched-messaging  [kernel.vmlinux]  [k] rwsem_spin_on_owner
 5.27%  sched-messaging  [kernel.vmlinux]  [k] mutex_unlock
 3.89%  sched-messaging  [kernel.vmlinux]  [k] wait_consider_task
 3.64%  sched-messaging  [kernel.vmlinux]  [k] _raw_write_lock_irq
 3.41%  sched-messaging  [kernel.vmlinux]  [k] mutex_spin_on_owner.is
 2.49%  sched-messaging  [kernel.vmlinux]  [k] system_call

after patch:
20.68%  sched-messaging  [kernel.vmlinux]  [k] mutex_spin_on_owner
 8.45%  sched-messaging  [kernel.vmlinux]  [k] mutex_unlock
 4.12%  sched-messaging  [kernel.vmlinux]  [k] system_call
 3.01%  sched-messaging  [kernel.vmlinux]  [k] system_call_common
 2.83%  sched-messaging  [kernel.vmlinux]  [k] copypage_power7
 2.64%  sched-messaging  [kernel.vmlinux]  [k] rwsem_spin_on_owner
 2.00%  sched-messaging  [kernel.vmlinux]  [k] osq_lock

Suggested-by: Boqun Feng 
Signed-off-by: Pan Xinhui 
---
 kernel/locking/osq_lock.c | 10 +-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c
index 05a3785..39d1385 100644
--- a/kernel/locking/osq_lock.c
+++ b/kernel/locking/osq_lock.c
@@ -21,6 +21,11 @@ static inline int encode_cpu(int cpu_nr)
return cpu_nr + 1;
 }
 
+static inline int node_cpu(struct optimistic_spin_node *node)
+{
+   return node->cpu - 1;
+}
+
 static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val)
 {
int cpu_nr = encoded_cpu_val - 1;
@@ -118,8 +123,11 @@ bool osq_lock(struct optimistic_spin_queue *lock)
while (!READ_ONCE(node->locked)) {
/*
 * If we need to reschedule bail... so we can block.
+* Use vcpu_is_preempted to detech lock holder preemption issue
+* and break. vcpu_is_preempted is a macro defined by false if
+* arch does not support vcpu preempted check,
 */
-   if (need_resched())
+   if (need_resched() || vcpu_is_preempted(node_cpu(node->prev)))
goto unqueue;
 
cpu_relax_lowlatency();
-- 
2.4.11



[PATCH v4 3/5] kernel/locking: Drop the overload of {mutex, rwsem}_spin_on_owner

2016-10-18 Thread Pan Xinhui
An over-committed guest with more vCPUs than pCPUs has a heavy overload in
the two spin_on_owner loops. This is caused by the lock holder preemption issue.

The kernel has an interface, bool vcpu_is_preempted(int cpu), to see if a vCPU is
currently running or not. So break the spin loops when it returns true.

test-case:
perf record -a perf bench sched messaging -g 400 -p && perf report

before patch:
20.68%  sched-messaging  [kernel.vmlinux]  [k] mutex_spin_on_owner
 8.45%  sched-messaging  [kernel.vmlinux]  [k] mutex_unlock
 4.12%  sched-messaging  [kernel.vmlinux]  [k] system_call
 3.01%  sched-messaging  [kernel.vmlinux]  [k] system_call_common
 2.83%  sched-messaging  [kernel.vmlinux]  [k] copypage_power7
 2.64%  sched-messaging  [kernel.vmlinux]  [k] rwsem_spin_on_owner
 2.00%  sched-messaging  [kernel.vmlinux]  [k] osq_lock

after patch:
 9.99%  sched-messaging  [kernel.vmlinux]  [k] mutex_unlock
 5.28%  sched-messaging  [unknown] [H] 0xc00768e0
 4.27%  sched-messaging  [kernel.vmlinux]  [k] __copy_tofrom_user_power7
 3.77%  sched-messaging  [kernel.vmlinux]  [k] copypage_power7
 3.24%  sched-messaging  [kernel.vmlinux]  [k] _raw_write_lock_irq
 3.02%  sched-messaging  [kernel.vmlinux]  [k] system_call
 2.69%  sched-messaging  [kernel.vmlinux]  [k] wait_consider_task

Signed-off-by: Pan Xinhui 
---
 kernel/locking/mutex.c  | 15 +--
 kernel/locking/rwsem-xadd.c | 16 +---
 2 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index a70b90d..8927e96 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -236,7 +236,13 @@ bool mutex_spin_on_owner(struct mutex *lock, struct 
task_struct *owner)
 */
barrier();
 
-   if (!owner->on_cpu || need_resched()) {
+   /*
+* Use vcpu_is_preempted to detech lock holder preemption issue
+* and break. vcpu_is_preempted is a macro defined by false if
+* arch does not support vcpu preempted check,
+*/
+   if (!owner->on_cpu || need_resched() ||
+   vcpu_is_preempted(task_cpu(owner))) {
ret = false;
break;
}
@@ -261,8 +267,13 @@ static inline int mutex_can_spin_on_owner(struct mutex 
*lock)
 
rcu_read_lock();
owner = READ_ONCE(lock->owner);
+
+   /*
+* As lock holder preemption issue, we both skip spinning if task is not
+* on cpu or its cpu is preempted
+*/
if (owner)
-   retval = owner->on_cpu;
+   retval = owner->on_cpu && !vcpu_is_preempted(task_cpu(owner));
rcu_read_unlock();
/*
 * if lock->owner is not set, the mutex owner may have just acquired
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 2337b4b..ad0b5bb 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -336,7 +336,11 @@ static inline bool rwsem_can_spin_on_owner(struct 
rw_semaphore *sem)
goto done;
}
 
-   ret = owner->on_cpu;
+   /*
+* As lock holder preemption issue, we both skip spinning if task is not
+* on cpu or its cpu is preempted
+*/
+   ret = owner->on_cpu && !vcpu_is_preempted(task_cpu(owner));
 done:
rcu_read_unlock();
return ret;
@@ -362,8 +366,14 @@ static noinline bool rwsem_spin_on_owner(struct 
rw_semaphore *sem)
 */
barrier();
 
-   /* abort spinning when need_resched or owner is not running */
-   if (!owner->on_cpu || need_resched()) {
+   /*
+* abort spinning when need_resched or owner is not running or
+* owner's cpu is preempted. vcpu_is_preempted is a macro
+* defined by false if arch does not support vcpu preempted
+* check
+*/
+   if (!owner->on_cpu || need_resched() ||
+   vcpu_is_preempted(task_cpu(owner))) {
rcu_read_unlock();
return false;
}
-- 
2.4.11



[PATCH v4 4/5] powerpc/spinlock: support vcpu preempted check

2016-10-18 Thread Pan Xinhui
This is to fix some lock holder preemption issues. Some other lock
implementations do a spin loop before acquiring the lock itself.
Currently the kernel has an interface, bool vcpu_is_preempted(int cpu). It
takes the cpu as a parameter and returns true if the cpu is preempted. Then
the kernel can break the spin loops based on the retval of vcpu_is_preempted.

As the kernel already uses this interface, let's support it.

Only pSeries needs to support it. And the fact is that powerNV is built into
the same kernel image as pSeries, so we need to return false if we are running
as powerNV. Another fact is that lppaca->yield_count stays zero on
powerNV. So we can just skip the machine type check.

Suggested-by: Boqun Feng 
Suggested-by: Peter Zijlstra (Intel) 
Signed-off-by: Pan Xinhui 
---
 arch/powerpc/include/asm/spinlock.h | 8 
 1 file changed, 8 insertions(+)

diff --git a/arch/powerpc/include/asm/spinlock.h 
b/arch/powerpc/include/asm/spinlock.h
index abb6b0f..af4285b 100644
--- a/arch/powerpc/include/asm/spinlock.h
+++ b/arch/powerpc/include/asm/spinlock.h
@@ -52,6 +52,14 @@
 #define SYNC_IO
 #endif
 
+#ifdef CONFIG_PPC_PSERIES
+#define vcpu_is_preempted vcpu_is_preempted
+static inline bool vcpu_is_preempted(int cpu)
+{
+   return !!(be32_to_cpu(lppaca_of(cpu).yield_count) & 1);
+}
+#endif
+
 #if defined(CONFIG_PPC_SPLPAR)
 /* We only yield to the hypervisor if we are in shared processor mode */
 #define SHARED_PROCESSOR (lppaca_shared_proc(local_paca->lppaca_ptr))
-- 
2.4.11



[PATCH v4 5/5] x86, kvm: support vcpu preempted check

2016-10-18 Thread Pan Xinhui
This is to fix some lock holder preemption issues. Some other lock
implementations do a spin loop before acquiring the lock itself.
Currently the kernel has an interface, bool vcpu_is_preempted(int cpu). It
takes the cpu as a parameter and returns true if the cpu is preempted. Then
the kernel can break the spin loops based on the retval of vcpu_is_preempted.

As the kernel already uses this interface, let's support it.

We use one field of struct kvm_steal_time to indicate whether one vcpu
is running or not.

unix benchmark result:
host:  kernel 4.8.1, i5-4570, 4 cpus
guest: kernel 4.8.1, 8 vcpus

test-case   after-patch   before-patch
Execl Throughput   |18307.9 lps  |11701.6 lps 
File Copy 1024 bufsize 2000 maxblocks  |  1352407.3 KBps |   790418.9 KBps
File Copy 256 bufsize 500 maxblocks|   367555.6 KBps |   222867.7 KBps
File Copy 4096 bufsize 8000 maxblocks  |  3675649.7 KBps |  1780614.4 KBps
Pipe Throughput| 11872208.7 lps  | 11855628.9 lps 
Pipe-based Context Switching   |  1495126.5 lps  |  1490533.9 lps 
Process Creation   |29881.2 lps  |28572.8 lps 
Shell Scripts (1 concurrent)   |23224.3 lpm  |22607.4 lpm 
Shell Scripts (8 concurrent)   | 3531.4 lpm  | 3211.9 lpm 
System Call Overhead   | 10385653.0 lps  | 10419979.0 lps 

Signed-off-by: Pan Xinhui 
---
 arch/x86/include/asm/paravirt_types.h |  6 ++
 arch/x86/include/asm/spinlock.h   |  8 
 arch/x86/include/uapi/asm/kvm_para.h  |  3 ++-
 arch/x86/kernel/kvm.c | 11 +++
 arch/x86/kernel/paravirt.c| 11 +++
 arch/x86/kvm/x86.c| 12 
 6 files changed, 50 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/paravirt_types.h 
b/arch/x86/include/asm/paravirt_types.h
index 0f400c0..b1c7937 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -98,6 +98,10 @@ struct pv_time_ops {
unsigned long long (*steal_clock)(int cpu);
 };
 
+struct pv_vcpu_ops {
+   bool (*vcpu_is_preempted)(int cpu);
+};
+
 struct pv_cpu_ops {
/* hooks for various privileged instructions */
unsigned long (*get_debugreg)(int regno);
@@ -318,6 +322,7 @@ struct pv_lock_ops {
 struct paravirt_patch_template {
struct pv_init_ops pv_init_ops;
struct pv_time_ops pv_time_ops;
+   struct pv_vcpu_ops pv_vcpu_ops;
struct pv_cpu_ops pv_cpu_ops;
struct pv_irq_ops pv_irq_ops;
struct pv_mmu_ops pv_mmu_ops;
@@ -327,6 +332,7 @@ struct paravirt_patch_template {
 extern struct pv_info pv_info;
 extern struct pv_init_ops pv_init_ops;
 extern struct pv_time_ops pv_time_ops;
+extern struct pv_vcpu_ops pv_vcpu_ops;
 extern struct pv_cpu_ops pv_cpu_ops;
 extern struct pv_irq_ops pv_irq_ops;
 extern struct pv_mmu_ops pv_mmu_ops;
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index 921bea7..52fd942 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -26,6 +26,14 @@
 extern struct static_key paravirt_ticketlocks_enabled;
 static __always_inline bool static_key_false(struct static_key *key);
 
+#ifdef CONFIG_PARAVIRT
+#define vcpu_is_preempted vcpu_is_preempted
+static inline bool vcpu_is_preempted(int cpu)
+{
+   return pv_vcpu_ops.vcpu_is_preempted(cpu);
+}
+#endif
+
 #include 
 
 /*
diff --git a/arch/x86/include/uapi/asm/kvm_para.h 
b/arch/x86/include/uapi/asm/kvm_para.h
index 94dc8ca..e9c12a1 100644
--- a/arch/x86/include/uapi/asm/kvm_para.h
+++ b/arch/x86/include/uapi/asm/kvm_para.h
@@ -45,7 +45,8 @@ struct kvm_steal_time {
__u64 steal;
__u32 version;
__u32 flags;
-   __u32 pad[12];
+   __u32 preempted;
+   __u32 pad[11];
 };
 
 #define KVM_STEAL_ALIGNMENT_BITS 5
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index edbbfc8..0011bef 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -415,6 +415,15 @@ void kvm_disable_steal_time(void)
wrmsr(MSR_KVM_STEAL_TIME, 0, 0);
 }
 
+static bool kvm_vcpu_is_preempted(int cpu)
+{
+   struct kvm_steal_time *src;
+
+   src = &per_cpu(steal_time, cpu);
+
+   return !!src->preempted;
+}
+
 #ifdef CONFIG_SMP
 static void __init kvm_smp_prepare_boot_cpu(void)
 {
@@ -488,6 +497,8 @@ void __init kvm_guest_init(void)
kvm_guest_cpu_init();
 #endif
 
+   pv_vcpu_ops.vcpu_is_preempted = kvm_vcpu_is_preempted;
+
/*
 * Hard lockup detection is enabled by default. Disable it, as guests
 * can get false positives too easily, for example if the host is
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index bbf3d59..7adb7e9 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -122,6 +122,7 @@ static void *get_call_destination(u8 type)
struct paravirt_patch_templa

Re: [PATCH v4 0/5] implement vcpu preempted check

2016-10-19 Thread Pan Xinhui



On 2016/10/19 14:47, Christian Borntraeger wrote:

On 10/19/2016 12:20 PM, Pan Xinhui wrote:

change from v3:
add x86 vcpu preempted check patch


If you want you could add the s390 patch that I provided for your last version.
I also gave my Acked-by for all previous patches.


hi, Christian
Thanks a lot!
I can include your new s390 patch in my next patchset (if v5 is needed).

xinhui





change from v2:
no code change, fix typos, update some comments
change from v1:
a simplier definition of default vcpu_is_preempted
skip mahcine type check on ppc, and add config. remove dedicated macro.
add one patch to drop overload of rwsem_spin_on_owner and 
mutex_spin_on_owner.
add more comments
thanks boqun and Peter's suggestion.

This patch set aims to fix lock holder preemption issues.

test-case:
perf record -a perf bench sched messaging -g 400 -p && perf report

18.09%  sched-messaging  [kernel.vmlinux]  [k] osq_lock
12.28%  sched-messaging  [kernel.vmlinux]  [k] rwsem_spin_on_owner
 5.27%  sched-messaging  [kernel.vmlinux]  [k] mutex_unlock
 3.89%  sched-messaging  [kernel.vmlinux]  [k] wait_consider_task
 3.64%  sched-messaging  [kernel.vmlinux]  [k] _raw_write_lock_irq
 3.41%  sched-messaging  [kernel.vmlinux]  [k] mutex_spin_on_owner.is
 2.49%  sched-messaging  [kernel.vmlinux]  [k] system_call

We introduce interface bool vcpu_is_preempted(int cpu) and use it in some spin
loops of osq_lock, rwsem_spin_on_owner and mutex_spin_on_owner.
These spin_on_onwer variant also cause rcu stall before we apply this patch set

We also have observed some performace improvements.

PPC test result:

1 copy - 0.94%
2 copy - 7.17%
4 copy - 11.9%
8 copy -  3.04%
16 copy - 15.11%

details below:
Without patch:

1 copy - File Write 4096 bufsize 8000 maxblocks  2188223.0 KBps  (30.0 s, 1 
samples)
2 copy - File Write 4096 bufsize 8000 maxblocks  1804433.0 KBps  (30.0 s, 1 
samples)
4 copy - File Write 4096 bufsize 8000 maxblocks  1237257.0 KBps  (30.0 s, 1 
samples)
8 copy - File Write 4096 bufsize 8000 maxblocks  1032658.0 KBps  (30.0 s, 1 
samples)
16 copy - File Write 4096 bufsize 8000 maxblocks   768000.0 KBps  (30.1 s, 
1 samples)

With patch:

1 copy - File Write 4096 bufsize 8000 maxblocks  2209189.0 KBps  (30.0 s, 1 
samples)
2 copy - File Write 4096 bufsize 8000 maxblocks  1943816.0 KBps  (30.0 s, 1 
samples)
4 copy - File Write 4096 bufsize 8000 maxblocks  1405591.0 KBps  (30.0 s, 1 
samples)
8 copy - File Write 4096 bufsize 8000 maxblocks  1065080.0 KBps  (30.0 s, 1 
samples)
16 copy - File Write 4096 bufsize 8000 maxblocks   904762.0 KBps  (30.0 s, 
1 samples)

X86 test result:
test-case   after-patch   before-patch
Execl Throughput   |18307.9 lps  |11701.6 lps
File Copy 1024 bufsize 2000 maxblocks  |  1352407.3 KBps |   790418.9 KBps
File Copy 256 bufsize 500 maxblocks|   367555.6 KBps |   222867.7 KBps
File Copy 4096 bufsize 8000 maxblocks  |  3675649.7 KBps |  1780614.4 KBps
Pipe Throughput| 11872208.7 lps  | 11855628.9 lps
Pipe-based Context Switching   |  1495126.5 lps  |  1490533.9 lps
Process Creation   |29881.2 lps  |28572.8 lps
Shell Scripts (1 concurrent)   |23224.3 lpm  |22607.4 lpm
Shell Scripts (8 concurrent)   | 3531.4 lpm  | 3211.9 lpm
System Call Overhead   | 10385653.0 lps  | 10419979.0 lps

Pan Xinhui (5):
  kernel/sched: introduce vcpu preempted check interface
  locking/osq: Drop the overload of osq_lock()
  kernel/locking: Drop the overload of {mutex,rwsem}_spin_on_owner
  powerpc/spinlock: support vcpu preempted check
  x86, kvm: support vcpu preempted check

 arch/powerpc/include/asm/spinlock.h   |  8 
 arch/x86/include/asm/paravirt_types.h |  6 ++
 arch/x86/include/asm/spinlock.h   |  8 
 arch/x86/include/uapi/asm/kvm_para.h  |  3 ++-
 arch/x86/kernel/kvm.c | 11 +++
 arch/x86/kernel/paravirt.c| 11 +++
 arch/x86/kvm/x86.c| 12 
 include/linux/sched.h | 12 
 kernel/locking/mutex.c| 15 +--
 kernel/locking/osq_lock.c | 10 +-
 kernel/locking/rwsem-xadd.c   | 16 +---
 11 files changed, 105 insertions(+), 7 deletions(-)







Re: [PATCH v4 0/5] implement vcpu preempted check

2016-10-19 Thread Pan Xinhui


On 2016/10/19 23:58, Juergen Gross wrote:

On 19/10/16 12:20, Pan Xinhui wrote:

change from v3:
add x86 vcpu preempted check patch
change from v2:
no code change, fix typos, update some comments
change from v1:
a simplier definition of default vcpu_is_preempted
skip mahcine type check on ppc, and add config. remove dedicated macro.
add one patch to drop overload of rwsem_spin_on_owner and 
mutex_spin_on_owner.
add more comments
thanks boqun and Peter's suggestion.

This patch set aims to fix lock holder preemption issues.

test-case:
perf record -a perf bench sched messaging -g 400 -p && perf report

18.09%  sched-messaging  [kernel.vmlinux]  [k] osq_lock
12.28%  sched-messaging  [kernel.vmlinux]  [k] rwsem_spin_on_owner
 5.27%  sched-messaging  [kernel.vmlinux]  [k] mutex_unlock
 3.89%  sched-messaging  [kernel.vmlinux]  [k] wait_consider_task
 3.64%  sched-messaging  [kernel.vmlinux]  [k] _raw_write_lock_irq
 3.41%  sched-messaging  [kernel.vmlinux]  [k] mutex_spin_on_owner.is
 2.49%  sched-messaging  [kernel.vmlinux]  [k] system_call

We introduce the interface bool vcpu_is_preempted(int cpu) and use it in some spin
loops of osq_lock, rwsem_spin_on_owner and mutex_spin_on_owner.
These spin_on_owner variants also cause RCU stalls before we apply this patch set.

We have also observed some performance improvements.

PPC test result:

1 copy - 0.94%
2 copy - 7.17%
4 copy - 11.9%
8 copy -  3.04%
16 copy - 15.11%

details below:
Without patch:

1 copy - File Write 4096 bufsize 8000 maxblocks  2188223.0 KBps  (30.0 s, 1 
samples)
2 copy - File Write 4096 bufsize 8000 maxblocks  1804433.0 KBps  (30.0 s, 1 
samples)
4 copy - File Write 4096 bufsize 8000 maxblocks  1237257.0 KBps  (30.0 s, 1 
samples)
8 copy - File Write 4096 bufsize 8000 maxblocks  1032658.0 KBps  (30.0 s, 1 
samples)
16 copy - File Write 4096 bufsize 8000 maxblocks   768000.0 KBps  (30.1 s, 
1 samples)

With patch:

1 copy - File Write 4096 bufsize 8000 maxblocks  2209189.0 KBps  (30.0 s, 1 
samples)
2 copy - File Write 4096 bufsize 8000 maxblocks  1943816.0 KBps  (30.0 s, 1 
samples)
4 copy - File Write 4096 bufsize 8000 maxblocks  1405591.0 KBps  (30.0 s, 1 
samples)
8 copy - File Write 4096 bufsize 8000 maxblocks  1065080.0 KBps  (30.0 s, 1 
samples)
16 copy - File Write 4096 bufsize 8000 maxblocks   904762.0 KBps  (30.0 s, 
1 samples)

X86 test result:
test-case   after-patch   before-patch
Execl Throughput   |18307.9 lps  |11701.6 lps
File Copy 1024 bufsize 2000 maxblocks  |  1352407.3 KBps |   790418.9 KBps
File Copy 256 bufsize 500 maxblocks|   367555.6 KBps |   222867.7 KBps
File Copy 4096 bufsize 8000 maxblocks  |  3675649.7 KBps |  1780614.4 KBps
Pipe Throughput| 11872208.7 lps  | 11855628.9 lps
Pipe-based Context Switching   |  1495126.5 lps  |  1490533.9 lps
Process Creation   |29881.2 lps  |28572.8 lps
Shell Scripts (1 concurrent)   |23224.3 lpm  |22607.4 lpm
Shell Scripts (8 concurrent)   | 3531.4 lpm  | 3211.9 lpm
System Call Overhead   | 10385653.0 lps  | 10419979.0 lps

Pan Xinhui (5):
  kernel/sched: introduce vcpu preempted check interface
  locking/osq: Drop the overload of osq_lock()
  kernel/locking: Drop the overload of {mutex,rwsem}_spin_on_owner
  powerpc/spinlock: support vcpu preempted check
  x86, kvm: support vcpu preempted check


The attached patch adds Xen support for x86. Please tell me whether you
want to add this patch to your series or if I should post it when your
series has been accepted.


hi, Juergen
Your patch is pretty small and nice :) thanks!
I can include your patch in my next patchset after this patchset is reviewed. :)



You can add my

Tested-by: Juergen Gross 

for patches 1-3 and 5 (paravirt parts only).



Thanks a lot!

xinhui



Juergen



 arch/powerpc/include/asm/spinlock.h   |  8 
 arch/x86/include/asm/paravirt_types.h |  6 ++
 arch/x86/include/asm/spinlock.h   |  8 
 arch/x86/include/uapi/asm/kvm_para.h  |  3 ++-
 arch/x86/kernel/kvm.c | 11 +++
 arch/x86/kernel/paravirt.c| 11 +++
 arch/x86/kvm/x86.c| 12 
 include/linux/sched.h | 12 
 kernel/locking/mutex.c| 15 +--
 kernel/locking/osq_lock.c | 10 +-
 kernel/locking/rwsem-xadd.c   | 16 +---
 11 files changed, 105 insertions(+), 7 deletions(-)







Re: [PATCH v4 5/5] x86, kvm: support vcpu preempted check

2016-10-19 Thread Pan Xinhui


On 2016/10/20 01:24, Radim Krčmář wrote:

2016-10-19 06:20-0400, Pan Xinhui:

This is to fix some lock holder preemption issues. Some other lock
implementations do a spin loop before acquiring the lock itself.
Currently the kernel has an interface of bool vcpu_is_preempted(int cpu). It
takes the cpu as a parameter and returns true if the cpu is preempted.  Then
the kernel can break the spin loops based on the retval of vcpu_is_preempted.

As the kernel already uses this interface, let's support it.

We use one field of struct kvm_steal_time to indicate whether one vcpu
is running or not.

unix benchmark result:
host:  kernel 4.8.1, i5-4570, 4 cpus
guest: kernel 4.8.1, 8 vcpus

test-case   after-patch   before-patch
Execl Throughput   |18307.9 lps  |11701.6 lps
File Copy 1024 bufsize 2000 maxblocks  |  1352407.3 KBps |   790418.9 KBps
File Copy 256 bufsize 500 maxblocks|   367555.6 KBps |   222867.7 KBps
File Copy 4096 bufsize 8000 maxblocks  |  3675649.7 KBps |  1780614.4 KBps
Pipe Throughput| 11872208.7 lps  | 11855628.9 lps
Pipe-based Context Switching   |  1495126.5 lps  |  1490533.9 lps
Process Creation   |29881.2 lps  |28572.8 lps
Shell Scripts (1 concurrent)   |23224.3 lpm  |22607.4 lpm
Shell Scripts (8 concurrent)   | 3531.4 lpm  | 3211.9 lpm
System Call Overhead   | 10385653.0 lps  | 10419979.0 lps

Signed-off-by: Pan Xinhui 
---
diff --git a/arch/x86/include/asm/paravirt_types.h 
b/arch/x86/include/asm/paravirt_types.h
@@ -98,6 +98,10 @@ struct pv_time_ops {
unsigned long long (*steal_clock)(int cpu);
 };

+struct pv_vcpu_ops {
+   bool (*vcpu_is_preempted)(int cpu);
+};
+


(I would put it into pv_lock_ops to save the plumbing.)


hi, Radim
thanks for your reply.

yes, a new struct leads the patch into unnecessary line changes.
I did that just because I was not sure which existing xxx_ops I should place
vcpu_is_preempted in.


diff --git a/arch/x86/include/uapi/asm/kvm_para.h 
b/arch/x86/include/uapi/asm/kvm_para.h
@@ -45,7 +45,8 @@ struct kvm_steal_time {
__u64 steal;
__u32 version;
__u32 flags;
-   __u32 pad[12];
+   __u32 preempted;


Why __u32 instead of __u8?


I thought it was 32-bit aligned...
yes, u8 is good enough to store the preempted status.


+   __u32 pad[11];
 };
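
As a quick size check (illustrative only, the struct name below is made up):
trading one __u32 pad slot for the __u8 keeps kvm_steal_time at 64 bytes,
since the compiler pads the __u8 up to the next 4-byte boundary before pad[].

	struct kvm_steal_time_example {	/* hypothetical, for the arithmetic only */
		__u64 steal;		/*  8 bytes */
		__u32 version;		/*  4 bytes */
		__u32 flags;		/*  4 bytes */
		__u8  preempted;	/*  1 byte + 3 bytes implicit padding */
		__u32 pad[11];		/* 44 bytes -> 64 bytes total, as before */
	};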


Please document the change in Documentation/virtual/kvm/msr.txt, section
MSR_KVM_STEAL_TIME.


okay, I totally forgot to do that. thanks!


diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
@@ -415,6 +415,15 @@ void kvm_disable_steal_time(void)
+static bool kvm_vcpu_is_preempted(int cpu)
+{
+   struct kvm_steal_time *src;
+
+   src = &per_cpu(steal_time, cpu);
+
+   return !!src->preempted;
+}
+
 #ifdef CONFIG_SMP
 static void __init kvm_smp_prepare_boot_cpu(void)
 {
@@ -488,6 +497,8 @@ void __init kvm_guest_init(void)
kvm_guest_cpu_init();
 #endif

+   pv_vcpu_ops.vcpu_is_preempted = kvm_vcpu_is_preempted;


Would be nicer to assign conditionally in the KVM_FEATURE_STEAL_TIME
block.  The steal_time structure has to be zeroed, so this code would
work, but the native function (return false) is better if we know that
the kvm_vcpu_is_preempted() would always return false anyway.


yes, agreed. Will do that.

I once thought we could patch the code at runtime,
replacing the binary code
"call 0x #pv_vcpu_ops.vcpu_is_preempted"
with
"xor eax, eax"
However it is not worth doing that; the performance improvement might be very
small.
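
(A rough sketch, not part of this series, of what such patching could look
like with the existing pv patching machinery, assuming the hook ends up in
pv_lock_ops as suggested above:

	/* arch/x86/kernel/paravirt_patch_64.c -- hypothetical addition */
	DEF_NATIVE(pv_lock_ops, vcpu_is_preempted, "xor %rax, %rax");

plus a matching PATCH_SITE(pv_lock_ops, vcpu_is_preempted) in native_patch(),
so native kernels get the short inline sequence instead of an indirect call.)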


Old KVMs won't have the feature, so we could also assign only when KVM
reports it, but that requires extra definitions and the performance gain
is fairly small, so I'm ok with this.


diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
@@ -2057,6 +2057,8 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
&vcpu->arch.st.steal, sizeof(struct kvm_steal_time
return;

+   vcpu->arch.st.steal.preempted = 0;
+
if (vcpu->arch.st.steal.version & 1)
vcpu->arch.st.steal.version += 1;  /* first time write, random 
junk */

@@ -2812,6 +2814,16 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)

 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 {
+   if (vcpu->arch.st.msr_val & KVM_MSR_ENABLED)
+   if (kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
+   &vcpu->arch.st.steal,
+   sizeof(struct kvm_steal_time)) == 0) {
+   vcpu->arch.st.steal.preempted = 1;
+   kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
+   &vcpu->arch.st.steal,
+   sizeof(struct kvm_steal_time));
+   }


Please name this block of code.  Something like
kvm_steal_time_set_preempted(vcpu);

[PATCH v5 0/9] implement vcpu preempted check

2016-10-20 Thread Pan Xinhui
change from v4:
split x86 kvm vcpu preempted check into two patches.
add documentation patch.
add x86 vcpu preempted check patch under xen
add s390 vcpu preempted check patch 
change from v3:
add x86 vcpu preempted check patch
change from v2:
no code change, fix typos, update some comments
change from v1:
a simpler definition of the default vcpu_is_preempted
skip machine type check on ppc, and add config. remove dedicated macro.
add one patch to drop overload of rwsem_spin_on_owner and 
mutex_spin_on_owner. 
add more comments
thanks to Boqun's and Peter's suggestions.

This patch set aims to fix lock holder preemption issues.

test-case:
perf record -a perf bench sched messaging -g 400 -p && perf report

18.09%  sched-messaging  [kernel.vmlinux]  [k] osq_lock
12.28%  sched-messaging  [kernel.vmlinux]  [k] rwsem_spin_on_owner
 5.27%  sched-messaging  [kernel.vmlinux]  [k] mutex_unlock
 3.89%  sched-messaging  [kernel.vmlinux]  [k] wait_consider_task
 3.64%  sched-messaging  [kernel.vmlinux]  [k] _raw_write_lock_irq
 3.41%  sched-messaging  [kernel.vmlinux]  [k] mutex_spin_on_owner.is
 2.49%  sched-messaging  [kernel.vmlinux]  [k] system_call

We introduce the interface bool vcpu_is_preempted(int cpu) and use it in some spin
loops of osq_lock, rwsem_spin_on_owner and mutex_spin_on_owner.
These spin_on_owner variants also cause RCU stalls before we apply this patch set.

We have also observed some performance improvements in unix benchmark tests.

PPC test result:
1 copy - 0.94%
2 copy - 7.17%
4 copy - 11.9%
8 copy -  3.04%
16 copy - 15.11%

details below:
Without patch:

1 copy - File Write 4096 bufsize 8000 maxblocks  2188223.0 KBps  (30.0 s, 1 
samples)
2 copy - File Write 4096 bufsize 8000 maxblocks  1804433.0 KBps  (30.0 s, 1 
samples)
4 copy - File Write 4096 bufsize 8000 maxblocks  1237257.0 KBps  (30.0 s, 1 
samples)
8 copy - File Write 4096 bufsize 8000 maxblocks  1032658.0 KBps  (30.0 s, 1 
samples)
16 copy - File Write 4096 bufsize 8000 maxblocks   768000.0 KBps  (30.1 s, 
1 samples)

With patch: 

1 copy - File Write 4096 bufsize 8000 maxblocks  2209189.0 KBps  (30.0 s, 1 
samples)
2 copy - File Write 4096 bufsize 8000 maxblocks  1943816.0 KBps  (30.0 s, 1 
samples)
4 copy - File Write 4096 bufsize 8000 maxblocks  1405591.0 KBps  (30.0 s, 1 
samples)
8 copy - File Write 4096 bufsize 8000 maxblocks  1065080.0 KBps  (30.0 s, 1 
samples)
16 copy - File Write 4096 bufsize 8000 maxblocks   904762.0 KBps  (30.0 s, 
1 samples)

X86 test result:
test-case   after-patch   before-patch
Execl Throughput   |18307.9 lps  |11701.6 lps 
File Copy 1024 bufsize 2000 maxblocks  |  1352407.3 KBps |   790418.9 KBps
File Copy 256 bufsize 500 maxblocks|   367555.6 KBps |   222867.7 KBps
File Copy 4096 bufsize 8000 maxblocks  |  3675649.7 KBps |  1780614.4 KBps
Pipe Throughput| 11872208.7 lps  | 11855628.9 lps 
Pipe-based Context Switching   |  1495126.5 lps  |  1490533.9 lps 
Process Creation   |29881.2 lps  |28572.8 lps 
Shell Scripts (1 concurrent)   |23224.3 lpm  |22607.4 lpm 
Shell Scripts (8 concurrent)   | 3531.4 lpm  | 3211.9 lpm 
System Call Overhead   | 10385653.0 lps  | 10419979.0 lps 

Christian Borntraeger (1):
  s390/spinlock: Provide vcpu_is_preempted

Juergen Gross (1):
  x86, xen: support vcpu preempted check

Pan Xinhui (7):
  kernel/sched: introduce vcpu preempted check interface
  locking/osq: Drop the overload of osq_lock()
  kernel/locking: Drop the overload of {mutex,rwsem}_spin_on_owner
  powerpc/spinlock: support vcpu preempted check
  x86, paravirt: Add interface to support kvm/xen vcpu preempted check
  x86, kvm: support vcpu preempted check
  Documentation: virtual: kvm: Support vcpu preempted check

 Documentation/virtual/kvm/msr.txt |  8 +++-
 arch/powerpc/include/asm/spinlock.h   |  8 
 arch/s390/include/asm/spinlock.h  |  8 
 arch/s390/kernel/smp.c|  9 +++--
 arch/s390/lib/spinlock.c  | 25 -
 arch/x86/include/asm/paravirt_types.h |  2 ++
 arch/x86/include/asm/spinlock.h   |  8 
 arch/x86/include/uapi/asm/kvm_para.h  |  3 ++-
 arch/x86/kernel/kvm.c | 12 
 arch/x86/kernel/paravirt-spinlocks.c  |  6 ++
 arch/x86/kvm/x86.c| 18 ++
 arch/x86/xen/spinlock.c   |  3 ++-
 include/linux/sched.h | 12 
 kernel/locking/mutex.c| 15 +--
 kernel/locking/osq_lock.c | 10 +-
 kernel/locking/rwsem-xadd.c   | 16 +---
 16 files changed, 135 insertions(+), 28 deletions(-)

-- 
2.4.11



[PATCH v5 1/9] kernel/sched: introduce vcpu preempted check interface

2016-10-20 Thread Pan Xinhui
This patch supports fixing the lock holder preemption issue.

For kernel users, we could use bool vcpu_is_preempted(int cpu) to detect if
one vcpu is preempted or not.

The default implementation is a macro defined as false. So the compiler can
optimize it out if the arch does not support such a vcpu preempted check.

Suggested-by: Peter Zijlstra (Intel) 
Signed-off-by: Pan Xinhui 
Acked-by: Christian Borntraeger 
Tested-by: Juergen Gross 
---
 include/linux/sched.h | 12 
 1 file changed, 12 insertions(+)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 348f51b..44c1ce7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -3506,6 +3506,18 @@ static inline void set_task_cpu(struct task_struct *p, 
unsigned int cpu)
 
 #endif /* CONFIG_SMP */
 
+/*
+ * In order to deal with various lock holder preemption issues provide an
+ * interface to see if a vCPU is currently running or not.
+ *
+ * This allows us to terminate optimistic spin loops and block, analogous to
+ * the native optimistic spin heuristic of testing if the lock owner task is
+ * running or not.
+ */
+#ifndef vcpu_is_preempted
+#define vcpu_is_preempted(cpu) false
+#endif
+
 extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
 extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
 
-- 
2.4.11



[PATCH v5 2/9] locking/osq: Drop the overload of osq_lock()

2016-10-20 Thread Pan Xinhui
An over-committed guest with more vCPUs than pCPUs has heavy overhead in
osq_lock().

This is because vCPU A holds the osq lock and yields out, while vCPU B waits
for the per_cpu node->locked to be set. IOW, vCPU B waits for vCPU A to run
and unlock the osq lock.

The kernel has an interface bool vcpu_is_preempted(int cpu) to see if a vCPU
is currently running or not. So break the spin loops on a true condition.

test case:
perf record -a perf bench sched messaging -g 400 -p && perf report

before patch:
18.09%  sched-messaging  [kernel.vmlinux]  [k] osq_lock
12.28%  sched-messaging  [kernel.vmlinux]  [k] rwsem_spin_on_owner
 5.27%  sched-messaging  [kernel.vmlinux]  [k] mutex_unlock
 3.89%  sched-messaging  [kernel.vmlinux]  [k] wait_consider_task
 3.64%  sched-messaging  [kernel.vmlinux]  [k] _raw_write_lock_irq
 3.41%  sched-messaging  [kernel.vmlinux]  [k] mutex_spin_on_owner.is
 2.49%  sched-messaging  [kernel.vmlinux]  [k] system_call

after patch:
20.68%  sched-messaging  [kernel.vmlinux]  [k] mutex_spin_on_owner
 8.45%  sched-messaging  [kernel.vmlinux]  [k] mutex_unlock
 4.12%  sched-messaging  [kernel.vmlinux]  [k] system_call
 3.01%  sched-messaging  [kernel.vmlinux]  [k] system_call_common
 2.83%  sched-messaging  [kernel.vmlinux]  [k] copypage_power7
 2.64%  sched-messaging  [kernel.vmlinux]  [k] rwsem_spin_on_owner
 2.00%  sched-messaging  [kernel.vmlinux]  [k] osq_lock

Suggested-by: Boqun Feng 
Signed-off-by: Pan Xinhui 
Acked-by: Christian Borntraeger 
Tested-by: Juergen Gross 
---
 kernel/locking/osq_lock.c | 10 +-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c
index 05a3785..39d1385 100644
--- a/kernel/locking/osq_lock.c
+++ b/kernel/locking/osq_lock.c
@@ -21,6 +21,11 @@ static inline int encode_cpu(int cpu_nr)
return cpu_nr + 1;
 }
 
+static inline int node_cpu(struct optimistic_spin_node *node)
+{
+   return node->cpu - 1;
+}
+
 static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val)
 {
int cpu_nr = encoded_cpu_val - 1;
@@ -118,8 +123,11 @@ bool osq_lock(struct optimistic_spin_queue *lock)
while (!READ_ONCE(node->locked)) {
/*
 * If we need to reschedule bail... so we can block.
+* Use vcpu_is_preempted to detect lock holder preemption issue
+* and break. vcpu_is_preempted is a macro defined as false if
+* arch does not support vcpu preempted check.
 */
-   if (need_resched())
+   if (need_resched() || vcpu_is_preempted(node_cpu(node->prev)))
goto unqueue;
 
cpu_relax_lowlatency();
-- 
2.4.11



[PATCH v5 3/9] kernel/locking: Drop the overload of {mutex, rwsem}_spin_on_owner

2016-10-20 Thread Pan Xinhui
An over-committed guest with more vCPUs than pCPUs has heavy overhead in
the two spin_on_owner functions. This is due to the lock holder preemption
issue.

The kernel has an interface bool vcpu_is_preempted(int cpu) to see if a vCPU
is currently running or not. So break the spin loops on a true condition.

test-case:
perf record -a perf bench sched messaging -g 400 -p && perf report

before patch:
20.68%  sched-messaging  [kernel.vmlinux]  [k] mutex_spin_on_owner
 8.45%  sched-messaging  [kernel.vmlinux]  [k] mutex_unlock
 4.12%  sched-messaging  [kernel.vmlinux]  [k] system_call
 3.01%  sched-messaging  [kernel.vmlinux]  [k] system_call_common
 2.83%  sched-messaging  [kernel.vmlinux]  [k] copypage_power7
 2.64%  sched-messaging  [kernel.vmlinux]  [k] rwsem_spin_on_owner
 2.00%  sched-messaging  [kernel.vmlinux]  [k] osq_lock

after patch:
 9.99%  sched-messaging  [kernel.vmlinux]  [k] mutex_unlock
 5.28%  sched-messaging  [unknown] [H] 0xc00768e0
 4.27%  sched-messaging  [kernel.vmlinux]  [k] __copy_tofrom_user_power7
 3.77%  sched-messaging  [kernel.vmlinux]  [k] copypage_power7
 3.24%  sched-messaging  [kernel.vmlinux]  [k] _raw_write_lock_irq
 3.02%  sched-messaging  [kernel.vmlinux]  [k] system_call
 2.69%  sched-messaging  [kernel.vmlinux]  [k] wait_consider_task

Signed-off-by: Pan Xinhui 
Acked-by: Christian Borntraeger 
Tested-by: Juergen Gross 
---
 kernel/locking/mutex.c  | 15 +--
 kernel/locking/rwsem-xadd.c | 16 +---
 2 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index a70b90d..82108f5 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -236,7 +236,13 @@ bool mutex_spin_on_owner(struct mutex *lock, struct 
task_struct *owner)
 */
barrier();
 
-   if (!owner->on_cpu || need_resched()) {
+   /*
+* Use vcpu_is_preempted to detect lock holder preemption issue
+* and break. vcpu_is_preempted is a macro defined as false if
+* arch does not support vcpu preempted check.
+*/
+   if (!owner->on_cpu || need_resched() ||
+   vcpu_is_preempted(task_cpu(owner))) {
ret = false;
break;
}
@@ -261,8 +267,13 @@ static inline int mutex_can_spin_on_owner(struct mutex 
*lock)
 
rcu_read_lock();
owner = READ_ONCE(lock->owner);
+
+   /*
+* Because of the lock holder preemption issue, we also skip spinning if the
+* task is not on cpu or its cpu is preempted
+*/
if (owner)
-   retval = owner->on_cpu;
+   retval = owner->on_cpu && !vcpu_is_preempted(task_cpu(owner));
rcu_read_unlock();
/*
 * if lock->owner is not set, the mutex owner may have just acquired
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 2337b4b..0897179 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -336,7 +336,11 @@ static inline bool rwsem_can_spin_on_owner(struct 
rw_semaphore *sem)
goto done;
}
 
-   ret = owner->on_cpu;
+   /*
+* Because of the lock holder preemption issue, we also skip spinning if the
+* task is not on cpu or its cpu is preempted
+*/
+   ret = owner->on_cpu && !vcpu_is_preempted(task_cpu(owner));
 done:
rcu_read_unlock();
return ret;
@@ -362,8 +366,14 @@ static noinline bool rwsem_spin_on_owner(struct 
rw_semaphore *sem)
 */
barrier();
 
-   /* abort spinning when need_resched or owner is not running */
-   if (!owner->on_cpu || need_resched()) {
+   /*
+* abort spinning when need_resched or owner is not running or
+* owner's cpu is preempted. vcpu_is_preempted is a macro
+* defined by false if arch does not support vcpu preempted
+* check
+*/
+   if (!owner->on_cpu || need_resched() ||
+   vcpu_is_preempted(task_cpu(owner))) {
rcu_read_unlock();
return false;
}
-- 
2.4.11



[PATCH v5 4/9] powerpc/spinlock: support vcpu preempted check

2016-10-20 Thread Pan Xinhui
This is to fix some lock holder preemption issues. Some other lock
implementations do a spin loop before acquiring the lock itself.
Currently the kernel has an interface of bool vcpu_is_preempted(int cpu). It
takes the cpu as a parameter and returns true if the cpu is preempted. Then
the kernel can break the spin loops based on the retval of vcpu_is_preempted.

As the kernel already uses this interface, let's support it.

Only pSeries needs to support it. The fact is that powerNV is built into the
same kernel image as pSeries, so we need to return false if we are running
as powerNV. Another fact is that lppaca->yield_count stays zero on powerNV,
and the check below treats an even yield_count (including zero) as not
preempted. So we can just skip the machine type check.

Suggested-by: Boqun Feng 
Suggested-by: Peter Zijlstra (Intel) 
Signed-off-by: Pan Xinhui 
---
 arch/powerpc/include/asm/spinlock.h | 8 
 1 file changed, 8 insertions(+)

diff --git a/arch/powerpc/include/asm/spinlock.h 
b/arch/powerpc/include/asm/spinlock.h
index abb6b0f..f4a9524 100644
--- a/arch/powerpc/include/asm/spinlock.h
+++ b/arch/powerpc/include/asm/spinlock.h
@@ -52,6 +52,14 @@
 #define SYNC_IO
 #endif
 
+#ifdef CONFIG_PPC_PSERIES
+#define vcpu_is_preempted vcpu_is_preempted
+static inline bool vcpu_is_preempted(int cpu)
+{
+   return !!(be32_to_cpu(lppaca_of(cpu).yield_count) & 1);
+}
+#endif
+
 #if defined(CONFIG_PPC_SPLPAR)
 /* We only yield to the hypervisor if we are in shared processor mode */
 #define SHARED_PROCESSOR (lppaca_shared_proc(local_paca->lppaca_ptr))
-- 
2.4.11



[PATCH v5 5/9] x86, paravirt: Add interface to support kvm/xen vcpu preempted check

2016-10-20 Thread Pan Xinhui
This is to fix some lock holder preemption issues. Some other lock
implementations do a spin loop before acquiring the lock itself.
Currently the kernel has an interface of bool vcpu_is_preempted(int cpu). It
takes the cpu as a parameter and returns true if the cpu is preempted.
Then the kernel can break the spin loops based on the retval of
vcpu_is_preempted.

As the kernel already uses this interface, let's support it.

To deal with kernel and kvm/xen, add vcpu_is_preempted into struct
pv_lock_ops.

Then kvm or xen could provide their own implementation to support
vcpu_is_preempted.

Signed-off-by: Pan Xinhui 
---
 arch/x86/include/asm/paravirt_types.h | 2 ++
 arch/x86/include/asm/spinlock.h   | 8 
 arch/x86/kernel/paravirt-spinlocks.c  | 6 ++
 3 files changed, 16 insertions(+)

diff --git a/arch/x86/include/asm/paravirt_types.h 
b/arch/x86/include/asm/paravirt_types.h
index 0f400c0..38c3bb7 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -310,6 +310,8 @@ struct pv_lock_ops {
 
void (*wait)(u8 *ptr, u8 val);
void (*kick)(int cpu);
+
+   bool (*vcpu_is_preempted)(int cpu);
 };
 
 /* This contains all the paravirt structures: we get a convenient
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index 921bea7..0526f59 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -26,6 +26,14 @@
 extern struct static_key paravirt_ticketlocks_enabled;
 static __always_inline bool static_key_false(struct static_key *key);
 
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+#define vcpu_is_preempted vcpu_is_preempted
+static inline bool vcpu_is_preempted(int cpu)
+{
+   return pv_lock_ops.vcpu_is_preempted(cpu);
+}
+#endif
+
 #include 
 
 /*
diff --git a/arch/x86/kernel/paravirt-spinlocks.c 
b/arch/x86/kernel/paravirt-spinlocks.c
index 2c55a00..2f204dd 100644
--- a/arch/x86/kernel/paravirt-spinlocks.c
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -21,12 +21,18 @@ bool pv_is_native_spin_unlock(void)
__raw_callee_save___native_queued_spin_unlock;
 }
 
+static bool native_vcpu_is_preempted(int cpu)
+{
+   return 0;
+}
+
 struct pv_lock_ops pv_lock_ops = {
 #ifdef CONFIG_SMP
.queued_spin_lock_slowpath = native_queued_spin_lock_slowpath,
.queued_spin_unlock = PV_CALLEE_SAVE(__native_queued_spin_unlock),
.wait = paravirt_nop,
.kick = paravirt_nop,
+   .vcpu_is_preempted = native_vcpu_is_preempted,
 #endif /* SMP */
 };
 EXPORT_SYMBOL(pv_lock_ops);
-- 
2.4.11



[PATCH v5 6/9] x86, kvm: support vcpu preempted check

2016-10-20 Thread Pan Xinhui
Support the vcpu_is_preempted() functionality under KVM. This will
enhance lock performance on overcommitted hosts (more runnable vcpus
than physical cpus in the system) as doing busy waits for preempted
vcpus will hurt system performance far worse than early yielding.

Use one field of struct kvm_steal_time to indicate whether one vcpu
is running or not.

unix benchmark result:
host:  kernel 4.8.1, i5-4570, 4 cpus
guest: kernel 4.8.1, 8 vcpus

test-case   after-patch   before-patch
Execl Throughput   |18307.9 lps  |11701.6 lps
File Copy 1024 bufsize 2000 maxblocks  |  1352407.3 KBps |   790418.9 KBps
File Copy 256 bufsize 500 maxblocks|   367555.6 KBps |   222867.7 KBps
File Copy 4096 bufsize 8000 maxblocks  |  3675649.7 KBps |  1780614.4 KBps
Pipe Throughput| 11872208.7 lps  | 11855628.9 lps
Pipe-based Context Switching   |  1495126.5 lps  |  1490533.9 lps
Process Creation   |29881.2 lps  |28572.8 lps
Shell Scripts (1 concurrent)   |23224.3 lpm  |22607.4 lpm
Shell Scripts (8 concurrent)   | 3531.4 lpm  | 3211.9 lpm
System Call Overhead   | 10385653.0 lps  | 10419979.0 lps

Signed-off-by: Pan Xinhui 
---
 arch/x86/include/uapi/asm/kvm_para.h |  3 ++-
 arch/x86/kernel/kvm.c| 12 
 arch/x86/kvm/x86.c   | 18 ++
 3 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/uapi/asm/kvm_para.h 
b/arch/x86/include/uapi/asm/kvm_para.h
index 94dc8ca..b3fec56 100644
--- a/arch/x86/include/uapi/asm/kvm_para.h
+++ b/arch/x86/include/uapi/asm/kvm_para.h
@@ -45,7 +45,8 @@ struct kvm_steal_time {
__u64 steal;
__u32 version;
__u32 flags;
-   __u32 pad[12];
+   __u8 preempted;
+   __u32 pad[11];
 };
 
 #define KVM_STEAL_ALIGNMENT_BITS 5
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index edbbfc8..0b48dd2 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -415,6 +415,15 @@ void kvm_disable_steal_time(void)
wrmsr(MSR_KVM_STEAL_TIME, 0, 0);
 }
 
+static bool kvm_vcpu_is_preempted(int cpu)
+{
+   struct kvm_steal_time *src;
+
+   src = &per_cpu(steal_time, cpu);
+
+   return !!src->preempted;
+}
+
 #ifdef CONFIG_SMP
 static void __init kvm_smp_prepare_boot_cpu(void)
 {
@@ -471,6 +480,9 @@ void __init kvm_guest_init(void)
if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
has_steal_clock = 1;
pv_time_ops.steal_clock = kvm_steal_clock;
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+   pv_lock_ops.vcpu_is_preempted = kvm_vcpu_is_preempted;
+#endif
}
 
if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6c633de..a627537 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2057,6 +2057,8 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
&vcpu->arch.st.steal, sizeof(struct kvm_steal_time
return;
 
+   vcpu->arch.st.steal.preempted = 0;
+
if (vcpu->arch.st.steal.version & 1)
vcpu->arch.st.steal.version += 1;  /* first time write, random 
junk */
 
@@ -2810,8 +2812,24 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
 }
 
+static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
+{
+   if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
+   return;
+
+   if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
+   &vcpu->arch.st.steal, sizeof(struct kvm_steal_time
+   return;
+
+   vcpu->arch.st.steal.preempted = 1;
+
+   kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
+   &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
+}
+
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 {
+   kvm_steal_time_set_preempted(vcpu);
kvm_x86_ops->vcpu_put(vcpu);
kvm_put_guest_fpu(vcpu);
vcpu->arch.last_host_tsc = rdtsc();
-- 
2.4.11



[PATCH v5 7/9] x86, xen: support vcpu preempted check

2016-10-20 Thread Pan Xinhui
From: Juergen Gross 

Support the vcpu_is_preempted() functionality under Xen. This will
enhance lock performance on overcommitted hosts (more runnable vcpus
than physical cpus in the system) as doing busy waits for preempted
vcpus will hurt system performance far worse than early yielding.

A quick test (4 vcpus on 1 physical cpu doing a parallel build job
with "make -j 8") reduced system time by about 5% with this patch.

Signed-off-by: Juergen Gross 
Signed-off-by: Pan Xinhui 
---
 arch/x86/xen/spinlock.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index 3d6e006..74756bb 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -114,7 +114,6 @@ void xen_uninit_lock_cpu(int cpu)
per_cpu(irq_name, cpu) = NULL;
 }
 
-
 /*
  * Our init of PV spinlocks is split in two init functions due to us
  * using paravirt patching and jump labels patching and having to do
@@ -137,6 +136,8 @@ void __init xen_init_spinlocks(void)
pv_lock_ops.queued_spin_unlock = 
PV_CALLEE_SAVE(__pv_queued_spin_unlock);
pv_lock_ops.wait = xen_qlock_wait;
pv_lock_ops.kick = xen_qlock_kick;
+
+   pv_lock_ops.vcpu_is_preempted = xen_vcpu_stolen;
 }
 
 /*
-- 
2.4.11



[PATCH v5 8/9] s390/spinlock: Provide vcpu_is_preempted

2016-10-20 Thread Pan Xinhui
From: Christian Borntraeger 

this implements the s390 backend for commit
"kernel/sched: introduce vcpu preempted check interface"
by reworking the existing smp_vcpu_scheduled into
arch_vcpu_is_preempted. We can then also get rid of the
local cpu_is_preempted function by moving the
CIF_ENABLED_WAIT test into arch_vcpu_is_preempted.

Signed-off-by: Christian Borntraeger 
Acked-by: Heiko Carstens 
---
 arch/s390/include/asm/spinlock.h |  8 
 arch/s390/kernel/smp.c   |  9 +++--
 arch/s390/lib/spinlock.c | 25 -
 3 files changed, 23 insertions(+), 19 deletions(-)

diff --git a/arch/s390/include/asm/spinlock.h b/arch/s390/include/asm/spinlock.h
index 7e9e09f..7ecd890 100644
--- a/arch/s390/include/asm/spinlock.h
+++ b/arch/s390/include/asm/spinlock.h
@@ -23,6 +23,14 @@ _raw_compare_and_swap(unsigned int *lock, unsigned int old, 
unsigned int new)
return __sync_bool_compare_and_swap(lock, old, new);
 }
 
+#ifndef CONFIG_SMP
+static inline bool arch_vcpu_is_preempted(int cpu) { return false; }
+#else
+bool arch_vcpu_is_preempted(int cpu);
+#endif
+
+#define vcpu_is_preempted arch_vcpu_is_preempted
+
 /*
  * Simple spin lock operations.  There are two variants, one clears IRQ's
  * on the local processor, one does not.
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index 35531fe..b988ed1 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -368,10 +368,15 @@ int smp_find_processor_id(u16 address)
return -1;
 }
 
-int smp_vcpu_scheduled(int cpu)
+bool arch_vcpu_is_preempted(int cpu)
 {
-   return pcpu_running(pcpu_devices + cpu);
+   if (test_cpu_flag_of(CIF_ENABLED_WAIT, cpu))
+   return false;
+   if (pcpu_running(pcpu_devices + cpu))
+   return false;
+   return true;
 }
+EXPORT_SYMBOL(arch_vcpu_is_preempted);
 
 void smp_yield_cpu(int cpu)
 {
diff --git a/arch/s390/lib/spinlock.c b/arch/s390/lib/spinlock.c
index e5f50a7..e48a48e 100644
--- a/arch/s390/lib/spinlock.c
+++ b/arch/s390/lib/spinlock.c
@@ -37,15 +37,6 @@ static inline void _raw_compare_and_delay(unsigned int 
*lock, unsigned int old)
asm(".insn rsy,0xeb22,%0,0,%1" : : "d" (old), "Q" (*lock));
 }
 
-static inline int cpu_is_preempted(int cpu)
-{
-   if (test_cpu_flag_of(CIF_ENABLED_WAIT, cpu))
-   return 0;
-   if (smp_vcpu_scheduled(cpu))
-   return 0;
-   return 1;
-}
-
 void arch_spin_lock_wait(arch_spinlock_t *lp)
 {
unsigned int cpu = SPINLOCK_LOCKVAL;
@@ -62,7 +53,7 @@ void arch_spin_lock_wait(arch_spinlock_t *lp)
continue;
}
/* First iteration: check if the lock owner is running. */
-   if (first_diag && cpu_is_preempted(~owner)) {
+   if (first_diag && arch_vcpu_is_preempted(~owner)) {
smp_yield_cpu(~owner);
first_diag = 0;
continue;
@@ -81,7 +72,7 @@ void arch_spin_lock_wait(arch_spinlock_t *lp)
 * yield the CPU unconditionally. For LPAR rely on the
 * sense running status.
 */
-   if (!MACHINE_IS_LPAR || cpu_is_preempted(~owner)) {
+   if (!MACHINE_IS_LPAR || arch_vcpu_is_preempted(~owner)) {
smp_yield_cpu(~owner);
first_diag = 0;
}
@@ -108,7 +99,7 @@ void arch_spin_lock_wait_flags(arch_spinlock_t *lp, unsigned 
long flags)
continue;
}
/* Check if the lock owner is running. */
-   if (first_diag && cpu_is_preempted(~owner)) {
+   if (first_diag && arch_vcpu_is_preempted(~owner)) {
smp_yield_cpu(~owner);
first_diag = 0;
continue;
@@ -127,7 +118,7 @@ void arch_spin_lock_wait_flags(arch_spinlock_t *lp, 
unsigned long flags)
 * yield the CPU unconditionally. For LPAR rely on the
 * sense running status.
 */
-   if (!MACHINE_IS_LPAR || cpu_is_preempted(~owner)) {
+   if (!MACHINE_IS_LPAR || arch_vcpu_is_preempted(~owner)) {
smp_yield_cpu(~owner);
first_diag = 0;
}
@@ -165,7 +156,7 @@ void _raw_read_lock_wait(arch_rwlock_t *rw)
owner = 0;
while (1) {
if (count-- <= 0) {
-   if (owner && cpu_is_preempted(~owner))
+   if (owner && arch_vcpu_is_preempted(~owner))
smp_yield_cpu(~owner);
count = spin_retry;
}
@@ -211,7 +202,7 @@ void _raw_write_lock_wait(arch_rwlock_t *rw, unsigned int 
prev)
owner = 0;
while (1) {
if (count-- <= 0) {
-   if (owner && cpu_is_preempted(~owner))
+   if

[PATCH v5 9/9] Documentation: virtual: kvm: Support vcpu preempted check

2016-10-20 Thread Pan Xinhui
Commit ("x86, kvm: support vcpu preempted check") add one field "__u8
preempted" into struct kvm_steal_time. This field tells if one vcpu is
running or not.

It is zero if 1) some old KVM deos not support this filed. 2) the vcpu is
preempted. Other values means the vcpu has been preempted.

Signed-off-by: Pan Xinhui 
---
 Documentation/virtual/kvm/msr.txt | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/Documentation/virtual/kvm/msr.txt 
b/Documentation/virtual/kvm/msr.txt
index 2a71c8f..3376f13 100644
--- a/Documentation/virtual/kvm/msr.txt
+++ b/Documentation/virtual/kvm/msr.txt
@@ -208,7 +208,8 @@ MSR_KVM_STEAL_TIME: 0x4b564d03
__u64 steal;
__u32 version;
__u32 flags;
-   __u32 pad[12];
+   __u8  preempted;
+   __u32 pad[11];
}
 
whose data will be filled in by the hypervisor periodically. Only one
@@ -232,6 +233,11 @@ MSR_KVM_STEAL_TIME: 0x4b564d03
nanoseconds. Time during which the vcpu is idle, will not be
reported as steal time.
 
+   preempted: indicate the VCPU who owns this struct is running or
+   not. Non-zero values mean the VCPU has been preempted. Zero
+   means the VCPU is not preempted. NOTE, it is always zero if the
+   hypervisor doesn't support this field.
+
 MSR_KVM_EOI_EN: 0x4b564d04
data: Bit 0 is 1 when PV end of interrupt is enabled on the vcpu; 0
when disabled.  Bit 1 is reserved and must be zero.  When PV end of
-- 
2.4.11



Re: [PATCH v5 9/9] Documentation: virtual: kvm: Support vcpu preempted check

2016-10-20 Thread Pan Xinhui



On 2016/10/21 09:23, Boqun Feng wrote:

On Thu, Oct 20, 2016 at 05:27:54PM -0400, Pan Xinhui wrote:

Commit ("x86, kvm: support vcpu preempted check") add one field "__u8
preempted" into struct kvm_steal_time. This field tells if one vcpu is
running or not.

It is zero if 1) some old KVM deos not support this filed. 2) the vcpu is
preempted. Other values means the vcpu has been preempted.

  ^
s/preempted/not preempted


yes. the loss of *not* should definitely be avoided.

And better to fix other typos in the commit log ;-)
Maybe you can try aspell? That works for me.


I will try it. :)


Regards,
Boqun



Signed-off-by: Pan Xinhui 
---
 Documentation/virtual/kvm/msr.txt | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/Documentation/virtual/kvm/msr.txt 
b/Documentation/virtual/kvm/msr.txt
index 2a71c8f..3376f13 100644
--- a/Documentation/virtual/kvm/msr.txt
+++ b/Documentation/virtual/kvm/msr.txt
@@ -208,7 +208,8 @@ MSR_KVM_STEAL_TIME: 0x4b564d03
__u64 steal;
__u32 version;
__u32 flags;
-   __u32 pad[12];
+   __u8  preempted;
+   __u32 pad[11];
}

whose data will be filled in by the hypervisor periodically. Only one
@@ -232,6 +233,11 @@ MSR_KVM_STEAL_TIME: 0x4b564d03
nanoseconds. Time during which the vcpu is idle, will not be
reported as steal time.

+   preempted: indicate the VCPU who owns this struct is running or
+   not. Non-zero values mean the VCPU has been preempted. Zero
+   means the VCPU is not preempted. NOTE, it is always zero if the
+   hypervisor doesn't support this field.
+
 MSR_KVM_EOI_EN: 0x4b564d04
data: Bit 0 is 1 when PV end of interrupt is enabled on the vcpu; 0
when disabled.  Bit 1 is reserved and must be zero.  When PV end of
--
2.4.11





Re: [PATCH v4 5/5] x86, kvm: support vcpu preempted check

2016-10-24 Thread Pan Xinhui



On 2016/10/24 23:18, Paolo Bonzini wrote:



On 24/10/2016 17:14, Radim Krčmář wrote:

2016-10-24 16:39+0200, Paolo Bonzini:

On 19/10/2016 19:24, Radim Krčmář wrote:

+   if (vcpu->arch.st.msr_val & KVM_MSR_ENABLED)
+   if (kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
+   &vcpu->arch.st.steal,
+   sizeof(struct kvm_steal_time)) == 0) {
+   vcpu->arch.st.steal.preempted = 1;
+   kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
+   &vcpu->arch.st.steal,
+   sizeof(struct kvm_steal_time));
+   }

Please name this block of code.  Something like
  kvm_steal_time_set_preempted(vcpu);


While at it:

1) the kvm_read_guest_cached is not necessary.  You can rig the call to
kvm_write_guest_cached so that it only writes vcpu->arch.st.steal.preempted.


I agree.  kvm_write_guest_cached() always writes from offset 0, so we'd
want a new function that allows to specify a starting offset.


Yeah, let's leave it for a follow-up then!


I think I can make a version that takes an offset. :)


Thanks,

Paolo


Using cached vcpu->arch.st.steal to avoid the read wouldn't be as good.
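
(Sketch of how the follow-up could then touch only the preempted byte,
assuming a kvm_write_guest_offset_cached() helper like the one introduced
in v6 below; illustrative only:

	vcpu->arch.st.steal.preempted = 1;
	kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.st.stime,
			&vcpu->arch.st.steal.preempted,
			offsetof(struct kvm_steal_time, preempted),
			sizeof(vcpu->arch.st.steal.preempted));
)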







[PATCH v6 00/11] implement vcpu preempted check

2016-10-27 Thread Pan Xinhui
change from v5:
split x86/kvm patch into guest/host parts.
introduce kvm_write_guest_offset_cached.
fix some typos.
rebase patch onto 4.9.2
change from v4:
split x86 kvm vcpu preempted check into two patches.
add documentation patch.
add x86 vcpu preempted check patch under xen
add s390 vcpu preempted check patch 
change from v3:
add x86 vcpu preempted check patch
change from v2:
no code change, fix typos, update some comments
change from v1:
a simpler definition of the default vcpu_is_preempted
skip machine type check on ppc, and add config. remove dedicated macro.
add one patch to drop overload of rwsem_spin_on_owner and 
mutex_spin_on_owner. 
add more comments
thanks to Boqun's and Peter's suggestions.

This patch set aims to fix lock holder preemption issues.

test-case:
perf record -a perf bench sched messaging -g 400 -p && perf report

18.09%  sched-messaging  [kernel.vmlinux]  [k] osq_lock
12.28%  sched-messaging  [kernel.vmlinux]  [k] rwsem_spin_on_owner
 5.27%  sched-messaging  [kernel.vmlinux]  [k] mutex_unlock
 3.89%  sched-messaging  [kernel.vmlinux]  [k] wait_consider_task
 3.64%  sched-messaging  [kernel.vmlinux]  [k] _raw_write_lock_irq
 3.41%  sched-messaging  [kernel.vmlinux]  [k] mutex_spin_on_owner.is
 2.49%  sched-messaging  [kernel.vmlinux]  [k] system_call

We introduce the interface bool vcpu_is_preempted(int cpu) and use it in some spin
loops of osq_lock, rwsem_spin_on_owner and mutex_spin_on_owner.
These spin_on_owner variants also cause RCU stalls before we apply this patch set.

We have also observed some performance improvements in unix benchmark tests.

PPC test result:
1 copy - 0.94%
2 copy - 7.17%
4 copy - 11.9%
8 copy -  3.04%
16 copy - 15.11%

details below:
Without patch:

1 copy - File Write 4096 bufsize 8000 maxblocks  2188223.0 KBps  (30.0 s, 1 
samples)
2 copy - File Write 4096 bufsize 8000 maxblocks  1804433.0 KBps  (30.0 s, 1 
samples)
4 copy - File Write 4096 bufsize 8000 maxblocks  1237257.0 KBps  (30.0 s, 1 
samples)
8 copy - File Write 4096 bufsize 8000 maxblocks  1032658.0 KBps  (30.0 s, 1 
samples)
16 copy - File Write 4096 bufsize 8000 maxblocks   768000.0 KBps  (30.1 s, 
1 samples)

With patch: 

1 copy - File Write 4096 bufsize 8000 maxblocks  2209189.0 KBps  (30.0 s, 1 
samples)
2 copy - File Write 4096 bufsize 8000 maxblocks  1943816.0 KBps  (30.0 s, 1 
samples)
4 copy - File Write 4096 bufsize 8000 maxblocks  1405591.0 KBps  (30.0 s, 1 
samples)
8 copy - File Write 4096 bufsize 8000 maxblocks  1065080.0 KBps  (30.0 s, 1 
samples)
16 copy - File Write 4096 bufsize 8000 maxblocks   904762.0 KBps  (30.0 s, 
1 samples)

X86 test result:
test-case   after-patch   before-patch
Execl Throughput   |18307.9 lps  |11701.6 lps 
File Copy 1024 bufsize 2000 maxblocks  |  1352407.3 KBps |   790418.9 KBps
File Copy 256 bufsize 500 maxblocks|   367555.6 KBps |   222867.7 KBps
File Copy 4096 bufsize 8000 maxblocks  |  3675649.7 KBps |  1780614.4 KBps
Pipe Throughput| 11872208.7 lps  | 11855628.9 lps 
Pipe-based Context Switching   |  1495126.5 lps  |  1490533.9 lps 
Process Creation   |29881.2 lps  |28572.8 lps 
Shell Scripts (1 concurrent)   |23224.3 lpm  |22607.4 lpm 
Shell Scripts (8 concurrent)   | 3531.4 lpm  | 3211.9 lpm 
System Call Overhead   | 10385653.0 lps  | 10419979.0 lps 

Christian Borntraeger (1):
  s390/spinlock: Provide vcpu_is_preempted

Juergen Gross (1):
  x86, xen: support vcpu preempted check

Pan Xinhui (9):
  kernel/sched: introduce vcpu preempted check interface
  locking/osq: Drop the overload of osq_lock()
  kernel/locking: Drop the overload of {mutex,rwsem}_spin_on_owner
  powerpc/spinlock: support vcpu preempted check
  x86, paravirt: Add interface to support kvm/xen vcpu preempted check
  KVM: Introduce kvm_write_guest_offset_cached
  x86, kvm/x86.c: support vcpu preempted check
  x86, kernel/kvm.c: support vcpu preempted check
  Documentation: virtual: kvm: Support vcpu preempted check

 Documentation/virtual/kvm/msr.txt |  9 -
 arch/powerpc/include/asm/spinlock.h   |  8 
 arch/s390/include/asm/spinlock.h  |  8 
 arch/s390/kernel/smp.c|  9 +++--
 arch/s390/lib/spinlock.c  | 25 -
 arch/x86/include/asm/paravirt_types.h |  2 ++
 arch/x86/include/asm/spinlock.h   |  8 
 arch/x86/include/uapi/asm/kvm_para.h  |  4 +++-
 arch/x86/kernel/kvm.c | 12 
 arch/x86/kernel/paravirt-spinlocks.c  |  6 ++
 arch/x86/kvm/x86.c| 16 
 arch/x86/xen/spinlock.c   |  3 ++-
 include/linux/kvm_host.h  |  2 ++
 include/linux/sched.h

[PATCH v6 01/11] kernel/sched: introduce vcpu preempted check interface

2016-10-27 Thread Pan Xinhui
This patch supports fixing the lock holder preemption issue.

For kernel users, we could use bool vcpu_is_preempted(int cpu) to detect if
one vcpu is preempted or not.

The default implementation is a macro defined as false. So the compiler can
optimize it out if the arch does not support such a vcpu preempted check.

Suggested-by: Peter Zijlstra (Intel) 
Signed-off-by: Pan Xinhui 
Acked-by: Christian Borntraeger 
Tested-by: Juergen Gross 
---
 include/linux/sched.h | 12 
 1 file changed, 12 insertions(+)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 348f51b..44c1ce7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -3506,6 +3506,18 @@ static inline void set_task_cpu(struct task_struct *p, 
unsigned int cpu)
 
 #endif /* CONFIG_SMP */
 
+/*
+ * In order to deal with various lock holder preemption issues provide an
+ * interface to see if a vCPU is currently running or not.
+ *
+ * This allows us to terminate optimistic spin loops and block, analogous to
+ * the native optimistic spin heuristic of testing if the lock owner task is
+ * running or not.
+ */
+#ifndef vcpu_is_preempted
+#define vcpu_is_preempted(cpu) false
+#endif
+
 extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
 extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
 
-- 
2.4.11



[PATCH v6 02/11] locking/osq: Drop the overload of osq_lock()

2016-10-27 Thread Pan Xinhui
An over-committed guest with more vCPUs than pCPUs has heavy overhead in
osq_lock().

This is because vCPU A holds the osq lock and yields out, while vCPU B waits
for the per_cpu node->locked to be set. IOW, vCPU B waits for vCPU A to run
and unlock the osq lock.

The kernel has an interface bool vcpu_is_preempted(int cpu) to see if a vCPU
is currently running or not. So break the spin loops on a true condition.

test case:
perf record -a perf bench sched messaging -g 400 -p && perf report

before patch:
18.09%  sched-messaging  [kernel.vmlinux]  [k] osq_lock
12.28%  sched-messaging  [kernel.vmlinux]  [k] rwsem_spin_on_owner
 5.27%  sched-messaging  [kernel.vmlinux]  [k] mutex_unlock
 3.89%  sched-messaging  [kernel.vmlinux]  [k] wait_consider_task
 3.64%  sched-messaging  [kernel.vmlinux]  [k] _raw_write_lock_irq
 3.41%  sched-messaging  [kernel.vmlinux]  [k] mutex_spin_on_owner.is
 2.49%  sched-messaging  [kernel.vmlinux]  [k] system_call

after patch:
20.68%  sched-messaging  [kernel.vmlinux]  [k] mutex_spin_on_owner
 8.45%  sched-messaging  [kernel.vmlinux]  [k] mutex_unlock
 4.12%  sched-messaging  [kernel.vmlinux]  [k] system_call
 3.01%  sched-messaging  [kernel.vmlinux]  [k] system_call_common
 2.83%  sched-messaging  [kernel.vmlinux]  [k] copypage_power7
 2.64%  sched-messaging  [kernel.vmlinux]  [k] rwsem_spin_on_owner
 2.00%  sched-messaging  [kernel.vmlinux]  [k] osq_lock

Suggested-by: Boqun Feng 
Signed-off-by: Pan Xinhui 
Acked-by: Christian Borntraeger 
Tested-by: Juergen Gross 
---
 kernel/locking/osq_lock.c | 10 +-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c
index 05a3785..39d1385 100644
--- a/kernel/locking/osq_lock.c
+++ b/kernel/locking/osq_lock.c
@@ -21,6 +21,11 @@ static inline int encode_cpu(int cpu_nr)
return cpu_nr + 1;
 }
 
+static inline int node_cpu(struct optimistic_spin_node *node)
+{
+   return node->cpu - 1;
+}
+
 static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val)
 {
int cpu_nr = encoded_cpu_val - 1;
@@ -118,8 +123,11 @@ bool osq_lock(struct optimistic_spin_queue *lock)
while (!READ_ONCE(node->locked)) {
/*
 * If we need to reschedule bail... so we can block.
+* Use vcpu_is_preempted to detect lock holder preemption issue
+* and break. vcpu_is_preempted is a macro defined as false if
+* arch does not support vcpu preempted check.
 */
-   if (need_resched())
+   if (need_resched() || vcpu_is_preempted(node_cpu(node->prev)))
goto unqueue;
 
cpu_relax_lowlatency();
-- 
2.4.11



[PATCH v6 03/11] kernel/locking: Drop the overload of {mutex, rwsem}_spin_on_owner

2016-10-27 Thread Pan Xinhui
An over-committed guest with more vCPUs than pCPUs has heavy overhead in
the two spin_on_owner functions. This is due to the lock holder preemption
issue.

The kernel has an interface bool vcpu_is_preempted(int cpu) to see if a vCPU
is currently running or not. So break the spin loops on a true condition.

test-case:
perf record -a perf bench sched messaging -g 400 -p && perf report

before patch:
20.68%  sched-messaging  [kernel.vmlinux]  [k] mutex_spin_on_owner
 8.45%  sched-messaging  [kernel.vmlinux]  [k] mutex_unlock
 4.12%  sched-messaging  [kernel.vmlinux]  [k] system_call
 3.01%  sched-messaging  [kernel.vmlinux]  [k] system_call_common
 2.83%  sched-messaging  [kernel.vmlinux]  [k] copypage_power7
 2.64%  sched-messaging  [kernel.vmlinux]  [k] rwsem_spin_on_owner
 2.00%  sched-messaging  [kernel.vmlinux]  [k] osq_lock

after patch:
 9.99%  sched-messaging  [kernel.vmlinux]  [k] mutex_unlock
 5.28%  sched-messaging  [unknown] [H] 0xc00768e0
 4.27%  sched-messaging  [kernel.vmlinux]  [k] __copy_tofrom_user_power7
 3.77%  sched-messaging  [kernel.vmlinux]  [k] copypage_power7
 3.24%  sched-messaging  [kernel.vmlinux]  [k] _raw_write_lock_irq
 3.02%  sched-messaging  [kernel.vmlinux]  [k] system_call
 2.69%  sched-messaging  [kernel.vmlinux]  [k] wait_consider_task

Signed-off-by: Pan Xinhui 
Acked-by: Christian Borntraeger 
Tested-by: Juergen Gross 
---
 kernel/locking/mutex.c  | 15 +--
 kernel/locking/rwsem-xadd.c | 16 +---
 2 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index a70b90d..82108f5 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -236,7 +236,13 @@ bool mutex_spin_on_owner(struct mutex *lock, struct 
task_struct *owner)
 */
barrier();
 
-   if (!owner->on_cpu || need_resched()) {
+   /*
+* Use vcpu_is_preempted to detect lock holder preemption issue
+* and break. vcpu_is_preempted is a macro defined as false if
+* arch does not support vcpu preempted check.
+*/
+   if (!owner->on_cpu || need_resched() ||
+   vcpu_is_preempted(task_cpu(owner))) {
ret = false;
break;
}
@@ -261,8 +267,13 @@ static inline int mutex_can_spin_on_owner(struct mutex 
*lock)
 
rcu_read_lock();
owner = READ_ONCE(lock->owner);
+
+   /*
+* Because of the lock holder preemption issue, we also skip spinning if the
+* task is not on cpu or its cpu is preempted
+*/
if (owner)
-   retval = owner->on_cpu;
+   retval = owner->on_cpu && !vcpu_is_preempted(task_cpu(owner));
rcu_read_unlock();
/*
 * if lock->owner is not set, the mutex owner may have just acquired
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 2337b4b..0897179 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -336,7 +336,11 @@ static inline bool rwsem_can_spin_on_owner(struct 
rw_semaphore *sem)
goto done;
}
 
-   ret = owner->on_cpu;
+   /*
+* Because of the lock holder preemption issue, we also skip spinning if the
+* task is not on cpu or its cpu is preempted
+*/
+   ret = owner->on_cpu && !vcpu_is_preempted(task_cpu(owner));
 done:
rcu_read_unlock();
return ret;
@@ -362,8 +366,14 @@ static noinline bool rwsem_spin_on_owner(struct 
rw_semaphore *sem)
 */
barrier();
 
-   /* abort spinning when need_resched or owner is not running */
-   if (!owner->on_cpu || need_resched()) {
+   /*
+* abort spinning when need_resched or owner is not running or
+* owner's cpu is preempted. vcpu_is_preempted is a macro
+* defined by false if arch does not support vcpu preempted
+* check
+*/
+   if (!owner->on_cpu || need_resched() ||
+   vcpu_is_preempted(task_cpu(owner))) {
rcu_read_unlock();
return false;
}
-- 
2.4.11



[PATCH v6 04/11] powerpc/spinlock: support vcpu preempted check

2016-10-27 Thread Pan Xinhui
This is to fix some lock holder preemption issues. Some other lock
implementations do a spin loop before acquiring the lock itself.
Currently the kernel has an interface of bool vcpu_is_preempted(int cpu). It
takes the cpu as a parameter and returns true if the cpu is preempted. Then
the kernel can break the spin loops based on the retval of vcpu_is_preempted.

As the kernel already uses this interface, let's support it.

Only pSeries needs to support it. The fact is that powerNV is built into the
same kernel image as pSeries, so we need to return false if we are running
as powerNV. Another fact is that lppaca->yield_count stays zero on powerNV,
and the check below treats an even yield_count (including zero) as not
preempted. So we can just skip the machine type check.

Suggested-by: Boqun Feng 
Suggested-by: Peter Zijlstra (Intel) 
Signed-off-by: Pan Xinhui 
---
 arch/powerpc/include/asm/spinlock.h | 8 
 1 file changed, 8 insertions(+)

diff --git a/arch/powerpc/include/asm/spinlock.h 
b/arch/powerpc/include/asm/spinlock.h
index fa37fe9..8c1b913 100644
--- a/arch/powerpc/include/asm/spinlock.h
+++ b/arch/powerpc/include/asm/spinlock.h
@@ -52,6 +52,14 @@
 #define SYNC_IO
 #endif
 
+#ifdef CONFIG_PPC_PSERIES
+#define vcpu_is_preempted vcpu_is_preempted
+static inline bool vcpu_is_preempted(int cpu)
+{
+   return !!(be32_to_cpu(lppaca_of(cpu).yield_count) & 1);
+}
+#endif
+
 static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock)
 {
return lock.slock == 0;
-- 
2.4.11



[PATCH v6 05/11] s390/spinlock: Provide vcpu_is_preempted

2016-10-27 Thread Pan Xinhui
From: Christian Borntraeger 

this implements the s390 backend for commit
"kernel/sched: introduce vcpu preempted check interface"
by reworking the existing smp_vcpu_scheduled into
arch_vcpu_is_preempted. We can then also get rid of the
local cpu_is_preempted function by moving the
CIF_ENABLED_WAIT test into arch_vcpu_is_preempted.

Signed-off-by: Christian Borntraeger 
Acked-by: Heiko Carstens 
---
 arch/s390/include/asm/spinlock.h |  8 
 arch/s390/kernel/smp.c   |  9 +++--
 arch/s390/lib/spinlock.c | 25 -
 3 files changed, 23 insertions(+), 19 deletions(-)

diff --git a/arch/s390/include/asm/spinlock.h b/arch/s390/include/asm/spinlock.h
index 7e9e09f..7ecd890 100644
--- a/arch/s390/include/asm/spinlock.h
+++ b/arch/s390/include/asm/spinlock.h
@@ -23,6 +23,14 @@ _raw_compare_and_swap(unsigned int *lock, unsigned int old, 
unsigned int new)
return __sync_bool_compare_and_swap(lock, old, new);
 }
 
+#ifndef CONFIG_SMP
+static inline bool arch_vcpu_is_preempted(int cpu) { return false; }
+#else
+bool arch_vcpu_is_preempted(int cpu);
+#endif
+
+#define vcpu_is_preempted arch_vcpu_is_preempted
+
 /*
  * Simple spin lock operations.  There are two variants, one clears IRQ's
  * on the local processor, one does not.
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index 35531fe..b988ed1 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -368,10 +368,15 @@ int smp_find_processor_id(u16 address)
return -1;
 }
 
-int smp_vcpu_scheduled(int cpu)
+bool arch_vcpu_is_preempted(int cpu)
 {
-   return pcpu_running(pcpu_devices + cpu);
+   if (test_cpu_flag_of(CIF_ENABLED_WAIT, cpu))
+   return false;
+   if (pcpu_running(pcpu_devices + cpu))
+   return false;
+   return true;
 }
+EXPORT_SYMBOL(arch_vcpu_is_preempted);
 
 void smp_yield_cpu(int cpu)
 {
diff --git a/arch/s390/lib/spinlock.c b/arch/s390/lib/spinlock.c
index e5f50a7..e48a48e 100644
--- a/arch/s390/lib/spinlock.c
+++ b/arch/s390/lib/spinlock.c
@@ -37,15 +37,6 @@ static inline void _raw_compare_and_delay(unsigned int 
*lock, unsigned int old)
asm(".insn rsy,0xeb22,%0,0,%1" : : "d" (old), "Q" (*lock));
 }
 
-static inline int cpu_is_preempted(int cpu)
-{
-   if (test_cpu_flag_of(CIF_ENABLED_WAIT, cpu))
-   return 0;
-   if (smp_vcpu_scheduled(cpu))
-   return 0;
-   return 1;
-}
-
 void arch_spin_lock_wait(arch_spinlock_t *lp)
 {
unsigned int cpu = SPINLOCK_LOCKVAL;
@@ -62,7 +53,7 @@ void arch_spin_lock_wait(arch_spinlock_t *lp)
continue;
}
/* First iteration: check if the lock owner is running. */
-   if (first_diag && cpu_is_preempted(~owner)) {
+   if (first_diag && arch_vcpu_is_preempted(~owner)) {
smp_yield_cpu(~owner);
first_diag = 0;
continue;
@@ -81,7 +72,7 @@ void arch_spin_lock_wait(arch_spinlock_t *lp)
 * yield the CPU unconditionally. For LPAR rely on the
 * sense running status.
 */
-   if (!MACHINE_IS_LPAR || cpu_is_preempted(~owner)) {
+   if (!MACHINE_IS_LPAR || arch_vcpu_is_preempted(~owner)) {
smp_yield_cpu(~owner);
first_diag = 0;
}
@@ -108,7 +99,7 @@ void arch_spin_lock_wait_flags(arch_spinlock_t *lp, unsigned 
long flags)
continue;
}
/* Check if the lock owner is running. */
-   if (first_diag && cpu_is_preempted(~owner)) {
+   if (first_diag && arch_vcpu_is_preempted(~owner)) {
smp_yield_cpu(~owner);
first_diag = 0;
continue;
@@ -127,7 +118,7 @@ void arch_spin_lock_wait_flags(arch_spinlock_t *lp, 
unsigned long flags)
 * yield the CPU unconditionally. For LPAR rely on the
 * sense running status.
 */
-   if (!MACHINE_IS_LPAR || cpu_is_preempted(~owner)) {
+   if (!MACHINE_IS_LPAR || arch_vcpu_is_preempted(~owner)) {
smp_yield_cpu(~owner);
first_diag = 0;
}
@@ -165,7 +156,7 @@ void _raw_read_lock_wait(arch_rwlock_t *rw)
owner = 0;
while (1) {
if (count-- <= 0) {
-   if (owner && cpu_is_preempted(~owner))
+   if (owner && arch_vcpu_is_preempted(~owner))
smp_yield_cpu(~owner);
count = spin_retry;
}
@@ -211,7 +202,7 @@ void _raw_write_lock_wait(arch_rwlock_t *rw, unsigned int 
prev)
owner = 0;
while (1) {
if (count-- <= 0) {
-   if (owner && cpu_is_preempted(~owner))
+   if

[PATCH v6 06/11] x86, paravirt: Add interface to support kvm/xen vcpu preempted check

2016-10-27 Thread Pan Xinhui
This is to fix some lock holder preemption issues. Some other lock
implementations do a spin loop before acquiring the lock itself.
Currently the kernel has an interface of bool vcpu_is_preempted(int cpu). It
takes the cpu as a parameter and returns true if the cpu is preempted.
Then the kernel can break the spin loops based on the retval of
vcpu_is_preempted.

As the kernel already uses this interface, let's support it.

To deal with kernel and kvm/xen, add vcpu_is_preempted into struct
pv_lock_ops.

Then kvm or xen could provide their own implementation to support
vcpu_is_preempted.

Signed-off-by: Pan Xinhui 
---
 arch/x86/include/asm/paravirt_types.h | 2 ++
 arch/x86/include/asm/spinlock.h   | 8 
 arch/x86/kernel/paravirt-spinlocks.c  | 6 ++
 3 files changed, 16 insertions(+)

diff --git a/arch/x86/include/asm/paravirt_types.h 
b/arch/x86/include/asm/paravirt_types.h
index 0f400c0..38c3bb7 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -310,6 +310,8 @@ struct pv_lock_ops {
 
void (*wait)(u8 *ptr, u8 val);
void (*kick)(int cpu);
+
+   bool (*vcpu_is_preempted)(int cpu);
 };
 
 /* This contains all the paravirt structures: we get a convenient
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index 921bea7..0526f59 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -26,6 +26,14 @@
 extern struct static_key paravirt_ticketlocks_enabled;
 static __always_inline bool static_key_false(struct static_key *key);
 
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+#define vcpu_is_preempted vcpu_is_preempted
+static inline bool vcpu_is_preempted(int cpu)
+{
+   return pv_lock_ops.vcpu_is_preempted(cpu);
+}
+#endif
+
 #include 
 
 /*
diff --git a/arch/x86/kernel/paravirt-spinlocks.c 
b/arch/x86/kernel/paravirt-spinlocks.c
index 2c55a00..2f204dd 100644
--- a/arch/x86/kernel/paravirt-spinlocks.c
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -21,12 +21,18 @@ bool pv_is_native_spin_unlock(void)
__raw_callee_save___native_queued_spin_unlock;
 }
 
+static bool native_vcpu_is_preempted(int cpu)
+{
+   return 0;
+}
+
 struct pv_lock_ops pv_lock_ops = {
 #ifdef CONFIG_SMP
.queued_spin_lock_slowpath = native_queued_spin_lock_slowpath,
.queued_spin_unlock = PV_CALLEE_SAVE(__native_queued_spin_unlock),
.wait = paravirt_nop,
.kick = paravirt_nop,
+   .vcpu_is_preempted = native_vcpu_is_preempted,
 #endif /* SMP */
 };
 EXPORT_SYMBOL(pv_lock_ops);
-- 
2.4.11



[PATCH v6 07/11] KVM: Introduce kvm_write_guest_offset_cached

2016-10-27 Thread Pan Xinhui
It allows us to update part of a cached guest structure, e.g. a single
status field, instead of writing the whole struct.

We can also save one kvm_read_guest_cached() call if we just update one
field of the struct regardless of its current value.
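
As a usage sketch (taken from a later patch in this series), updating
just the 'preempted' byte of the cached steal-time area looks roughly
like:

	/* Write only the 'preempted' field of the cached kvm_steal_time area. */
	kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.st.stime,
			&vcpu->arch.st.steal.preempted,
			offsetof(struct kvm_steal_time, preempted),
			sizeof(vcpu->arch.st.steal.preempted));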

Signed-off-by: Pan Xinhui 
---
 include/linux/kvm_host.h |  2 ++
 virt/kvm/kvm_main.c  | 20 ++--
 2 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 01c0b9c..6f00237 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -645,6 +645,8 @@ int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void 
*data,
unsigned long len);
 int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
   void *data, unsigned long len);
+int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache 
*ghc,
+  void *data, int offset, unsigned long len);
 int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
  gpa_t gpa, unsigned long len);
 int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 2907b7b..95308ee 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1972,30 +1972,38 @@ int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct 
gfn_to_hva_cache *ghc,
 }
 EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init);
 
-int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
-  void *data, unsigned long len)
+int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache 
*ghc,
+  void *data, int offset, unsigned long len)
 {
struct kvm_memslots *slots = kvm_memslots(kvm);
int r;
+   gpa_t gpa = ghc->gpa + offset;
 
-   BUG_ON(len > ghc->len);
+   BUG_ON(len + offset > ghc->len);
 
if (slots->generation != ghc->generation)
kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa, ghc->len);
 
if (unlikely(!ghc->memslot))
-   return kvm_write_guest(kvm, ghc->gpa, data, len);
+   return kvm_write_guest(kvm, gpa, data, len);
 
if (kvm_is_error_hva(ghc->hva))
return -EFAULT;
 
-   r = __copy_to_user((void __user *)ghc->hva, data, len);
+   r = __copy_to_user((void __user *)ghc->hva + offset, data, len);
if (r)
return -EFAULT;
-   mark_page_dirty_in_slot(ghc->memslot, ghc->gpa >> PAGE_SHIFT);
+   mark_page_dirty_in_slot(ghc->memslot, gpa >> PAGE_SHIFT);
 
return 0;
 }
+EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached);
+
+int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
+  void *data, unsigned long len)
+{
+   return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len);
+}
 EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
 
 int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
-- 
2.4.11



[PATCH v6 08/11] x86, kvm/x86.c: support vcpu preempted check

2016-10-27 Thread Pan Xinhui
Support the vcpu_is_preempted() functionality under KVM. This will
enhance lock performance on overcommitted hosts (more runnable vcpus
than physical cpus in the system) as doing busy waits for preempted
vcpus will hurt system performance far worse than early yielding.

Use one field of struct kvm_steal_time, ::preempted, to indicate whether
a vcpu is running or not.
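
For reference (a sketch of the guest-side counterpart added later in
this series, not part of this host-side patch), the guest simply reads
that byte from its per-cpu copy of the steal time area:

	static bool kvm_vcpu_is_preempted(int cpu)
	{
		struct kvm_steal_time *src = &per_cpu(steal_time, cpu);

		return !!src->preempted;
	}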

Signed-off-by: Pan Xinhui 
---
 arch/x86/include/uapi/asm/kvm_para.h |  4 +++-
 arch/x86/kvm/x86.c   | 16 
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/uapi/asm/kvm_para.h 
b/arch/x86/include/uapi/asm/kvm_para.h
index 94dc8ca..1421a65 100644
--- a/arch/x86/include/uapi/asm/kvm_para.h
+++ b/arch/x86/include/uapi/asm/kvm_para.h
@@ -45,7 +45,9 @@ struct kvm_steal_time {
__u64 steal;
__u32 version;
__u32 flags;
-   __u32 pad[12];
+   __u8  preempted;
+   __u8  u8_pad[3];
+   __u32 pad[11];
 };
 
 #define KVM_STEAL_ALIGNMENT_BITS 5
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e375235..f06e115 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2057,6 +2057,8 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
&vcpu->arch.st.steal, sizeof(struct kvm_steal_time
return;
 
+   vcpu->arch.st.steal.preempted = 0;
+
if (vcpu->arch.st.steal.version & 1)
vcpu->arch.st.steal.version += 1;  /* first time write, random 
junk */
 
@@ -2810,8 +2812,22 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
 }
 
+static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
+{
+   if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
+   return;
+
+   vcpu->arch.st.steal.preempted = 1;
+
+   kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.st.stime,
+   &vcpu->arch.st.steal.preempted,
+   offsetof(struct kvm_steal_time, preempted),
+   sizeof(vcpu->arch.st.steal.preempted));
+}
+
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 {
+   kvm_steal_time_set_preempted(vcpu);
kvm_x86_ops->vcpu_put(vcpu);
kvm_put_guest_fpu(vcpu);
vcpu->arch.last_host_tsc = rdtsc();
-- 
2.4.11



[PATCH v6 09/11] x86, kernel/kvm.c: support vcpu preempted check

2016-10-27 Thread Pan Xinhui
Support the vcpu_is_preempted() functionality under KVM. This will
enhance lock performance on overcommitted hosts (more runnable vcpus
than physical cpus in the system) as doing busy waits for preempted
vcpus will hurt system performance far worse than early yielding.

struct kvm_steal_time::preempted indicates whether a vcpu is running or
not, after commit ("x86, kvm/x86.c: support vcpu preempted check").

unix benchmark result:
host:  kernel 4.8.1, i5-4570, 4 cpus
guest: kernel 4.8.1, 8 vcpus

test-case   after-patch   before-patch
Execl Throughput   |18307.9 lps  |11701.6 lps
File Copy 1024 bufsize 2000 maxblocks  |  1352407.3 KBps |   790418.9 KBps
File Copy 256 bufsize 500 maxblocks|   367555.6 KBps |   222867.7 KBps
File Copy 4096 bufsize 8000 maxblocks  |  3675649.7 KBps |  1780614.4 KBps
Pipe Throughput| 11872208.7 lps  | 11855628.9 lps
Pipe-based Context Switching   |  1495126.5 lps  |  1490533.9 lps
Process Creation   |29881.2 lps  |28572.8 lps
Shell Scripts (1 concurrent)   |23224.3 lpm  |22607.4 lpm
Shell Scripts (8 concurrent)   | 3531.4 lpm  | 3211.9 lpm
System Call Overhead   | 10385653.0 lps  | 10419979.0 lps

Signed-off-by: Pan Xinhui 
---
 arch/x86/kernel/kvm.c | 12 
 1 file changed, 12 insertions(+)

diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index edbbfc8..0b48dd2 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -415,6 +415,15 @@ void kvm_disable_steal_time(void)
wrmsr(MSR_KVM_STEAL_TIME, 0, 0);
 }
 
+static bool kvm_vcpu_is_preempted(int cpu)
+{
+   struct kvm_steal_time *src;
+
+   src = &per_cpu(steal_time, cpu);
+
+   return !!src->preempted;
+}
+
 #ifdef CONFIG_SMP
 static void __init kvm_smp_prepare_boot_cpu(void)
 {
@@ -471,6 +480,9 @@ void __init kvm_guest_init(void)
if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
has_steal_clock = 1;
pv_time_ops.steal_clock = kvm_steal_clock;
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+   pv_lock_ops.vcpu_is_preempted = kvm_vcpu_is_preempted;
+#endif
}
 
if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
-- 
2.4.11



[PATCH v6 10/11] x86, xen: support vcpu preempted check

2016-10-27 Thread Pan Xinhui
From: Juergen Gross 

Support the vcpu_is_preempted() functionality under Xen. This will
enhance lock performance on overcommitted hosts (more runnable vcpus
than physical cpus in the system) as doing busy waits for preempted
vcpus will hurt system performance far worse than early yielding.

A quick test (4 vcpus on 1 physical cpu doing a parallel build job
with "make -j 8") reduced system time by about 5% with this patch.

Signed-off-by: Juergen Gross 
Signed-off-by: Pan Xinhui 
---
 arch/x86/xen/spinlock.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index 3d6e006..74756bb 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -114,7 +114,6 @@ void xen_uninit_lock_cpu(int cpu)
per_cpu(irq_name, cpu) = NULL;
 }
 
-
 /*
  * Our init of PV spinlocks is split in two init functions due to us
  * using paravirt patching and jump labels patching and having to do
@@ -137,6 +136,8 @@ void __init xen_init_spinlocks(void)
pv_lock_ops.queued_spin_unlock = 
PV_CALLEE_SAVE(__pv_queued_spin_unlock);
pv_lock_ops.wait = xen_qlock_wait;
pv_lock_ops.kick = xen_qlock_kick;
+
+   pv_lock_ops.vcpu_is_preempted = xen_vcpu_stolen;
 }
 
 /*
-- 
2.4.11



[PATCH v6 11/11] Documentation: virtual: kvm: Support vcpu preempted check

2016-10-27 Thread Pan Xinhui
Commit ("x86, kvm: support vcpu preempted check") adds one field "__u8
preempted" to struct kvm_steal_time. This field tells whether a vcpu is
running or not.

It is zero if 1) an old KVM does not support this field, or 2) the vcpu
is not preempted. Any other value means the vcpu has been preempted.

Signed-off-by: Pan Xinhui 
Acked-by: Radim Krčmář 
---
 Documentation/virtual/kvm/msr.txt | 9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/Documentation/virtual/kvm/msr.txt 
b/Documentation/virtual/kvm/msr.txt
index 2a71c8f..ab2ab76 100644
--- a/Documentation/virtual/kvm/msr.txt
+++ b/Documentation/virtual/kvm/msr.txt
@@ -208,7 +208,9 @@ MSR_KVM_STEAL_TIME: 0x4b564d03
__u64 steal;
__u32 version;
__u32 flags;
-   __u32 pad[12];
+   __u8  preempted;
+   __u8  u8_pad[3];
+   __u32 pad[11];
}
 
whose data will be filled in by the hypervisor periodically. Only one
@@ -232,6 +234,11 @@ MSR_KVM_STEAL_TIME: 0x4b564d03
nanoseconds. Time during which the vcpu is idle, will not be
reported as steal time.
 
+   preempted: indicate the VCPU who owns this struct is running or
+   not. Non-zero values mean the VCPU has been preempted. Zero
+   means the VCPU is not preempted. NOTE, it is always zero if the
+   hypervisor doesn't support this field.
+
 MSR_KVM_EOI_EN: 0x4b564d04
data: Bit 0 is 1 when PV end of interrupt is enabled on the vcpu; 0
when disabled.  Bit 1 is reserved and must be zero.  When PV end of
-- 
2.4.11



Re: [Xen-devel] [PATCH v6 10/11] x86, xen: support vcpu preempted check

2016-10-28 Thread Pan Xinhui



On 2016/10/29 03:43, Konrad Rzeszutek Wilk wrote:

On Fri, Oct 28, 2016 at 04:11:26AM -0400, Pan Xinhui wrote:

From: Juergen Gross 

Support the vcpu_is_preempted() functionality under Xen. This will
enhance lock performance on overcommitted hosts (more runnable vcpus
than physical cpus in the system) as doing busy waits for preempted
vcpus will hurt system performance far worse than early yielding.

A quick test (4 vcpus on 1 physical cpu doing a parallel build job
with "make -j 8") reduced system time by about 5% with this patch.

Signed-off-by: Juergen Gross 
Signed-off-by: Pan Xinhui 
---
 arch/x86/xen/spinlock.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index 3d6e006..74756bb 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -114,7 +114,6 @@ void xen_uninit_lock_cpu(int cpu)
per_cpu(irq_name, cpu) = NULL;
 }

-


Spurious change.

well, it just removes one unnecessary blank line while at it.


 /*
  * Our init of PV spinlocks is split in two init functions due to us
  * using paravirt patching and jump labels patching and having to do
@@ -137,6 +136,8 @@ void __init xen_init_spinlocks(void)
pv_lock_ops.queued_spin_unlock = 
PV_CALLEE_SAVE(__pv_queued_spin_unlock);
pv_lock_ops.wait = xen_qlock_wait;
pv_lock_ops.kick = xen_qlock_kick;
+
+   pv_lock_ops.vcpu_is_preempted = xen_vcpu_stolen;
 }

 /*
--
2.4.11


___
Xen-devel mailing list
xen-de...@lists.xen.org
https://lists.xen.org/xen-devel






Re: [Xen-devel] [PATCH v6 00/11] implement vcpu preempted check

2016-10-28 Thread Pan Xinhui



On 2016/10/29 03:38, Konrad Rzeszutek Wilk wrote:

On Fri, Oct 28, 2016 at 04:11:16AM -0400, Pan Xinhui wrote:

change from v5:
split x86/kvm patch into guest/host part.
introduce kvm_write_guest_offset_cached.
fix some typos.
rebase patch onto 4.9.2
change from v4:
split x86 kvm vcpu preempted check into two patches.
add documentation patch.
add x86 vcpu preempted check patch under xen
add s390 vcpu preempted check patch
change from v3:
add x86 vcpu preempted check patch
change from v2:
no code change, fix typos, update some comments
change from v1:
a simpler definition of default vcpu_is_preempted
skip machine type check on ppc, and add config. remove dedicated macro.
add one patch to drop overload of rwsem_spin_on_owner and 
mutex_spin_on_owner.
add more comments
thanks to Boqun's and Peter's suggestions.

This patch set aims to fix lock holder preemption issues.


Do you have a git tree with these patches?


Currently no, sorry :(

I've made a tar file for this patch set. Maybe a little easier to apply :)

thanks
xinhui



test-case:
perf record -a perf bench sched messaging -g 400 -p && perf report

18.09%  sched-messaging  [kernel.vmlinux]  [k] osq_lock
12.28%  sched-messaging  [kernel.vmlinux]  [k] rwsem_spin_on_owner
 5.27%  sched-messaging  [kernel.vmlinux]  [k] mutex_unlock
 3.89%  sched-messaging  [kernel.vmlinux]  [k] wait_consider_task
 3.64%  sched-messaging  [kernel.vmlinux]  [k] _raw_write_lock_irq
 3.41%  sched-messaging  [kernel.vmlinux]  [k] mutex_spin_on_owner.is
 2.49%  sched-messaging  [kernel.vmlinux]  [k] system_call

We introduce the interface bool vcpu_is_preempted(int cpu) and use it in
the spin loops of osq_lock, rwsem_spin_on_owner and mutex_spin_on_owner.
These spin_on_owner variants also caused RCU stalls before we applied this
patch set.

We have also observed some performance improvements in unix benchmark tests.

PPC test result:
1 copy - 0.94%
2 copy - 7.17%
4 copy - 11.9%
8 copy -  3.04%
16 copy - 15.11%

details below:
Without patch:

1 copy - File Write 4096 bufsize 8000 maxblocks  2188223.0 KBps  (30.0 s, 1 
samples)
2 copy - File Write 4096 bufsize 8000 maxblocks  1804433.0 KBps  (30.0 s, 1 
samples)
4 copy - File Write 4096 bufsize 8000 maxblocks  1237257.0 KBps  (30.0 s, 1 
samples)
8 copy - File Write 4096 bufsize 8000 maxblocks  1032658.0 KBps  (30.0 s, 1 
samples)
16 copy - File Write 4096 bufsize 8000 maxblocks   768000.0 KBps  (30.1 s, 
1 samples)

With patch:

1 copy - File Write 4096 bufsize 8000 maxblocks  2209189.0 KBps  (30.0 s, 1 
samples)
2 copy - File Write 4096 bufsize 8000 maxblocks  1943816.0 KBps  (30.0 s, 1 
samples)
4 copy - File Write 4096 bufsize 8000 maxblocks  1405591.0 KBps  (30.0 s, 1 
samples)
8 copy - File Write 4096 bufsize 8000 maxblocks  1065080.0 KBps  (30.0 s, 1 
samples)
16 copy - File Write 4096 bufsize 8000 maxblocks   904762.0 KBps  (30.0 s, 
1 samples)

X86 test result:
test-case   after-patch   before-patch
Execl Throughput   |18307.9 lps  |11701.6 lps
File Copy 1024 bufsize 2000 maxblocks  |  1352407.3 KBps |   790418.9 KBps
File Copy 256 bufsize 500 maxblocks|   367555.6 KBps |   222867.7 KBps
File Copy 4096 bufsize 8000 maxblocks  |  3675649.7 KBps |  1780614.4 KBps
Pipe Throughput| 11872208.7 lps  | 11855628.9 lps
Pipe-based Context Switching   |  1495126.5 lps  |  1490533.9 lps
Process Creation   |29881.2 lps  |28572.8 lps
Shell Scripts (1 concurrent)   |23224.3 lpm  |22607.4 lpm
Shell Scripts (8 concurrent)   | 3531.4 lpm  | 3211.9 lpm
System Call Overhead   | 10385653.0 lps  | 10419979.0 lps

Christian Borntraeger (1):
  s390/spinlock: Provide vcpu_is_preempted

Juergen Gross (1):
  x86, xen: support vcpu preempted check

Pan Xinhui (9):
  kernel/sched: introduce vcpu preempted check interface
  locking/osq: Drop the overload of osq_lock()
  kernel/locking: Drop the overload of {mutex,rwsem}_spin_on_owner
  powerpc/spinlock: support vcpu preempted check
  x86, paravirt: Add interface to support kvm/xen vcpu preempted check
  KVM: Introduce kvm_write_guest_offset_cached
  x86, kvm/x86.c: support vcpu preempted check
  x86, kernel/kvm.c: support vcpu preempted check
  Documentation: virtual: kvm: Support vcpu preempted check

 Documentation/virtual/kvm/msr.txt |  9 -
 arch/powerpc/include/asm/spinlock.h   |  8 
 arch/s390/include/asm/spinlock.h  |  8 
 arch/s390/kernel/smp.c|  9 +++--
 arch/s390/lib/spinlock.c  | 25 -
 arch/x86/include/asm/paravirt_types.h |  2 ++
 arch/x86/include/asm/spinlock.h   |  8 
 arch/x86/include/uapi/asm/kvm_para.h  |  4 +++-
 arch/x86/kernel/kvm.c | 12 +

Re: [PATCH v6 02/11] locking/osq: Drop the overload of osq_lock()

2016-10-30 Thread Pan Xinhui



On 2016/10/30 00:52, Davidlohr Bueso wrote:

On Fri, 28 Oct 2016, Pan Xinhui wrote:

/*
 * If we need to reschedule bail... so we can block.
+ * Use vcpu_is_preempted to detech lock holder preemption issue

   ^^ detect

ok. thanks for pointing it out.

+ * and break.


Could you please remove the rest of this comment? Its just noise to point out
that vcpu_is_preempted is a macro defined by arch/false. This is standard 
protocol
in the kernel.


fair enough.


Same goes for all locks you change with this.

Thanks,
Davidlohr


   * vcpu_is_preempted is a macro defined by false if
+ * arch does not support vcpu preempted check,
 */
-if (need_resched())
+if (need_resched() || vcpu_is_preempted(node_cpu(node->prev)))
goto unqueue;

cpu_relax_lowlatency();
--
2.4.11







[PATCH v7 00/11] implement vcpu preempted check

2016-11-01 Thread Pan Xinhui
change from v6:
fix typos and remove unnecessary comments.
change from v5:
split x86/kvm patch into guest/host part.
introduce kvm_write_guest_offset_cached.
fix some typos.
rebase patch onto 4.9.2
change from v4:
split x86 kvm vcpu preempted check into two patches.
add documentation patch.
add x86 vcpu preempted check patch under xen
add s390 vcpu preempted check patch 
change from v3:
add x86 vcpu preempted check patch
change from v2:
no code change, fix typos, update some comments
change from v1:
a simpler definition of default vcpu_is_preempted
skip machine type check on ppc, and add config. remove dedicated macro.
add one patch to drop overload of rwsem_spin_on_owner and 
mutex_spin_on_owner. 
add more comments
thanks to Boqun's and Peter's suggestions.

This patch set aims to fix lock holder preemption issues.

test-case:
perf record -a perf bench sched messaging -g 400 -p && perf report

18.09%  sched-messaging  [kernel.vmlinux]  [k] osq_lock
12.28%  sched-messaging  [kernel.vmlinux]  [k] rwsem_spin_on_owner
 5.27%  sched-messaging  [kernel.vmlinux]  [k] mutex_unlock
 3.89%  sched-messaging  [kernel.vmlinux]  [k] wait_consider_task
 3.64%  sched-messaging  [kernel.vmlinux]  [k] _raw_write_lock_irq
 3.41%  sched-messaging  [kernel.vmlinux]  [k] mutex_spin_on_owner.is
 2.49%  sched-messaging  [kernel.vmlinux]  [k] system_call

We introduce the interface bool vcpu_is_preempted(int cpu) and use it in
the spin loops of osq_lock, rwsem_spin_on_owner and mutex_spin_on_owner.
These spin_on_owner variants also caused RCU stalls before we applied this
patch set.

We have also observed some performance improvements in unix benchmark tests.

PPC test result:
1 copy - 0.94%
2 copy - 7.17%
4 copy - 11.9%
8 copy -  3.04%
16 copy - 15.11%

details below:
Without patch:

1 copy - File Write 4096 bufsize 8000 maxblocks  2188223.0 KBps  (30.0 s, 1 
samples)
2 copy - File Write 4096 bufsize 8000 maxblocks  1804433.0 KBps  (30.0 s, 1 
samples)
4 copy - File Write 4096 bufsize 8000 maxblocks  1237257.0 KBps  (30.0 s, 1 
samples)
8 copy - File Write 4096 bufsize 8000 maxblocks  1032658.0 KBps  (30.0 s, 1 
samples)
16 copy - File Write 4096 bufsize 8000 maxblocks   768000.0 KBps  (30.1 s, 
1 samples)

With patch: 

1 copy - File Write 4096 bufsize 8000 maxblocks  2209189.0 KBps  (30.0 s, 1 
samples)
2 copy - File Write 4096 bufsize 8000 maxblocks  1943816.0 KBps  (30.0 s, 1 
samples)
4 copy - File Write 4096 bufsize 8000 maxblocks  1405591.0 KBps  (30.0 s, 1 
samples)
8 copy - File Write 4096 bufsize 8000 maxblocks  1065080.0 KBps  (30.0 s, 1 
samples)
16 copy - File Write 4096 bufsize 8000 maxblocks   904762.0 KBps  (30.0 s, 
1 samples)

X86 test result:
test-case   after-patch   before-patch
Execl Throughput   |18307.9 lps  |11701.6 lps 
File Copy 1024 bufsize 2000 maxblocks  |  1352407.3 KBps |   790418.9 KBps
File Copy 256 bufsize 500 maxblocks|   367555.6 KBps |   222867.7 KBps
File Copy 4096 bufsize 8000 maxblocks  |  3675649.7 KBps |  1780614.4 KBps
Pipe Throughput| 11872208.7 lps  | 11855628.9 lps 
Pipe-based Context Switching   |  1495126.5 lps  |  1490533.9 lps 
Process Creation   |29881.2 lps  |28572.8 lps 
Shell Scripts (1 concurrent)   |23224.3 lpm  |22607.4 lpm 
Shell Scripts (8 concurrent)   | 3531.4 lpm  | 3211.9 lpm 
System Call Overhead   | 10385653.0 lps  | 10419979.0 lps 


Christian Borntraeger (1):
  s390/spinlock: Provide vcpu_is_preempted

Juergen Gross (1):
  x86, xen: support vcpu preempted check

Pan Xinhui (9):
  kernel/sched: introduce vcpu preempted check interface
  locking/osq: Drop the overload of osq_lock()
  kernel/locking: Drop the overload of {mutex,rwsem}_spin_on_owner
  powerpc/spinlock: support vcpu preempted check
  x86, paravirt: Add interface to support kvm/xen vcpu preempted check
  KVM: Introduce kvm_write_guest_offset_cached
  x86, kvm/x86.c: support vcpu preempted check
  x86, kernel/kvm.c: support vcpu preempted check
  Documentation: virtual: kvm: Support vcpu preempted check

 Documentation/virtual/kvm/msr.txt |  9 -
 arch/powerpc/include/asm/spinlock.h   |  8 
 arch/s390/include/asm/spinlock.h  |  8 
 arch/s390/kernel/smp.c|  9 +++--
 arch/s390/lib/spinlock.c  | 25 -
 arch/x86/include/asm/paravirt_types.h |  2 ++
 arch/x86/include/asm/spinlock.h   |  8 
 arch/x86/include/uapi/asm/kvm_para.h  |  4 +++-
 arch/x86/kernel/kvm.c | 12 
 arch/x86/kernel/paravirt-spinlocks.c  |  6 ++
 arch/x86/kvm/x86.c| 16 
 arch/x86/xen/spinlock.c   |  3 ++-
 include/lin

[PATCH v7 01/11] kernel/sched: introduce vcpu preempted check interface

2016-11-01 Thread Pan Xinhui
This patch adds support for fixing the lock holder preemption issue.

For kernel users, bool vcpu_is_preempted(int cpu) can be used to detect
whether a given vcpu is preempted or not.

The default implementation is a macro defined to false, so the compiler
can optimize it away if the arch does not support such a vcpu preempted
check.
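
An architecture opts in by providing its own definition before this
default is seen; as a rough sketch (mirroring the powerpc patch later in
this series, shown here only for illustration):

	/* e.g. in an arch's asm/spinlock.h; only the shape matters here */
	#define vcpu_is_preempted vcpu_is_preempted
	static inline bool vcpu_is_preempted(int cpu)
	{
		/* arch-specific test, e.g. a hypervisor-maintained yield count */
		return !!(be32_to_cpu(lppaca_of(cpu).yield_count) & 1);
	}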

Suggested-by: Peter Zijlstra (Intel) 
Signed-off-by: Pan Xinhui 
Acked-by: Christian Borntraeger 
Acked-by: Paolo Bonzini 
Tested-by: Juergen Gross 
---
 include/linux/sched.h | 12 
 1 file changed, 12 insertions(+)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 348f51b..44c1ce7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -3506,6 +3506,18 @@ static inline void set_task_cpu(struct task_struct *p, 
unsigned int cpu)
 
 #endif /* CONFIG_SMP */
 
+/*
+ * In order to deal with a various lock holder preemption issues provide an
+ * interface to see if a vCPU is currently running or not.
+ *
+ * This allows us to terminate optimistic spin loops and block, analogous to
+ * the native optimistic spin heuristic of testing if the lock owner task is
+ * running or not.
+ */
+#ifndef vcpu_is_preempted
+#define vcpu_is_preempted(cpu) false
+#endif
+
 extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
 extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
 
-- 
2.4.11



[PATCH v7 02/11] locking/osq: Drop the overload of osq_lock()

2016-11-01 Thread Pan Xinhui
An over-committed guest with more vCPUs than pCPUs has a heavy overhead
in osq_lock().

This is because vCPU A holds the osq lock and yields out, while vCPU B
waits for the per-cpu node->locked to be set. IOW, vCPU B waits for
vCPU A to run again and unlock the osq lock.

The kernel has an interface bool vcpu_is_preempted(int cpu) to detect
whether a vCPU is currently running or not, so break out of the spin
loop when it returns true.
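
The new node_cpu() helper simply undoes the +1 bias applied when a CPU
number is stored in the OSQ tail (0 is reserved to mean "no CPU"); it
mirrors the existing encode_cpu()/decode_cpu() convention in osq_lock.c:

	/* CPU numbers are stored off by one so that 0 can mean "unlocked". */
	static inline int node_cpu(struct optimistic_spin_node *node)
	{
		return node->cpu - 1;	/* back to the real CPU number */
	}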

test case:
perf record -a perf bench sched messaging -g 400 -p && perf report

before patch:
18.09%  sched-messaging  [kernel.vmlinux]  [k] osq_lock
12.28%  sched-messaging  [kernel.vmlinux]  [k] rwsem_spin_on_owner
 5.27%  sched-messaging  [kernel.vmlinux]  [k] mutex_unlock
 3.89%  sched-messaging  [kernel.vmlinux]  [k] wait_consider_task
 3.64%  sched-messaging  [kernel.vmlinux]  [k] _raw_write_lock_irq
 3.41%  sched-messaging  [kernel.vmlinux]  [k] mutex_spin_on_owner.is
 2.49%  sched-messaging  [kernel.vmlinux]  [k] system_call

after patch:
20.68%  sched-messaging  [kernel.vmlinux]  [k] mutex_spin_on_owner
 8.45%  sched-messaging  [kernel.vmlinux]  [k] mutex_unlock
 4.12%  sched-messaging  [kernel.vmlinux]  [k] system_call
 3.01%  sched-messaging  [kernel.vmlinux]  [k] system_call_common
 2.83%  sched-messaging  [kernel.vmlinux]  [k] copypage_power7
 2.64%  sched-messaging  [kernel.vmlinux]  [k] rwsem_spin_on_owner
 2.00%  sched-messaging  [kernel.vmlinux]  [k] osq_lock

Suggested-by: Boqun Feng 
Signed-off-by: Pan Xinhui 
Acked-by: Christian Borntraeger 
Acked-by: Paolo Bonzini 
Tested-by: Juergen Gross 
---
 kernel/locking/osq_lock.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c
index 05a3785..091f97f 100644
--- a/kernel/locking/osq_lock.c
+++ b/kernel/locking/osq_lock.c
@@ -21,6 +21,11 @@ static inline int encode_cpu(int cpu_nr)
return cpu_nr + 1;
 }
 
+static inline int node_cpu(struct optimistic_spin_node *node)
+{
+   return node->cpu - 1;
+}
+
 static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val)
 {
int cpu_nr = encoded_cpu_val - 1;
@@ -118,8 +123,9 @@ bool osq_lock(struct optimistic_spin_queue *lock)
while (!READ_ONCE(node->locked)) {
/*
 * If we need to reschedule bail... so we can block.
+* Use vcpu_is_preempted to detect lock holder preemption issue.
 */
-   if (need_resched())
+   if (need_resched() || vcpu_is_preempted(node_cpu(node->prev)))
goto unqueue;
 
cpu_relax_lowlatency();
-- 
2.4.11



[PATCH v7 03/11] kernel/locking: Drop the overload of {mutex, rwsem}_spin_on_owner

2016-11-01 Thread Pan Xinhui
An over-committed guest with more vCPUs than pCPUs has a heavy overhead
in the two spin_on_owner loops. This is due to the lock holder
preemption issue.

The kernel has an interface bool vcpu_is_preempted(int cpu) to see
whether a vCPU is currently running or not, so break out of the spin
loops when it returns true.

test-case:
perf record -a perf bench sched messaging -g 400 -p && perf report

before patch:
20.68%  sched-messaging  [kernel.vmlinux]  [k] mutex_spin_on_owner
 8.45%  sched-messaging  [kernel.vmlinux]  [k] mutex_unlock
 4.12%  sched-messaging  [kernel.vmlinux]  [k] system_call
 3.01%  sched-messaging  [kernel.vmlinux]  [k] system_call_common
 2.83%  sched-messaging  [kernel.vmlinux]  [k] copypage_power7
 2.64%  sched-messaging  [kernel.vmlinux]  [k] rwsem_spin_on_owner
 2.00%  sched-messaging  [kernel.vmlinux]  [k] osq_lock

after patch:
 9.99%  sched-messaging  [kernel.vmlinux]  [k] mutex_unlock
 5.28%  sched-messaging  [unknown] [H] 0xc00768e0
 4.27%  sched-messaging  [kernel.vmlinux]  [k] __copy_tofrom_user_power7
 3.77%  sched-messaging  [kernel.vmlinux]  [k] copypage_power7
 3.24%  sched-messaging  [kernel.vmlinux]  [k] _raw_write_lock_irq
 3.02%  sched-messaging  [kernel.vmlinux]  [k] system_call
 2.69%  sched-messaging  [kernel.vmlinux]  [k] wait_consider_task

Signed-off-by: Pan Xinhui 
Acked-by: Christian Borntraeger 
Acked-by: Paolo Bonzini 
Tested-by: Juergen Gross 
---
 kernel/locking/mutex.c  | 13 +++--
 kernel/locking/rwsem-xadd.c | 14 +++---
 2 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index a70b90d..24face6 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -236,7 +236,11 @@ bool mutex_spin_on_owner(struct mutex *lock, struct 
task_struct *owner)
 */
barrier();
 
-   if (!owner->on_cpu || need_resched()) {
+   /*
+* Use vcpu_is_preempted to detect lock holder preemption issue.
+*/
+   if (!owner->on_cpu || need_resched() ||
+   vcpu_is_preempted(task_cpu(owner))) {
ret = false;
break;
}
@@ -261,8 +265,13 @@ static inline int mutex_can_spin_on_owner(struct mutex 
*lock)
 
rcu_read_lock();
owner = READ_ONCE(lock->owner);
+
+   /*
+* As lock holder preemption issue, we both skip spinning if task is not
+* on cpu or its cpu is preempted
+*/
if (owner)
-   retval = owner->on_cpu;
+   retval = owner->on_cpu && !vcpu_is_preempted(task_cpu(owner));
rcu_read_unlock();
/*
 * if lock->owner is not set, the mutex owner may have just acquired
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 2337b4b..b664ce1 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -336,7 +336,11 @@ static inline bool rwsem_can_spin_on_owner(struct 
rw_semaphore *sem)
goto done;
}
 
-   ret = owner->on_cpu;
+   /*
+* As lock holder preemption issue, we both skip spinning if task is not
+* on cpu or its cpu is preempted
+*/
+   ret = owner->on_cpu && !vcpu_is_preempted(task_cpu(owner));
 done:
rcu_read_unlock();
return ret;
@@ -362,8 +366,12 @@ static noinline bool rwsem_spin_on_owner(struct 
rw_semaphore *sem)
 */
barrier();
 
-   /* abort spinning when need_resched or owner is not running */
-   if (!owner->on_cpu || need_resched()) {
+   /*
+* abort spinning when need_resched or owner is not running or
+* owner's cpu is preempted.
+*/
+   if (!owner->on_cpu || need_resched() ||
+   vcpu_is_preempted(task_cpu(owner))) {
rcu_read_unlock();
return false;
}
-- 
2.4.11



[PATCH v7 04/11] powerpc/spinlock: support vcpu preempted check

2016-11-01 Thread Pan Xinhui
This is to fix some lock holder preemption issues. Some lock
implementations do a spin loop before acquiring the lock itself.
Currently the kernel has an interface of bool vcpu_is_preempted(int cpu).
It takes the cpu as a parameter and returns true if that cpu is
preempted. The kernel can then break out of the spin loop based on the
return value of vcpu_is_preempted.

As the kernel already uses this interface, let's support it here as well.

Only pSeries needs to support it. The fact is that PowerNV is built into
the same kernel image as pSeries, so we would need to return false when
running as PowerNV. Another fact is that lppaca->yield_count stays zero
on PowerNV, so we can simply skip the machine type check.
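
The odd/even convention on yield_count is the same one the existing
powerpc spinlock yield helpers already rely on; roughly (a hedged sketch
of that existing check, shown here only for context):

	/* An even yield_count means the vCPU is currently dispatched. */
	yield_count = be32_to_cpu(lppaca_of(holder_cpu).yield_count);
	if ((yield_count & 1) == 0)
		return;		/* lock holder's vCPU is running, nothing to yield to */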

Suggested-by: Boqun Feng 
Suggested-by: Peter Zijlstra (Intel) 
Signed-off-by: Pan Xinhui 
---
 arch/powerpc/include/asm/spinlock.h | 8 
 1 file changed, 8 insertions(+)

diff --git a/arch/powerpc/include/asm/spinlock.h 
b/arch/powerpc/include/asm/spinlock.h
index fa37fe9..8c1b913 100644
--- a/arch/powerpc/include/asm/spinlock.h
+++ b/arch/powerpc/include/asm/spinlock.h
@@ -52,6 +52,14 @@
 #define SYNC_IO
 #endif
 
+#ifdef CONFIG_PPC_PSERIES
+#define vcpu_is_preempted vcpu_is_preempted
+static inline bool vcpu_is_preempted(int cpu)
+{
+   return !!(be32_to_cpu(lppaca_of(cpu).yield_count) & 1);
+}
+#endif
+
 static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock)
 {
return lock.slock == 0;
-- 
2.4.11



[PATCH v7 05/11] s390/spinlock: Provide vcpu_is_preempted

2016-11-01 Thread Pan Xinhui
From: Christian Borntraeger 

This implements the s390 backend for commit
"kernel/sched: introduce vcpu preempted check interface"
by reworking the existing smp_vcpu_scheduled into
arch_vcpu_is_preempted. We can then also get rid of the
local cpu_is_preempted function by moving the
CIF_ENABLED_WAIT test into arch_vcpu_is_preempted.

Signed-off-by: Christian Borntraeger 
Acked-by: Heiko Carstens 
---
 arch/s390/include/asm/spinlock.h |  8 
 arch/s390/kernel/smp.c   |  9 +++--
 arch/s390/lib/spinlock.c | 25 -
 3 files changed, 23 insertions(+), 19 deletions(-)

diff --git a/arch/s390/include/asm/spinlock.h b/arch/s390/include/asm/spinlock.h
index 7e9e09f..7ecd890 100644
--- a/arch/s390/include/asm/spinlock.h
+++ b/arch/s390/include/asm/spinlock.h
@@ -23,6 +23,14 @@ _raw_compare_and_swap(unsigned int *lock, unsigned int old, 
unsigned int new)
return __sync_bool_compare_and_swap(lock, old, new);
 }
 
+#ifndef CONFIG_SMP
+static inline bool arch_vcpu_is_preempted(int cpu) { return false; }
+#else
+bool arch_vcpu_is_preempted(int cpu);
+#endif
+
+#define vcpu_is_preempted arch_vcpu_is_preempted
+
 /*
  * Simple spin lock operations.  There are two variants, one clears IRQ's
  * on the local processor, one does not.
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index 35531fe..b988ed1 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -368,10 +368,15 @@ int smp_find_processor_id(u16 address)
return -1;
 }
 
-int smp_vcpu_scheduled(int cpu)
+bool arch_vcpu_is_preempted(int cpu)
 {
-   return pcpu_running(pcpu_devices + cpu);
+   if (test_cpu_flag_of(CIF_ENABLED_WAIT, cpu))
+   return false;
+   if (pcpu_running(pcpu_devices + cpu))
+   return false;
+   return true;
 }
+EXPORT_SYMBOL(arch_vcpu_is_preempted);
 
 void smp_yield_cpu(int cpu)
 {
diff --git a/arch/s390/lib/spinlock.c b/arch/s390/lib/spinlock.c
index e5f50a7..e48a48e 100644
--- a/arch/s390/lib/spinlock.c
+++ b/arch/s390/lib/spinlock.c
@@ -37,15 +37,6 @@ static inline void _raw_compare_and_delay(unsigned int 
*lock, unsigned int old)
asm(".insn rsy,0xeb22,%0,0,%1" : : "d" (old), "Q" (*lock));
 }
 
-static inline int cpu_is_preempted(int cpu)
-{
-   if (test_cpu_flag_of(CIF_ENABLED_WAIT, cpu))
-   return 0;
-   if (smp_vcpu_scheduled(cpu))
-   return 0;
-   return 1;
-}
-
 void arch_spin_lock_wait(arch_spinlock_t *lp)
 {
unsigned int cpu = SPINLOCK_LOCKVAL;
@@ -62,7 +53,7 @@ void arch_spin_lock_wait(arch_spinlock_t *lp)
continue;
}
/* First iteration: check if the lock owner is running. */
-   if (first_diag && cpu_is_preempted(~owner)) {
+   if (first_diag && arch_vcpu_is_preempted(~owner)) {
smp_yield_cpu(~owner);
first_diag = 0;
continue;
@@ -81,7 +72,7 @@ void arch_spin_lock_wait(arch_spinlock_t *lp)
 * yield the CPU unconditionally. For LPAR rely on the
 * sense running status.
 */
-   if (!MACHINE_IS_LPAR || cpu_is_preempted(~owner)) {
+   if (!MACHINE_IS_LPAR || arch_vcpu_is_preempted(~owner)) {
smp_yield_cpu(~owner);
first_diag = 0;
}
@@ -108,7 +99,7 @@ void arch_spin_lock_wait_flags(arch_spinlock_t *lp, unsigned 
long flags)
continue;
}
/* Check if the lock owner is running. */
-   if (first_diag && cpu_is_preempted(~owner)) {
+   if (first_diag && arch_vcpu_is_preempted(~owner)) {
smp_yield_cpu(~owner);
first_diag = 0;
continue;
@@ -127,7 +118,7 @@ void arch_spin_lock_wait_flags(arch_spinlock_t *lp, 
unsigned long flags)
 * yield the CPU unconditionally. For LPAR rely on the
 * sense running status.
 */
-   if (!MACHINE_IS_LPAR || cpu_is_preempted(~owner)) {
+   if (!MACHINE_IS_LPAR || arch_vcpu_is_preempted(~owner)) {
smp_yield_cpu(~owner);
first_diag = 0;
}
@@ -165,7 +156,7 @@ void _raw_read_lock_wait(arch_rwlock_t *rw)
owner = 0;
while (1) {
if (count-- <= 0) {
-   if (owner && cpu_is_preempted(~owner))
+   if (owner && arch_vcpu_is_preempted(~owner))
smp_yield_cpu(~owner);
count = spin_retry;
}
@@ -211,7 +202,7 @@ void _raw_write_lock_wait(arch_rwlock_t *rw, unsigned int 
prev)
owner = 0;
while (1) {
if (count-- <= 0) {
-   if (owner && cpu_is_preempted(~owner))
+   if

[PATCH v7 06/11] x86, paravirt: Add interface to support kvm/xen vcpu preempted check

2016-11-01 Thread Pan Xinhui
This is to fix some lock holder preemption issues. Some lock
implementations do a spin loop before acquiring the lock itself.
Currently the kernel has an interface of bool vcpu_is_preempted(int cpu).
It takes the cpu as a parameter and returns true if that cpu is
preempted. The kernel can then break out of the spin loop based on the
return value of vcpu_is_preempted.

As the kernel already uses this interface, let's support it here as well.

To bridge the kernel and kvm/xen, add vcpu_is_preempted into struct
pv_lock_ops.

KVM or Xen can then provide their own implementation of
vcpu_is_preempted.
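
Concretely (sketches based on later patches in this series), KVM and Xen
hook in by overwriting the new member from their init paths:

	/* KVM guest init (see "x86, kernel/kvm.c: support vcpu preempted check"): */
	pv_lock_ops.vcpu_is_preempted = kvm_vcpu_is_preempted;

	/* Xen init (see "x86, xen: support vcpu preempted check"): */
	pv_lock_ops.vcpu_is_preempted = xen_vcpu_stolen;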

Signed-off-by: Pan Xinhui 
Acked-by: Paolo Bonzini 
---
 arch/x86/include/asm/paravirt_types.h | 2 ++
 arch/x86/include/asm/spinlock.h   | 8 
 arch/x86/kernel/paravirt-spinlocks.c  | 6 ++
 3 files changed, 16 insertions(+)

diff --git a/arch/x86/include/asm/paravirt_types.h 
b/arch/x86/include/asm/paravirt_types.h
index 0f400c0..38c3bb7 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -310,6 +310,8 @@ struct pv_lock_ops {
 
void (*wait)(u8 *ptr, u8 val);
void (*kick)(int cpu);
+
+   bool (*vcpu_is_preempted)(int cpu);
 };
 
 /* This contains all the paravirt structures: we get a convenient
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index 921bea7..0526f59 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -26,6 +26,14 @@
 extern struct static_key paravirt_ticketlocks_enabled;
 static __always_inline bool static_key_false(struct static_key *key);
 
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+#define vcpu_is_preempted vcpu_is_preempted
+static inline bool vcpu_is_preempted(int cpu)
+{
+   return pv_lock_ops.vcpu_is_preempted(cpu);
+}
+#endif
+
 #include 
 
 /*
diff --git a/arch/x86/kernel/paravirt-spinlocks.c 
b/arch/x86/kernel/paravirt-spinlocks.c
index 2c55a00..2f204dd 100644
--- a/arch/x86/kernel/paravirt-spinlocks.c
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -21,12 +21,18 @@ bool pv_is_native_spin_unlock(void)
__raw_callee_save___native_queued_spin_unlock;
 }
 
+static bool native_vcpu_is_preempted(int cpu)
+{
+   return 0;
+}
+
 struct pv_lock_ops pv_lock_ops = {
 #ifdef CONFIG_SMP
.queued_spin_lock_slowpath = native_queued_spin_lock_slowpath,
.queued_spin_unlock = PV_CALLEE_SAVE(__native_queued_spin_unlock),
.wait = paravirt_nop,
.kick = paravirt_nop,
+   .vcpu_is_preempted = native_vcpu_is_preempted,
 #endif /* SMP */
 };
 EXPORT_SYMBOL(pv_lock_ops);
-- 
2.4.11


