[git pull] Please pull powerpc.git merge branch

2013-06-20 Thread Benjamin Herrenschmidt
Hi Linus !

Please pull this regression fix into 3.10. We accidentally broke
hugetlbfs on Freescale embedded processors which use a slightly
different page table layout than our server processors.

Cheers,
Ben.

The following changes since commit c0691143dfe1d42ec9bd89de5921ccb6a27ea1b3:

  mn10300: Fix include dependency in irqflags.h et al. (2013-06-19 06:29:54 
-1000)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/benh/powerpc.git merge

for you to fetch changes up to 8bbd9f04b7d982d1c6aeb5c08f5983b3d0b9e2fe:

  powerpc: Fix bad pmd error with book3E config (2013-06-20 15:25:21 +1000)


Aneesh Kumar K.V (1):
  powerpc: Fix bad pmd error with book3E config

 arch/powerpc/mm/hugetlbpage.c |    8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)




Re: [PATCH 3/4] KVM: PPC: Add support for IOMMU in-kernel handling

2013-06-20 Thread Benjamin Herrenschmidt
On Thu, 2013-06-20 at 15:28 +1000, David Gibson wrote:
> > Just out of curiosity - would not get_file() and fput_atomic() on a
> > group's
> > file* do the right job instead of vfio_group_add_external_user() and
> > vfio_group_del_external_user()?
> 
> I was thinking that too.  Grabbing a file reference would certainly be
> the usual way of handling this sort of thing.

But that wouldn't prevent the group ownership from being returned to
the kernel or another user, would it?

Ben.




[v2][PATCH 0/7] powerpc/book3e: support kexec and kdump

2013-06-20 Thread Tiejun Chen
This patchset is used to support kexec and kdump on book3e.

Tested on fsl-p5040 DS.

v2:
* rebased on the merge branch, as Ben mentioned.

v1:
* improved some patch descriptions
* rebased on the next branch with patch 7


Tiejun Chen (7):
  powerpc/book3e: support CONFIG_RELOCATABLE
  book3e/kexec/kdump: enable kexec for kernel
  book3e/kexec/kdump: create a 1:1 TLB mapping
  book3e/kexec/kdump: introduce a kexec kernel flag
  book3e/kexec/kdump: implement ppc64 kexec specfic
  book3e/kexec/kdump: redefine VIRT_PHYS_OFFSET
  book3e/kexec/kdump: recover "r4 = 0" to create the initial TLB

 arch/powerpc/Kconfig |2 +-
 arch/powerpc/include/asm/exception-64e.h |8 
 arch/powerpc/include/asm/page.h  |2 +
 arch/powerpc/include/asm/smp.h   |3 ++
 arch/powerpc/kernel/exceptions-64e.S |   15 ++-
 arch/powerpc/kernel/head_64.S|   47 +++--
 arch/powerpc/kernel/machine_kexec_64.c   |6 +++
 arch/powerpc/kernel/misc_64.S|   67 +-
 arch/powerpc/lib/feature-fixups.c|7 
 arch/powerpc/platforms/85xx/smp.c|   27 
 10 files changed, 178 insertions(+), 6 deletions(-)

Tiejun


[v2][PATCH 1/7] powerpc/book3e: support CONFIG_RELOCATABLE

2013-06-20 Thread Tiejun Chen
book3e differs from book3s here: book3s includes the exception
vector code in head_64.S because it relies on absolute addressing,
which is only possible within that compilation unit, so on book3e we
have to obtain that label address via the GOT.

Also, when booting a relocated kernel, we should reset IVPR properly
again after .relocate.

Signed-off-by: Tiejun Chen 
---
 arch/powerpc/include/asm/exception-64e.h |8 
 arch/powerpc/kernel/exceptions-64e.S |   15 ++-
 arch/powerpc/kernel/head_64.S|   22 ++
 arch/powerpc/lib/feature-fixups.c|7 +++
 4 files changed, 51 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/exception-64e.h 
b/arch/powerpc/include/asm/exception-64e.h
index 51fa43e..89e940d 100644
--- a/arch/powerpc/include/asm/exception-64e.h
+++ b/arch/powerpc/include/asm/exception-64e.h
@@ -214,10 +214,18 @@ exc_##label##_book3e:
 #define TLB_MISS_STATS_SAVE_INFO_BOLTED
 #endif
 
+#ifndef CONFIG_RELOCATABLE
 #define SET_IVOR(vector_number, vector_offset) \
li  r3,vector_offset@l; \
ori r3,r3,interrupt_base_book3e@l;  \
mtspr   SPRN_IVOR##vector_number,r3;
+#else
+#define SET_IVOR(vector_number, vector_offset) \
+   LOAD_REG_ADDR(r3,interrupt_base_book3e);\
+   rlwinm  r3,r3,0,15,0;   \
+   ori r3,r3,vector_offset@l;  \
+   mtspr   SPRN_IVOR##vector_number,r3;
+#endif
 
 #endif /* _ASM_POWERPC_EXCEPTION_64E_H */
 
diff --git a/arch/powerpc/kernel/exceptions-64e.S 
b/arch/powerpc/kernel/exceptions-64e.S
index 645170a..4b23119 100644
--- a/arch/powerpc/kernel/exceptions-64e.S
+++ b/arch/powerpc/kernel/exceptions-64e.S
@@ -1097,7 +1097,15 @@ skpinv:  addir6,r6,1 /* 
Increment */
  * r4 = MAS0 w/TLBSEL & ESEL for the temp mapping
  */
/* Now we branch the new virtual address mapped by this entry */
+#ifdef CONFIG_RELOCATABLE
+   /* We have to find out address from lr. */
+   bl  1f  /* Find our address */
+1: mflr    r6
+   addi    r6,r6,(2f - 1b)
+   tovirt(r6,r6)
+#else
LOAD_REG_IMMEDIATE(r6,2f)
+#endif
lis r7,MSR_KERNEL@h
ori r7,r7,MSR_KERNEL@l
mtspr   SPRN_SRR0,r6
@@ -1348,9 +1356,14 @@ _GLOBAL(book3e_secondary_thread_init)
mflr    r28
b   3b
 
-_STATIC(init_core_book3e)
+_GLOBAL(init_core_book3e)
/* Establish the interrupt vector base */
+#ifdef CONFIG_RELOCATABLE
+   tovirt(r2,r2)
+   LOAD_REG_ADDR(r3, interrupt_base_book3e)
+#else
LOAD_REG_IMMEDIATE(r3, interrupt_base_book3e)
+#endif
mtspr   SPRN_IVPR,r3
sync
blr
diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S
index b61363d..0942f3a 100644
--- a/arch/powerpc/kernel/head_64.S
+++ b/arch/powerpc/kernel/head_64.S
@@ -414,12 +414,22 @@ _STATIC(__after_prom_start)
/* process relocations for the final address of the kernel */
lis r25,PAGE_OFFSET@highest /* compute virtual base of kernel */
sldi    r25,r25,32
+#if defined(CONFIG_PPC_BOOK3E)
+   tovirt(r26,r26) /* on booke, we already run at 
PAGE_OFFSET */
+#endif
lwz r7,__run_at_load-_stext(r26)
+#if defined(CONFIG_PPC_BOOK3E)
+   tophys(r26,r26) /* Restore for the remains. */
+#endif
cmplwi  cr0,r7,1/* flagged to stay where we are ? */
bne 1f
add r25,r25,r26
 1: mr  r3,r25
bl  .relocate
+#if defined(CONFIG_PPC_BOOK3E)
+   /* We should set ivpr again after .relocate. */
+   bl  .init_core_book3e
+#endif
 #endif
 
 /*
@@ -447,12 +457,24 @@ _STATIC(__after_prom_start)
  * variable __run_at_load, if it is set the kernel is treated as relocatable
  * kernel, otherwise it will be moved to PHYSICAL_START
  */
+#if defined(CONFIG_PPC_BOOK3E)
+   tovirt(r26,r26) /* on booke, we already run at 
PAGE_OFFSET */
+#endif
lwz r7,__run_at_load-_stext(r26)
+#if defined(CONFIG_PPC_BOOK3E)
+   tophys(r26,r26) /* Restore for the remains. */
+#endif
cmplwi  cr0,r7,1
bne 3f
 
+#ifdef CONFIG_PPC_BOOK3E
+   LOAD_REG_ADDR(r5, interrupt_end_book3e)
+   LOAD_REG_ADDR(r11, _stext)
+   sub r5,r5,r11
+#else
/* just copy interrupts */
LOAD_REG_IMMEDIATE(r5, __end_interrupts - _stext)
+#endif
b   5f
 3:
 #endif
diff --git a/arch/powerpc/lib/feature-fixups.c 
b/arch/powerpc/lib/feature-fixups.c
index 7a8a748..13f20ed 100644
--- a/arch/powerpc/lib/feature-fixups.c
+++ b/arch/powerpc/lib/feature-fixups.c
@@ -135,13 +135,20 @@ void do_final_fixups(void)
 #if defined(CONFIG_PPC64) && defined(CONFIG_RELOCATABLE)
int *src, *dest;
unsigned long length;
+#ifdef CONFIG_PPC_BOOK3E
+   extern char interrupt_end_book3e[];
+#endif
 
if (PHYSICAL_START == 0)
return;
 
src = (int 

[v2][PATCH 2/7] book3e/kexec/kdump: enable kexec for kernel

2013-06-20 Thread Tiejun Chen
We need to enable KEXEC for book3e and bypass or convert the
non-book3e specific code in the kexec path.

Signed-off-by: Tiejun Chen 
---
 arch/powerpc/Kconfig   |2 +-
 arch/powerpc/kernel/machine_kexec_64.c |6 ++
 arch/powerpc/kernel/misc_64.S  |6 ++
 3 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index c33e3ad..6ecf3c9 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -364,7 +364,7 @@ config ARCH_ENABLE_MEMORY_HOTREMOVE
 
 config KEXEC
bool "kexec system call"
-   depends on (PPC_BOOK3S || FSL_BOOKE || (44x && !SMP))
+   depends on (PPC_BOOK3S || FSL_BOOKE || (44x && !SMP)) || PPC_BOOK3E
help
  kexec is a system call that implements the ability to shutdown your
  current kernel, and to start another kernel.  It is like a reboot
diff --git a/arch/powerpc/kernel/machine_kexec_64.c 
b/arch/powerpc/kernel/machine_kexec_64.c
index 611acdf..ef39271 100644
--- a/arch/powerpc/kernel/machine_kexec_64.c
+++ b/arch/powerpc/kernel/machine_kexec_64.c
@@ -33,6 +33,7 @@
 int default_machine_kexec_prepare(struct kimage *image)
 {
int i;
+#ifndef CONFIG_PPC_BOOK3E
unsigned long begin, end;   /* limits of segment */
unsigned long low, high;/* limits of blocked memory range */
struct device_node *node;
@@ -41,6 +42,7 @@ int default_machine_kexec_prepare(struct kimage *image)
 
if (!ppc_md.hpte_clear_all)
return -ENOENT;
+#endif
 
/*
 * Since we use the kernel fault handlers and paging code to
@@ -51,6 +53,7 @@ int default_machine_kexec_prepare(struct kimage *image)
if (image->segment[i].mem < __pa(_end))
return -ETXTBSY;
 
+#ifndef CONFIG_PPC_BOOK3E
/*
 * For non-LPAR, we absolutely can not overwrite the mmu hash
 * table, since we are still using the bolted entries in it to
@@ -92,6 +95,7 @@ int default_machine_kexec_prepare(struct kimage *image)
return -ETXTBSY;
}
}
+#endif
 
return 0;
 }
@@ -367,6 +371,7 @@ void default_machine_kexec(struct kimage *image)
/* NOTREACHED */
 }
 
+#ifndef CONFIG_PPC_BOOK3E
 /* Values we need to export to the second kernel via the device tree. */
 static unsigned long htab_base;
 
@@ -411,3 +416,4 @@ static int __init export_htab_values(void)
return 0;
 }
 late_initcall(export_htab_values);
+#endif
diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S
index 6820e45..f1a7ce7 100644
--- a/arch/powerpc/kernel/misc_64.S
+++ b/arch/powerpc/kernel/misc_64.S
@@ -543,9 +543,13 @@ _GLOBAL(kexec_sequence)
lhz r25,PACAHWCPUID(r13)/* get our phys cpu from paca */
 
/* disable interrupts, we are overwriting kernel data next */
+#ifndef CONFIG_PPC_BOOK3E
mfmsr   r3
rlwinm  r3,r3,0,17,15
mtmsrd  r3,1
+#else
+   wrteei  0
+#endif
 
/* copy dest pages, flush whole dest image */
mr  r3,r29
@@ -567,10 +571,12 @@ _GLOBAL(kexec_sequence)
li  r6,1
stw r6,kexec_flag-1b(5)
 
+#ifndef CONFIG_PPC_BOOK3E
/* clear out hardware hash page table and tlb */
ld  r5,0(r27)   /* deref function descriptor */
mtctr   r5
bctrl   /* ppc_md.hpte_clear_all(void); */
+#endif
 
 /*
  *   kexec image calling is:
-- 
1.7.9.5



[v2][PATCH 3/7] book3e/kexec/kdump: create a 1:1 TLB mapping

2013-06-20 Thread Tiejun Chen
book3e has no real MMU mode, so we have to create a 1:1 TLB
mapping to make sure we can access the real physical address.
Also correct a few things to support this pseudo real mode on book3e.

Signed-off-by: Tiejun Chen 
---
 arch/powerpc/kernel/head_64.S |9 ---
 arch/powerpc/kernel/misc_64.S |   55 -
 2 files changed, 60 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S
index 0942f3a..3e19ba2 100644
--- a/arch/powerpc/kernel/head_64.S
+++ b/arch/powerpc/kernel/head_64.S
@@ -444,12 +444,12 @@ _STATIC(__after_prom_start)
tovirt(r3,r3)   /* on booke, we already run at 
PAGE_OFFSET */
 #endif
mr. r4,r26  /* In some cases the loader may  */
+#if defined(CONFIG_PPC_BOOK3E)
+   tovirt(r4,r4)
+#endif
beq 9f  /* have already put us at zero */
li  r6,0x100/* Start offset, the first 0x100 */
/* bytes were copied earlier.*/
-#ifdef CONFIG_PPC_BOOK3E
-   tovirt(r6,r6)   /* on booke, we already run at 
PAGE_OFFSET */
-#endif
 
 #ifdef CONFIG_RELOCATABLE
 /*
@@ -492,6 +492,9 @@ _STATIC(__after_prom_start)
 p_end: .llong  _end - _stext
 
 4: /* Now copy the rest of the kernel up to _end */
+#if defined(CONFIG_PPC_BOOK3E)
+   tovirt(r26,r26)
+#endif
addis   r5,r26,(p_end - _stext)@ha
ld  r5,(p_end - _stext)@l(r5)   /* get _end */
 5: bl  .copy_and_flush /* copy the rest */
diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S
index f1a7ce7..20cbb98 100644
--- a/arch/powerpc/kernel/misc_64.S
+++ b/arch/powerpc/kernel/misc_64.S
@@ -460,6 +460,49 @@ kexec_flag:
 
 
 #ifdef CONFIG_KEXEC
+#ifdef CONFIG_PPC_BOOK3E
+/* BOOK3E have no a real MMU mode so we have to setup the initial TLB
+ * for a core to map v:0 to p:0 as 1:1. This current implementation
+ * assume that 1G is enough for kexec.
+ */
+#include 
+kexec_create_tlb:
+   /* Invalidate all TLBs to avoid any TLB conflict. */
+   PPC_TLBILX_ALL(0,R0)
+   sync
+   isync
+
+   mfspr   r10,SPRN_TLB1CFG
+   andi.   r10,r10,TLBnCFG_N_ENTRY /* Extract # entries */
+   subi    r10,r10,1   /* Often its always safe to use last */
+   lis r9,MAS0_TLBSEL(1)@h
+   rlwimi  r9,r10,16,4,15  /* Setup MAS0 = TLBSEL | ESEL(r9) */
+
+/* Setup a temp mapping v:0 to p:0 as 1:1 and return to it.
+ */
+#ifdef CONFIG_SMP
+#define M_IF_SMP   MAS2_M
+#else
+#define M_IF_SMP   0
+#endif
+   mtspr   SPRN_MAS0,r9
+
+   lis r9,(MAS1_VALID|MAS1_IPROT)@h
+   ori r9,r9,(MAS1_TSIZE(BOOK3E_PAGESZ_1GB))@l
+   mtspr   SPRN_MAS1,r9
+
+   LOAD_REG_IMMEDIATE(r9, 0x0 | M_IF_SMP)
+   mtspr   SPRN_MAS2,r9
+
+   LOAD_REG_IMMEDIATE(r9, 0x0 | MAS3_SR | MAS3_SW | MAS3_SX)
+   mtspr   SPRN_MAS3,r9
+   li  r9,0
+   mtspr   SPRN_MAS7,r9
+
+   tlbwe
+   isync
+   blr
+#endif
 
 /* kexec_smp_wait(void)
  *
@@ -473,6 +516,10 @@ kexec_flag:
  */
 _GLOBAL(kexec_smp_wait)
lhz r3,PACAHWCPUID(r13)
+#ifdef CONFIG_PPC_BOOK3E
+   /* Create a 1:1 mapping. */
+   bl  kexec_create_tlb
+#endif
bl  real_mode
 
li  r4,KEXEC_STATE_REAL_MODE
@@ -489,6 +536,7 @@ _GLOBAL(kexec_smp_wait)
  * don't overwrite r3 here, it is live for kexec_wait above.
  */
 real_mode: /* assume normal blr return */
+#ifndef CONFIG_PPC_BOOK3E
 1: li  r9,MSR_RI
li  r10,MSR_DR|MSR_IR
mflr    r11 /* return address to SRR0 */
@@ -500,7 +548,10 @@ real_mode: /* assume normal blr return */
mtspr   SPRN_SRR1,r10
mtspr   SPRN_SRR0,r11
rfid
-
+#else
+   /* the real mode is nothing for book3e. */
+   blr
+#endif
 
 /*
  * kexec_sequence(newstack, start, image, control, clear_all())
@@ -549,6 +600,8 @@ _GLOBAL(kexec_sequence)
mtmsrd  r3,1
 #else
wrteei  0
+   /* Create a 1:1 mapping. */
+   bl  kexec_create_tlb
 #endif
 
/* copy dest pages, flush whole dest image */
-- 
1.7.9.5



[v2][PATCH 4/7] book3e/kexec/kdump: introduce a kexec kernel flag

2013-06-20 Thread Tiejun Chen
We need to introduce a flag to indicate that we're already running
a kexec kernel, so that we can take the proper path. For example, we
shouldn't access the spin_table from the bootloader to bring up any
secondary cpu for the kexec kernel; the kexec kernel already knows how
to jump to generic_secondary_smp_init.

Signed-off-by: Tiejun Chen 
---
 arch/powerpc/include/asm/smp.h|3 +++
 arch/powerpc/kernel/head_64.S |   12 
 arch/powerpc/kernel/misc_64.S |6 ++
 arch/powerpc/platforms/85xx/smp.c |   14 ++
 4 files changed, 35 insertions(+)

diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index ffbaabe..fbc3d9b 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -200,6 +200,9 @@ extern void generic_secondary_thread_init(void);
 extern unsigned long __secondary_hold_spinloop;
 extern unsigned long __secondary_hold_acknowledge;
 extern char __secondary_hold;
+#if defined(CONFIG_KEXEC) || defined(CONFIG_CRASH_DUMP)
+extern unsigned long __run_at_kexec;
+#endif
 
 extern void __early_start(void);
 #endif /* __ASSEMBLY__ */
diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S
index 3e19ba2..ffa4b18 100644
--- a/arch/powerpc/kernel/head_64.S
+++ b/arch/powerpc/kernel/head_64.S
@@ -89,6 +89,12 @@ __secondary_hold_spinloop:
 __secondary_hold_acknowledge:
.llong  0x0
 
+#if defined(CONFIG_KEXEC) || defined(CONFIG_CRASH_DUMP)
+   .globl  __run_at_kexec
+__run_at_kexec:
+   .llong  0x0 /* Flag for the secondary kernel from kexec. */
+#endif
+
 #ifdef CONFIG_RELOCATABLE
/* This flag is set to 1 by a loader if the kernel should run
 * at the loaded address instead of the linked address.  This
@@ -417,6 +423,12 @@ _STATIC(__after_prom_start)
 #if defined(CONFIG_PPC_BOOK3E)
tovirt(r26,r26) /* on booke, we already run at 
PAGE_OFFSET */
 #endif
+#if defined(CONFIG_KEXEC) || defined(CONFIG_CRASH_DUMP)
+   /* If relocated we need to restore this flag on that relocated address. 
*/
+   ld  r7,__run_at_kexec-_stext(r26)
+   std r7,__run_at_kexec-_stext(r26)
+#endif
+
lwz r7,__run_at_load-_stext(r26)
 #if defined(CONFIG_PPC_BOOK3E)
tophys(r26,r26) /* Restore for the remains. */
diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S
index 20cbb98..c89aead 100644
--- a/arch/powerpc/kernel/misc_64.S
+++ b/arch/powerpc/kernel/misc_64.S
@@ -619,6 +619,12 @@ _GLOBAL(kexec_sequence)
bl  .copy_and_flush /* (dest, src, copy limit, start offset) */
 1: /* assume normal blr return */
 
+   /* notify we're going into kexec kernel for SMP. */
+   LOAD_REG_ADDR(r3,__run_at_kexec)
+   li  r4,1
+   std r4,0(r3)
+   sync
+
/* release other cpus to the new kernel secondary start at 0x60 */
mflr    r5
li  r6,1
diff --git a/arch/powerpc/platforms/85xx/smp.c 
b/arch/powerpc/platforms/85xx/smp.c
index 6a17599..b308373 100644
--- a/arch/powerpc/platforms/85xx/smp.c
+++ b/arch/powerpc/platforms/85xx/smp.c
@@ -150,6 +150,9 @@ static int __cpuinit smp_85xx_kick_cpu(int nr)
int hw_cpu = get_hard_smp_processor_id(nr);
int ioremappable;
int ret = 0;
+#if defined(CONFIG_KEXEC) || defined(CONFIG_CRASH_DUMP)
+   unsigned long *ptr;
+#endif
 
WARN_ON(nr < 0 || nr >= NR_CPUS);
WARN_ON(hw_cpu < 0 || hw_cpu >= NR_CPUS);
@@ -238,11 +241,22 @@ out:
 #else
smp_generic_kick_cpu(nr);
 
+#if defined(CONFIG_KEXEC) || defined(CONFIG_CRASH_DUMP)
+   ptr  = (unsigned long *)((unsigned long)&__run_at_kexec);
+   /* We shouldn't access spin_table from the bootloader to up any
+* secondary cpu for kexec kernel, and kexec kernel already
+* know how to jump to generic_secondary_smp_init.
+*/
+   if (!*ptr) {
+#endif
flush_spin_table(spin_table);
out_be32(&spin_table->pir, hw_cpu);
out_be64((u64 *)(&spin_table->addr_h),
  __pa((u64)*((unsigned long long *)generic_secondary_smp_init)));
flush_spin_table(spin_table);
+#if defined(CONFIG_KEXEC) || defined(CONFIG_CRASH_DUMP)
+   }
+#endif
 #endif
 
local_irq_restore(flags);
-- 
1.7.9.5



[v2][PATCH 5/7] book3e/kexec/kdump: implement ppc64 kexec specfic

2013-06-20 Thread Tiejun Chen
The ppc64 kexec mechanism has a different implementation from ppc32.

Signed-off-by: Tiejun Chen 
---
 arch/powerpc/platforms/85xx/smp.c |   13 +
 1 file changed, 13 insertions(+)

diff --git a/arch/powerpc/platforms/85xx/smp.c 
b/arch/powerpc/platforms/85xx/smp.c
index b308373..18a5f8a 100644
--- a/arch/powerpc/platforms/85xx/smp.c
+++ b/arch/powerpc/platforms/85xx/smp.c
@@ -280,6 +280,7 @@ struct smp_ops_t smp_85xx_ops = {
 };
 
 #ifdef CONFIG_KEXEC
+#ifdef CONFIG_PPC32
 atomic_t kexec_down_cpus = ATOMIC_INIT(0);
 
 void mpc85xx_smp_kexec_cpu_down(int crash_shutdown, int secondary)
@@ -298,6 +299,14 @@ static void mpc85xx_smp_kexec_down(void *arg)
if (ppc_md.kexec_cpu_down)
ppc_md.kexec_cpu_down(0,1);
 }
+#else
+void mpc85xx_smp_kexec_cpu_down(int crash_shutdown, int secondary)
+{
+   local_irq_disable();
+   hard_irq_disable();
+   mpic_teardown_this_cpu(secondary);
+}
+#endif
 
 static void map_and_flush(unsigned long paddr)
 {
@@ -349,11 +358,14 @@ static void mpc85xx_smp_flush_dcache_kexec(struct kimage 
*image)
 
 static void mpc85xx_smp_machine_kexec(struct kimage *image)
 {
+#ifdef CONFIG_PPC32
int timeout = INT_MAX;
int i, num_cpus = num_present_cpus();
+#endif
 
mpc85xx_smp_flush_dcache_kexec(image);
 
+#ifdef CONFIG_PPC32
if (image->type == KEXEC_TYPE_DEFAULT)
smp_call_function(mpc85xx_smp_kexec_down, NULL, 0);
 
@@ -371,6 +383,7 @@ static void mpc85xx_smp_machine_kexec(struct kimage *image)
if ( i == smp_processor_id() ) continue;
mpic_reset_core(i);
}
+#endif
 
default_machine_kexec(image);
 }
-- 
1.7.9.5



[v2][PATCH 6/7] book3e/kexec/kdump: redefine VIRT_PHYS_OFFSET

2013-06-20 Thread Tiejun Chen
Book3e always maps memory with 1GB-aligned TLB entries, so we should
use (KERNELBASE - MEMORY_START) as VIRT_PHYS_OFFSET so that __pa()/__va()
work properly while booting the kdump kernel.
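
As a rough illustration (not part of the patch), here is a tiny userspace
model of the __va()/__pa() arithmetic; the constants below are placeholders
chosen for the example, not values from a real kernel config:

#include <stdio.h>
#include <stdint.h>

/* placeholder values, for illustration only */
#define KERNELBASE	0xc000000000000000ULL
#define MEMORY_START	0x0000000020000000ULL	/* e.g. start of the memory a kdump kernel owns */
#define VIRT_PHYS_OFFSET (KERNELBASE - MEMORY_START)

static uint64_t model_va(uint64_t pa) { return pa + VIRT_PHYS_OFFSET; }
static uint64_t model_pa(uint64_t va) { return va - VIRT_PHYS_OFFSET; }

int main(void)
{
	uint64_t pa = 0x20000000ULL;	/* first byte the kdump kernel owns */

	printf("va        = %#llx\n", (unsigned long long)model_va(pa));
	printf("pa(va(pa)) = %#llx\n", (unsigned long long)model_pa(model_va(pa)));
	return 0;
}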

Signed-off-by: Tiejun Chen 
---
 arch/powerpc/include/asm/page.h |2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h
index 988c812..5b00081 100644
--- a/arch/powerpc/include/asm/page.h
+++ b/arch/powerpc/include/asm/page.h
@@ -112,6 +112,8 @@ extern long long virt_phys_offset;
 /* See Description below for VIRT_PHYS_OFFSET */
 #ifdef CONFIG_RELOCATABLE_PPC32
 #define VIRT_PHYS_OFFSET virt_phys_offset
+#elif defined(CONFIG_PPC_BOOK3E_64)
+#define VIRT_PHYS_OFFSET (KERNELBASE - MEMORY_START)
 #else
 #define VIRT_PHYS_OFFSET (KERNELBASE - PHYSICAL_START)
 #endif
-- 
1.7.9.5



[v2][PATCH 7/7] book3e/kexec/kdump: recover "r4 = 0" to create the initial TLB

2013-06-20 Thread Tiejun Chen
Commit 96f013f ("powerpc/kexec: Add kexec "hold" support for Book3e
processors") requires that GPR4 survive the "hold" process, for IBM Blue
Gene/Q with its very strange firmware. But on FSL Book3E, r4 = 1 indicates
that the initial TLB entry for this core already exists, so we should
still set r4 to 0 to create that initial TLB.

Signed-off-by: Tiejun Chen 
---
 arch/powerpc/kernel/head_64.S |4 
 1 file changed, 4 insertions(+)

diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S
index ffa4b18..63ed1c3 100644
--- a/arch/powerpc/kernel/head_64.S
+++ b/arch/powerpc/kernel/head_64.S
@@ -129,6 +129,10 @@ __secondary_hold:
/* Grab our physical cpu number */
mr  r24,r3
/* stash r4 for book3e */
+#ifdef CONFIG_PPC_FSL_BOOK3E
+   /* we need to setup initial TLB entry. */
+   li  r4,0
+#endif
mr  r25,r4
 
/* Tell the master cpu we're here */
-- 
1.7.9.5



Re: [PATCH 3/4] KVM: PPC: Add support for IOMMU in-kernel handling

2013-06-20 Thread Alexey Kardashevskiy
On 06/20/2013 05:47 PM, Benjamin Herrenschmidt wrote:
> On Thu, 2013-06-20 at 15:28 +1000, David Gibson wrote:
>>> Just out of curiosity - would not get_file() and fput_atomic() on a
>>> group's
>>> file* do the right job instead of vfio_group_add_external_user() and
>>> vfio_group_del_external_user()?
>>
>> I was thinking that too.  Grabbing a file reference would certainly be
>> the usual way of handling this sort of thing.
> 
> But that wouldn't prevent the group ownership from being returned to
> the kernel or another user, would it?


Holding the file pointer does not let the group->container_users counter go
to zero and this is exactly what vfio_group_add_external_user() and
vfio_group_del_external_user() do. The difference is only in absolute value
- 2 vs. 3.

There is no change in behaviour whether I use the new vfio API or simply hold
the file* until KVM closes the fd that was created when the IOMMU was
connected to the LIOBN.

And while this counter is not zero, QEMU cannot take ownership over the group.

I am definitely still missing the bigger picture...


-- 
Alexey


[PATCH -V11 03/15] powerpc/THP: Implement transparent hugepages for ppc64

2013-06-20 Thread Aneesh Kumar K.V
From: "Aneesh Kumar K.V" 

We now have pmd entries covering a 16MB range, and the PMD table is double its
original size. We use the second half of the PMD table to deposit the pgtable
(PTE page). The deposited PTE page is further used to track the HPTE
information. The information includes [ secondary group | 3 bit hidx | valid ].
We use one byte per HPTE entry. With a 16MB hugepage and 64K HPTEs we need 256
entries, and with 4K HPTEs we need 4096 entries. Both will fit in a 4K PTE
page. On hugepage invalidate we need to walk the PTE page and invalidate all
valid HPTEs.

This patch implements the necessary arch-specific functions for THP support and
also the hugepage invalidate logic. These PMD-related functions are
intentionally kept similar to their PTE counterparts.
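
To make the sizing concrete, a quick back-of-the-envelope check (a userspace
sketch, not part of the patch):

#include <stdio.h>

int main(void)
{
	unsigned long hugepage = 16UL << 20;	/* 16MB PMD huge page */
	unsigned long hpte_64k = 64UL << 10;	/* 64K actual page size */
	unsigned long hpte_4k  =  4UL << 10;	/* 4K actual page size */

	/* one status byte per underlying HPTE */
	printf("64K HPTEs: %lu bytes\n", hugepage / hpte_64k);	/* 256 */
	printf("4K  HPTEs: %lu bytes\n", hugepage / hpte_4k);	/* 4096 */
	/* both fit in the deposited 4K PTE page */
	return 0;
}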

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/pgtable-ppc64.h | 215 +-
 arch/powerpc/include/asm/pgtable.h   |   4 +
 arch/powerpc/include/asm/tlbflush.h  |   3 +-
 arch/powerpc/mm/pgtable_64.c | 377 +++
 arch/powerpc/mm/tlb_hash64.c |  27 +++
 arch/powerpc/platforms/Kconfig.cputype   |   1 +
 6 files changed, 625 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h 
b/arch/powerpc/include/asm/pgtable-ppc64.h
index ab84332..8f9da5e 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -10,6 +10,7 @@
 #else
 #include 
 #endif
+#include 
 
 #define FIRST_USER_ADDRESS 0
 
@@ -154,7 +155,7 @@
 #definepmd_present(pmd)(pmd_val(pmd) != 0)
 #definepmd_clear(pmdp) (pmd_val(*(pmdp)) = 0)
 #define pmd_page_vaddr(pmd)(pmd_val(pmd) & ~PMD_MASKED_BITS)
-#define pmd_page(pmd)  virt_to_page(pmd_page_vaddr(pmd))
+extern struct page *pmd_page(pmd_t pmd);
 
 #define pud_set(pudp, pudval)  (pud_val(*(pudp)) = (pudval))
 #define pud_none(pud)  (!pud_val(pud))
@@ -382,4 +383,216 @@ static inline pte_t *find_linux_pte_or_hugepte(pgd_t 
*pgdir, unsigned long ea,
 
 #endif /* __ASSEMBLY__ */
 
+/*
+ * THP pages can't be special. So use the _PAGE_SPECIAL
+ */
+#define _PAGE_SPLITTING _PAGE_SPECIAL
+
+/*
+ * We need to differentiate between explicit huge page and THP huge
+ * page, since THP huge page also need to track real subpage details
+ */
+#define _PAGE_THP_HUGE  _PAGE_4K_PFN
+
+/*
+ * set of bits not changed in pmd_modify.
+ */
+#define _HPAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS |  \
+_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_SPLITTING | \
+_PAGE_THP_HUGE)
+
+#ifndef __ASSEMBLY__
+/*
+ * The linux hugepage PMD now include the pmd entries followed by the address
+ * to the stashed pgtable_t. The stashed pgtable_t contains the hpte bits.
+ * [ 1 bit secondary | 3 bit hidx | 1 bit valid | 000]. We use one byte per
+ * each HPTE entry. With 16MB hugepage and 64K HPTE we need 256 entries and
+ * with 4K HPTE we need 4096 entries. Both will fit in a 4K pgtable_t.
+ *
+ * The last three bits are intentionally left to zero. This memory location
+ * are also used as normal page PTE pointers. So if we have any pointers
+ * left around while we collapse a hugepage, we need to make sure
+ * _PAGE_PRESENT and _PAGE_FILE bits of that are zero when we look at them
+ */
+static inline unsigned int hpte_valid(unsigned char *hpte_slot_array, int 
index)
+{
+   return (hpte_slot_array[index] >> 3) & 0x1;
+}
+
+static inline unsigned int hpte_hash_index(unsigned char *hpte_slot_array,
+  int index)
+{
+   return hpte_slot_array[index] >> 4;
+}
+
+static inline void mark_hpte_slot_valid(unsigned char *hpte_slot_array,
+   unsigned int index, unsigned int hidx)
+{
+   hpte_slot_array[index] = hidx << 4 | 0x1 << 3;
+}
+
+static inline char *get_hpte_slot_array(pmd_t *pmdp)
+{
+   /*
+* The hpte hindex is stored in the pgtable whose address is in the
+* second half of the PMD
+*
+* Order this load with the test for pmd_trans_huge in the caller
+*/
+   smp_rmb();
+   return *(char **)(pmdp + PTRS_PER_PMD);
+
+
+}
+
+extern void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
+  pmd_t *pmdp);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+extern pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot);
+extern pmd_t mk_pmd(struct page *page, pgprot_t pgprot);
+extern pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot);
+extern void set_pmd_at(struct mm_struct *mm, unsigned long addr,
+  pmd_t *pmdp, pmd_t pmd);
+extern void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long 
addr,
+pmd_t *pmd);
+
+static inline int pmd_trans_huge(pmd_t pmd)
+{
+   /*
+* leaf pte for huge page, bottom two bits != 00
+*/
+   return (pmd_val(pmd) & 0x3) && (pmd_val(pmd) & _PAGE_TH

[PATCH -V11 01/15] THP support for PPC64

2013-06-20 Thread Aneesh Kumar K.V
Hi,

This is the second patchset needed to support THP on ppc64.

mm/ changes are already merged to Andrew's -mm tree.

 mm/THP: deposit the transpare huge pgtable before set_pmd
 mm/THP: Don't use HPAGE_SHIFT in transparent hugepage code
 mm/THP: withdraw the pgtable after pmdp related operations
 mm/THP: Add pmd args to pgtable deposit and withdraw APIs
 mm/THP: Use the right function when updating access flags

Some numbers:

The latency measurement code from Anton can be found at
http://ozlabs.org/~anton/junkcode/latency2001.c

64K page size (With THP support)
--
[root@llmp24l02 test]# ./latency2001 8G
 8589934592    428.49 cycles    120.50 ns
[root@llmp24l02 test]# ./latency2001 -l 8G
 8589934592    471.16 cycles    132.50 ns
[root@llmp24l02 test]# echo never > /sys/kernel/mm/transparent_hugepage/enabled
[root@llmp24l02 test]# ./latency2001 8G
 8589934592    766.52 cycles    215.56 ns
[root@llmp24l02 test]#

4K page size (No THP support for 4K)

[root@llmp24l02 test]# ./latency2001 8G
 8589934592    814.88 cycles    229.16 ns
[root@llmp24l02 test]# ./latency2001 -l 8G
 8589934592    463.69 cycles    130.40 ns
[root@llmp24l02 test]#

We are close to hugetlbfs in latency and we can achieve this with zero
config/page reservation. Most of the allocations above are fault allocated.

Another test that does 5000 random accesses over a 1GB area goes from
2.65 seconds to 1.07 seconds with this patchset.

split_huge_page impact:
-
To look at the performance impact of large page invalidation, I tried the
below experiment. The test involves accessing a large contiguous region of
memory as below:

for (i = 0; i < size; i += PAGE_SIZE)
	data[i] = i;

We wanted to access the data in sequential order so that we look at the
worst case THP performance. Accessing the data in sequential order implies
we have the page table cached and the TLB miss overhead is as minimal as
possible. We also don't touch the entire page, because that could result in
cache eviction.

After we have touched the full range as above, we then call mprotect() on
each of those pages. An mprotect() results in a hugepage split, which should
allow us to measure the impact of hugepage splitting.

for (i = 0; i < size; i += PAGE_SIZE)
	mprotect(&data[i], PAGE_SIZE, PROT_READ);
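
For reference, a minimal standalone version of the test described above might
look like the sketch below (an approximation of the split-huge-page-mpro test,
not the actual program; the size argument parsing is simplified):

#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	size_t size = (argc > 1) ? strtoull(argv[1], NULL, 0) : (1UL << 30);
	long page = sysconf(_SC_PAGESIZE);
	char *data;
	size_t i;

	data = mmap(NULL, size, PROT_READ | PROT_WRITE,
		    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (data == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* touch one byte per page, sequentially, to fault in (possibly huge) pages */
	for (i = 0; i < size; i += page)
		data[i] = (char)i;

	/* mprotect each page individually; with THP enabled this splits the huge pages */
	for (i = 0; i < size; i += page)
		if (mprotect(&data[i], page, PROT_READ))
			perror("mprotect");

	munmap(data, size);
	return 0;
}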

Split hugepage impact: 
-
THP enabled:  2.851561705 seconds for test completion
THP disabled: 3.599146098 seconds for test completion

We are 20.7% better than the non-THP case even when all the large pages are
split.

Detailed output:

THP enabled:
---
[root@llmp24l02 ~]# cat /proc/vmstat  | grep thp
thp_fault_alloc 0
thp_fault_fallback 0
thp_collapse_alloc 0
thp_collapse_alloc_failed 0
thp_split 0
thp_zero_page_alloc 0
thp_zero_page_alloc_failed 0
[root@llmp24l02 ~]# /root/thp/tools/perf/perf stat -e 
page-faults,dTLB-load-misses ./split-huge-page-mpro 20G 
 
time taken to touch all the data in ns: 2763096913 

 Performance counter stats for './split-huge-page-mpro 20G':

 1,581 page-faults 
 3,159 dTLB-load-misses

   2.851561705 seconds time elapsed

[root@llmp24l02 ~]# 
[root@llmp24l02 ~]# cat /proc/vmstat  | grep thp
thp_fault_alloc 1279
thp_fault_fallback 0
thp_collapse_alloc 0
thp_collapse_alloc_failed 0
thp_split 1279
thp_zero_page_alloc 0
thp_zero_page_alloc_failed 0
[root@llmp24l02 ~]# 

77.05%  split-huge-page  [kernel.kallsyms] [k] .clear_user_page 
   
 7.10%  split-huge-page  [kernel.kallsyms] [k] .perf_event_mmap_ctx 
   
 1.51%  split-huge-page  split-huge-page-mpro  [.] 0x0a70   
   
 0.96%  split-huge-page  [unknown] [H] 0x0157e3bc   
   
 0.81%  split-huge-page  [kernel.kallsyms] [k] .up_write
   
 0.76%  split-huge-page  [kernel.kallsyms] [k] .perf_event_mmap 
   
 0.76%  split-huge-page  [kernel.kallsyms] [k] .down_write  
   
 0.74%  split-huge-page  [kernel.kallsyms] [k] .lru_add_page_tail   
   
 0.61%  split-huge-page  [kernel.kallsyms] [k] .split_huge_page 
   
 0.59%  split-huge-page  [kernel.kallsyms] [k] .change_protection   
   
 0.51%  split-huge-page  [kernel.kallsyms] [k] .release_pages   
   

 0.96%  split-huge-page  [unknown] [H] 0x0157e3bc   
   
|  
|--79.44%-- reloc_start
|  |  
|  |--86.54%-- .__pSeries_lpar_hugepage_invalidate
|  |  .pSeries_lpar_hugepage_inva

[PATCH -V11 01/15] powerpc/mm: handle hugepage size correctly when invalidating hpte entries

2013-06-20 Thread Aneesh Kumar K.V
From: "Aneesh Kumar K.V" 

If a hash bucket gets full, we "evict" a more or less random entry from it.
When we do that we don't invalidate the TLB (hpte_remove) because we assume
the old translation is still technically "valid". This implies that when
we are invalidating or updating a pte, we should do a TLB invalidate even if
the HPTE entry is not valid. With hugepages, we need to pass the correct
actual page size value for the TLB invalidation.

This change updates commit 0608d692463598c1d6e826d9dd7283381b4f246c
("powerpc/mm: Always invalidate tlb on hpte invalidate and update") to handle
transparent hugepages correctly.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/machdep.h  |   8 +--
 arch/powerpc/kvm/book3s_64_mmu_host.c   |   2 +-
 arch/powerpc/mm/hash_low_64.S   |  21 +++---
 arch/powerpc/mm/hash_native_64.c| 122 
 arch/powerpc/mm/hash_utils_64.c |   9 ++-
 arch/powerpc/mm/hugetlbpage-hash64.c|   2 +-
 arch/powerpc/platforms/cell/beat_htab.c |  16 +++--
 arch/powerpc/platforms/ps3/htab.c   |   5 +-
 arch/powerpc/platforms/pseries/lpar.c   |  17 +++--
 9 files changed, 95 insertions(+), 107 deletions(-)

diff --git a/arch/powerpc/include/asm/machdep.h 
b/arch/powerpc/include/asm/machdep.h
index 92386fc..801e3c6 100644
--- a/arch/powerpc/include/asm/machdep.h
+++ b/arch/powerpc/include/asm/machdep.h
@@ -36,13 +36,13 @@ struct machdep_calls {
 #ifdef CONFIG_PPC64
void(*hpte_invalidate)(unsigned long slot,
   unsigned long vpn,
-  int psize, int ssize,
-  int local);
+  int bpsize, int apsize,
+  int ssize, int local);
long(*hpte_updatepp)(unsigned long slot, 
 unsigned long newpp, 
 unsigned long vpn,
-int psize, int ssize,
-int local);
+int bpsize, int apsize,
+int ssize, int local);
void(*hpte_updateboltedpp)(unsigned long newpp, 
   unsigned long ea,
   int psize, int ssize);
diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c 
b/arch/powerpc/kvm/book3s_64_mmu_host.c
index 3a9a1ac..176d3fd 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_host.c
@@ -34,7 +34,7 @@
 void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
 {
ppc_md.hpte_invalidate(pte->slot, pte->host_vpn,
-  MMU_PAGE_4K, MMU_SEGSIZE_256M,
+  MMU_PAGE_4K, MMU_PAGE_4K, MMU_SEGSIZE_256M,
   false);
 }
 
diff --git a/arch/powerpc/mm/hash_low_64.S b/arch/powerpc/mm/hash_low_64.S
index 0e980ac..d3cbda6 100644
--- a/arch/powerpc/mm/hash_low_64.S
+++ b/arch/powerpc/mm/hash_low_64.S
@@ -289,9 +289,10 @@ htab_modify_pte:
 
/* Call ppc_md.hpte_updatepp */
mr  r5,r29  /* vpn */
-   li  r6,MMU_PAGE_4K  /* page size */
-   ld  r7,STK_PARAM(R9)(r1)/* segment size */
-   ld  r8,STK_PARAM(R8)(r1)/* get "local" param */
+   li  r6,MMU_PAGE_4K  /* base page size */
+   li  r7,MMU_PAGE_4K  /* actual page size */
+   ld  r8,STK_PARAM(R9)(r1)/* segment size */
+   ld  r9,STK_PARAM(R8)(r1)/* get "local" param */
 _GLOBAL(htab_call_hpte_updatepp)
bl  .   /* Patched by htab_finish_init() */
 
@@ -649,9 +650,10 @@ htab_modify_pte:
 
/* Call ppc_md.hpte_updatepp */
mr  r5,r29  /* vpn */
-   li  r6,MMU_PAGE_4K  /* page size */
-   ld  r7,STK_PARAM(R9)(r1)/* segment size */
-   ld  r8,STK_PARAM(R8)(r1)/* get "local" param */
+   li  r6,MMU_PAGE_4K  /* base page size */
+   li  r7,MMU_PAGE_4K  /* actual page size */
+   ld  r8,STK_PARAM(R9)(r1)/* segment size */
+   ld  r9,STK_PARAM(R8)(r1)/* get "local" param */
 _GLOBAL(htab_call_hpte_updatepp)
bl  .   /* patched by htab_finish_init() */
 
@@ -937,9 +939,10 @@ ht64_modify_pte:
 
/* Call ppc_md.hpte_updatepp */
mr  r5,r29  /* vpn */
-   li  r6,MMU_PAGE_64K
-   ld  r7,STK_PARAM(R9)(r1)/* segment size */
-   ld  r8,STK_PARAM(R8)(r1)/* get "local" param */
+   li  r6,MMU_PAGE_64K /* base page size */
+   li  r7,MMU_PAGE_64K /* actual page size */
+   ld  r8,STK_PARAM(R9)(r1)/* segment si

[PATCH -V11 02/15] powerpc/THP: Double the PMD table size for THP

2013-06-20 Thread Aneesh Kumar K.V
From: "Aneesh Kumar K.V" 

The THP code allocates a PTE page along with each large page request and
deposits it for later use. This is to ensure that we won't have any failures
when we split hugepages into regular pages.

On powerpc we want to use the deposited PTE page for storing the hash PTE slot
and secondary bit information for the HPTEs. We use the second half
of the pmd table to save the deposited PTE page.
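
As a userspace model of that layout (illustrative only; the names and the
PTRS_PER_PMD value below are made up, this is not the kernel code):

#include <stdio.h>
#include <stdlib.h>

#define PTRS_PER_PMD 2048		/* illustrative; the real value depends on config */

typedef unsigned long pmd_entry;	/* stand-in for pmd_t */

int main(void)
{
	/* Twice the PMD table size: the first half holds the pmd entries,
	 * the second half stashes the deposited PTE page pointer. */
	pmd_entry *pmd_table = calloc(2 * PTRS_PER_PMD, sizeof(pmd_entry));
	void *deposited_pte_page = malloc(4096);

	/* deposit: stash the PTE page pointer in the second half */
	pmd_table[PTRS_PER_PMD] = (pmd_entry)deposited_pte_page;

	/* withdraw: read it back later, e.g. when splitting the huge page */
	void *withdrawn = (void *)pmd_table[PTRS_PER_PMD];
	printf("deposited %p, withdrawn %p\n", deposited_pte_page, withdrawn);

	free(deposited_pte_page);
	free(pmd_table);
	return 0;
}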

Reviewed-by: David Gibson 
Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/pgalloc-64.h| 6 +++---
 arch/powerpc/include/asm/pgtable-ppc64-64k.h | 3 ++-
 arch/powerpc/include/asm/pgtable-ppc64.h | 6 +-
 arch/powerpc/mm/init_64.c| 9 ++---
 4 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/include/asm/pgalloc-64.h 
b/arch/powerpc/include/asm/pgalloc-64.h
index b66ae72..f65e27b 100644
--- a/arch/powerpc/include/asm/pgalloc-64.h
+++ b/arch/powerpc/include/asm/pgalloc-64.h
@@ -221,17 +221,17 @@ static inline void __pte_free_tlb(struct mmu_gather *tlb, 
pgtable_t table,
 
 static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
 {
-   return kmem_cache_alloc(PGT_CACHE(PMD_INDEX_SIZE),
+   return kmem_cache_alloc(PGT_CACHE(PMD_CACHE_INDEX),
GFP_KERNEL|__GFP_REPEAT);
 }
 
 static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
 {
-   kmem_cache_free(PGT_CACHE(PMD_INDEX_SIZE), pmd);
+   kmem_cache_free(PGT_CACHE(PMD_CACHE_INDEX), pmd);
 }
 
 #define __pmd_free_tlb(tlb, pmd, addr)   \
-   pgtable_free_tlb(tlb, pmd, PMD_INDEX_SIZE)
+   pgtable_free_tlb(tlb, pmd, PMD_CACHE_INDEX)
 #ifndef CONFIG_PPC_64K_PAGES
 #define __pud_free_tlb(tlb, pud, addr)   \
pgtable_free_tlb(tlb, pud, PUD_INDEX_SIZE)
diff --git a/arch/powerpc/include/asm/pgtable-ppc64-64k.h 
b/arch/powerpc/include/asm/pgtable-ppc64-64k.h
index 45142d6..a56b82f 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64-64k.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64-64k.h
@@ -33,7 +33,8 @@
 #define PGDIR_MASK (~(PGDIR_SIZE-1))
 
 /* Bits to mask out from a PMD to get to the PTE page */
-#define PMD_MASKED_BITS0x1ff
+/* PMDs point to PTE table fragments which are 4K aligned.  */
+#define PMD_MASKED_BITS0xfff
 /* Bits to mask out from a PGD/PUD to get to the PMD page */
 #define PUD_MASKED_BITS0x1ff
 
diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h 
b/arch/powerpc/include/asm/pgtable-ppc64.h
index e3d55f6f..ab84332 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -20,7 +20,11 @@
PUD_INDEX_SIZE + PGD_INDEX_SIZE + PAGE_SHIFT)
 #define PGTABLE_RANGE (ASM_CONST(1) << PGTABLE_EADDR_SIZE)
 
-
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#define PMD_CACHE_INDEX(PMD_INDEX_SIZE + 1)
+#else
+#define PMD_CACHE_INDEXPMD_INDEX_SIZE
+#endif
 /*
  * Define the address range of the kernel non-linear virtual area
  */
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index a90b9c4..d0cd9e4 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -88,7 +88,11 @@ static void pgd_ctor(void *addr)
 
 static void pmd_ctor(void *addr)
 {
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+   memset(addr, 0, PMD_TABLE_SIZE * 2);
+#else
memset(addr, 0, PMD_TABLE_SIZE);
+#endif
 }
 
 struct kmem_cache *pgtable_cache[MAX_PGTABLE_INDEX_SIZE];
@@ -137,10 +141,9 @@ void pgtable_cache_add(unsigned shift, void (*ctor)(void 
*))
 void pgtable_cache_init(void)
 {
pgtable_cache_add(PGD_INDEX_SIZE, pgd_ctor);
-   pgtable_cache_add(PMD_INDEX_SIZE, pmd_ctor);
-   if (!PGT_CACHE(PGD_INDEX_SIZE) || !PGT_CACHE(PMD_INDEX_SIZE))
+   pgtable_cache_add(PMD_CACHE_INDEX, pmd_ctor);
+   if (!PGT_CACHE(PGD_INDEX_SIZE) || !PGT_CACHE(PMD_CACHE_INDEX))
panic("Couldn't allocate pgtable caches");
-
/* In all current configs, when the PUD index exists it's the
 * same size as either the pgd or pmd index.  Verify that the
 * initialization above has also created a PUD cache.  This
-- 
1.8.1.2



[PATCH -V11 06/15] powerpc: Replace find_linux_pte with find_linux_pte_or_hugepte

2013-06-20 Thread Aneesh Kumar K.V
From: "Aneesh Kumar K.V" 

Replace find_linux_pte with find_linux_pte_or_hugepte and explicitly
document why we don't need to handle transparent hugepages at callsites.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/pgtable-ppc64.h | 24 
 arch/powerpc/kernel/io-workarounds.c | 11 +--
 arch/powerpc/kvm/book3s_hv_rm_mmu.c  |  2 +-
 arch/powerpc/mm/hash_utils_64.c  |  8 +++-
 arch/powerpc/mm/hugetlbpage.c|  8 ++--
 arch/powerpc/mm/tlb_hash64.c |  9 +++--
 arch/powerpc/platforms/pseries/eeh.c |  7 ++-
 7 files changed, 36 insertions(+), 33 deletions(-)

diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h 
b/arch/powerpc/include/asm/pgtable-ppc64.h
index 6c9323f..e71bd25 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -344,30 +344,6 @@ static inline void __ptep_set_access_flags(pte_t *ptep, 
pte_t entry)
 
 void pgtable_cache_add(unsigned shift, void (*ctor)(void *));
 void pgtable_cache_init(void);
-
-/*
- * find_linux_pte returns the address of a linux pte for a given
- * effective address and directory.  If not found, it returns zero.
- */
-static inline pte_t *find_linux_pte(pgd_t *pgdir, unsigned long ea)
-{
-   pgd_t *pg;
-   pud_t *pu;
-   pmd_t *pm;
-   pte_t *pt = NULL;
-
-   pg = pgdir + pgd_index(ea);
-   if (!pgd_none(*pg)) {
-   pu = pud_offset(pg, ea);
-   if (!pud_none(*pu)) {
-   pm = pmd_offset(pu, ea);
-   if (pmd_present(*pm))
-   pt = pte_offset_kernel(pm, ea);
-   }
-   }
-   return pt;
-}
-
 #endif /* __ASSEMBLY__ */
 
 /*
diff --git a/arch/powerpc/kernel/io-workarounds.c 
b/arch/powerpc/kernel/io-workarounds.c
index 50e90b7..fa0b54b 100644
--- a/arch/powerpc/kernel/io-workarounds.c
+++ b/arch/powerpc/kernel/io-workarounds.c
@@ -55,6 +55,7 @@ static struct iowa_bus *iowa_pci_find(unsigned long vaddr, 
unsigned long paddr)
 
 struct iowa_bus *iowa_mem_find_bus(const PCI_IO_ADDR addr)
 {
+   unsigned hugepage_shift;
struct iowa_bus *bus;
int token;
 
@@ -70,11 +71,17 @@ struct iowa_bus *iowa_mem_find_bus(const PCI_IO_ADDR addr)
if (vaddr < PHB_IO_BASE || vaddr >= PHB_IO_END)
return NULL;
 
-   ptep = find_linux_pte(init_mm.pgd, vaddr);
+   ptep = find_linux_pte_or_hugepte(init_mm.pgd, vaddr,
+&hugepage_shift);
if (ptep == NULL)
paddr = 0;
-   else
+   else {
+   /*
+* we don't have hugepages backing iomem
+*/
+   WARN_ON(hugepage_shift);
paddr = pte_pfn(*ptep) << PAGE_SHIFT;
+   }
bus = iowa_pci_find(vaddr, paddr);
 
if (bus == NULL)
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c 
b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 6dcbb49..dcf892d 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -27,7 +27,7 @@ static void *real_vmalloc_addr(void *x)
unsigned long addr = (unsigned long) x;
pte_t *p;
 
-   p = find_linux_pte(swapper_pg_dir, addr);
+   p = find_linux_pte_or_hugepte(swapper_pg_dir, addr, NULL);
if (!p || !pte_present(*p))
return NULL;
/* assume we don't have huge pages in vmalloc space... */
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 2f47080..e8434ca 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -1145,6 +1145,7 @@ EXPORT_SYMBOL_GPL(hash_page);
 void hash_preload(struct mm_struct *mm, unsigned long ea,
  unsigned long access, unsigned long trap)
 {
+   int hugepage_shift;
unsigned long vsid;
pgd_t *pgdir;
pte_t *ptep;
@@ -1166,10 +1167,15 @@ void hash_preload(struct mm_struct *mm, unsigned long 
ea,
pgdir = mm->pgd;
if (pgdir == NULL)
return;
-   ptep = find_linux_pte(pgdir, ea);
+   /*
+* THP pages use update_mmu_cache_pmd. We don't do
+* hash preload there. Hence can ignore THP here
+*/
+   ptep = find_linux_pte_or_hugepte(pgdir, ea, &hugepage_shift);
if (!ptep)
return;
 
+   WARN_ON(hugepage_shift);
 #ifdef CONFIG_PPC_64K_PAGES
/* If either _PAGE_4K_PFN or _PAGE_NO_CACHE is set (and we are on
 * a 64K kernel), then we don't preload, hash_page() will take
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 4928204..8add580 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -105,6 +105,7 @@ int pgd_huge(pgd_t pgd)
 
 pte_t *huge_pte_offset(struct mm_

[PATCH -V11 05/15] powerpc: Update find_linux_pte_or_hugepte to handle transparent hugepages

2013-06-20 Thread Aneesh Kumar K.V
From: "Aneesh Kumar K.V" 

Reviewed-by: David Gibson 
Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/mm/hugetlbpage.c | 32 ++--
 1 file changed, 26 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 2865077..4928204 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -936,30 +936,50 @@ pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned 
long ea, unsigned *shift
 
pg = pgdir + pgd_index(ea);
 
-   if (pgd_huge(*pg)) {
+   /*
+* we should first check for none. That takes care of a
+* a parallel hugetlb or THP pagefault moving none entries
+* to respective types.
+*/
+   if (pgd_none(*pg))
+   return NULL;
+   else if (pgd_huge(*pg)) {
ret_pte = (pte_t *) pg;
goto out;
} else if (is_hugepd(pg))
hpdp = (hugepd_t *)pg;
-   else if (!pgd_none(*pg)) {
+   else {
pdshift = PUD_SHIFT;
pu = pud_offset(pg, ea);
 
-   if (pud_huge(*pu)) {
+   if (pud_none(*pu))
+   return NULL;
+   else if (pud_huge(*pu)) {
ret_pte = (pte_t *) pu;
goto out;
} else if (is_hugepd(pu))
hpdp = (hugepd_t *)pu;
-   else if (!pud_none(*pu)) {
+   else {
pdshift = PMD_SHIFT;
pm = pmd_offset(pu, ea);
+   /*
+* A hugepage collapse is captured by pmd_none, because
+* it mark the pmd none and do a hpte invalidate.
+*
+* A hugepage split is captured by pmd_trans_splitting
+* because we mark the pmd trans splitting and do a
+* hpte invalidate
+*
+*/
+   if (pmd_none(*pm) || pmd_trans_splitting(*pm))
+   return NULL;
 
-   if (pmd_huge(*pm)) {
+   if (pmd_huge(*pm) || pmd_large(*pm)) {
ret_pte = (pte_t *) pm;
goto out;
} else if (is_hugepd(pm))
hpdp = (hugepd_t *)pm;
-   else if (!pmd_none(*pm))
+   else
return pte_offset_kernel(pm, ea);
}
}
-- 
1.8.1.2



[PATCH -V11 04/15] powerpc: move find_linux_pte_or_hugepte and gup_hugepte to common code

2013-06-20 Thread Aneesh Kumar K.V
From: "Aneesh Kumar K.V" 

We will use this in a later patch for handling THP pages.

Reviewed-by: David Gibson 
Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/hugetlb.h   |   8 +-
 arch/powerpc/include/asm/pgtable-ppc64.h |  13 --
 arch/powerpc/include/asm/pgtable.h   |   2 +
 arch/powerpc/mm/Makefile |   2 +-
 arch/powerpc/mm/hugetlbpage.c| 251 ---
 5 files changed, 138 insertions(+), 138 deletions(-)

diff --git a/arch/powerpc/include/asm/hugetlb.h 
b/arch/powerpc/include/asm/hugetlb.h
index f2498c8..d750336 100644
--- a/arch/powerpc/include/asm/hugetlb.h
+++ b/arch/powerpc/include/asm/hugetlb.h
@@ -191,8 +191,14 @@ static inline void flush_hugetlb_page(struct 
vm_area_struct *vma,
  unsigned long vmaddr)
 {
 }
-#endif /* CONFIG_HUGETLB_PAGE */
 
+#define hugepd_shift(x) 0
+static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr,
+   unsigned pdshift)
+{
+   return 0;
+}
+#endif /* CONFIG_HUGETLB_PAGE */
 
 /*
  * FSL Book3E platforms require special gpage handling - the gpages
diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h 
b/arch/powerpc/include/asm/pgtable-ppc64.h
index 8f9da5e..6c9323f 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -368,19 +368,6 @@ static inline pte_t *find_linux_pte(pgd_t *pgdir, unsigned 
long ea)
return pt;
 }
 
-#ifdef CONFIG_HUGETLB_PAGE
-pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
-unsigned *shift);
-#else
-static inline pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
-  unsigned *shift)
-{
-   if (shift)
-   *shift = 0;
-   return find_linux_pte(pgdir, ea);
-}
-#endif /* !CONFIG_HUGETLB_PAGE */
-
 #endif /* __ASSEMBLY__ */
 
 /*
diff --git a/arch/powerpc/include/asm/pgtable.h 
b/arch/powerpc/include/asm/pgtable.h
index d53db93..959d575 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -224,6 +224,8 @@ extern int gup_hugepte(pte_t *ptep, unsigned long sz, 
unsigned long addr,
 #define pmd_large(pmd) 0
 #define has_transparent_hugepage() 0
 #endif
+pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
+unsigned *shift);
 #endif /* __ASSEMBLY__ */
 
 #endif /* __KERNEL__ */
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index cf16b57..fde36e6 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -28,8 +28,8 @@ obj-$(CONFIG_44x) += 44x_mmu.o
 obj-$(CONFIG_PPC_FSL_BOOK3E)   += fsl_booke_mmu.o
 obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o
 obj-$(CONFIG_PPC_MM_SLICES)+= slice.o
-ifeq ($(CONFIG_HUGETLB_PAGE),y)
 obj-y  += hugetlbpage.o
+ifeq ($(CONFIG_HUGETLB_PAGE),y)
 obj-$(CONFIG_PPC_STD_MMU_64)   += hugetlbpage-hash64.o
 obj-$(CONFIG_PPC_BOOK3E_MMU)   += hugetlbpage-book3e.o
 endif
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 237c8e5..2865077 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -21,6 +21,9 @@
 #include 
 #include 
 #include 
+#include 
+
+#ifdef CONFIG_HUGETLB_PAGE
 
 #define PAGE_SHIFT_64K 16
 #define PAGE_SHIFT_16M 24
@@ -100,66 +103,6 @@ int pgd_huge(pgd_t pgd)
 }
 #endif
 
-/*
- * We have 4 cases for pgds and pmds:
- * (1) invalid (all zeroes)
- * (2) pointer to next table, as normal; bottom 6 bits == 0
- * (3) leaf pte for huge page, bottom two bits != 00
- * (4) hugepd pointer, bottom two bits == 00, next 4 bits indicate size of 
table
- */
-pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned 
*shift)
-{
-   pgd_t *pg;
-   pud_t *pu;
-   pmd_t *pm;
-   pte_t *ret_pte;
-   hugepd_t *hpdp = NULL;
-   unsigned pdshift = PGDIR_SHIFT;
-
-   if (shift)
-   *shift = 0;
-
-   pg = pgdir + pgd_index(ea);
-
-   if (pgd_huge(*pg)) {
-   ret_pte = (pte_t *) pg;
-   goto out;
-   } else if (is_hugepd(pg))
-   hpdp = (hugepd_t *)pg;
-   else if (!pgd_none(*pg)) {
-   pdshift = PUD_SHIFT;
-   pu = pud_offset(pg, ea);
-
-   if (pud_huge(*pu)) {
-   ret_pte = (pte_t *) pu;
-   goto out;
-   } else if (is_hugepd(pu))
-   hpdp = (hugepd_t *)pu;
-   else if (!pud_none(*pu)) {
-   pdshift = PMD_SHIFT;
-   pm = pmd_offset(pu, ea);
-
-   if (pmd_huge(*pm)) {
-   ret_pte = (pte_t *) pm;
-   goto out;
-   } else if (is_hugepd(pm))
-   hpdp = (hugepd_t *)pm;
-   else if (!pmd_n

[PATCH -V11 07/15] powerpc/kvm: Handle transparent hugepage in KVM

2013-06-20 Thread Aneesh Kumar K.V
From: "Aneesh Kumar K.V" 

We can find PTEs that are splitting while walking the page tables. Return
a none PTE in that case.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/kvm_book3s_64.h | 58 +++-
 arch/powerpc/kvm/book3s_64_mmu_hv.c  |  8 +++--
 arch/powerpc/kvm/book3s_hv_rm_mmu.c  | 12 +++
 3 files changed, 44 insertions(+), 34 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h 
b/arch/powerpc/include/asm/kvm_book3s_64.h
index 9c1ff33..a1ecb14 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -159,36 +159,46 @@ static inline int hpte_cache_flags_ok(unsigned long ptel, 
unsigned long io_type)
 }
 
 /*
- * Lock and read a linux PTE.  If it's present and writable, atomically
- * set dirty and referenced bits and return the PTE, otherwise return 0.
+ * If it's present and writable, atomically set dirty and referenced bits and
+ * return the PTE, otherwise return 0. If we find a transparent hugepage
+ * and if it is marked splitting we return 0;
  */
-static inline pte_t kvmppc_read_update_linux_pte(pte_t *p, int writing)
+static inline pte_t kvmppc_read_update_linux_pte(pte_t *ptep, int writing,
+unsigned int hugepage)
 {
-   pte_t pte, tmp;
-
-   /* wait until _PAGE_BUSY is clear then set it atomically */
-   __asm__ __volatile__ (
-   "1: ldarx   %0,0,%3\n"
-   "   andi.   %1,%0,%4\n"
-   "   bne-1b\n"
-   "   ori %1,%0,%4\n"
-   "   stdcx.  %1,0,%3\n"
-   "   bne-1b"
-   : "=&r" (pte), "=&r" (tmp), "=m" (*p)
-   : "r" (p), "i" (_PAGE_BUSY)
-   : "cc");
-
-   if (pte_present(pte)) {
-   pte = pte_mkyoung(pte);
-   if (writing && pte_write(pte))
-   pte = pte_mkdirty(pte);
-   }
+   pte_t old_pte, new_pte = __pte(0);
+
+   while (1) {
+   old_pte = pte_val(*ptep);
+   /*
+* wait until _PAGE_BUSY is clear then set it atomically
+*/
+   if (unlikely(old_pte & _PAGE_BUSY)) {
+   cpu_relax();
+   continue;
+   }
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+   /* If hugepage and is trans splitting return None */
+   if (unlikely(hugepage &&
+pmd_trans_splitting(pte_pmd(old_pte
+   return __pte(0);
+#endif
+   /* If pte is not present return None */
+   if (unlikely(!(old_pte & _PAGE_PRESENT)))
+   return __pte(0);
 
-   *p = pte;   /* clears _PAGE_BUSY */
+   new_pte = pte_mkyoung(old_pte);
+   if (writing && pte_write(old_pte))
+   new_pte = pte_mkdirty(new_pte);
 
-   return pte;
+   if (old_pte == __cmpxchg_u64((unsigned long *)ptep, old_pte,
+new_pte))
+   break;
+   }
+   return new_pte;
 }
 
+
 /* Return HPTE cache control bits corresponding to Linux pte bits */
 static inline unsigned long hpte_cache_bits(unsigned long pte_val)
 {
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c 
b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 5880dfb..710d313 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -675,6 +675,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct 
kvm_vcpu *vcpu,
}
/* if the guest wants write access, see if that is OK */
if (!writing && hpte_is_writable(r)) {
+   unsigned int hugepage_shift;
pte_t *ptep, pte;
 
/*
@@ -683,9 +684,10 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, 
struct kvm_vcpu *vcpu,
 */
rcu_read_lock_sched();
ptep = find_linux_pte_or_hugepte(current->mm->pgd,
-hva, NULL);
-   if (ptep && pte_present(*ptep)) {
-   pte = kvmppc_read_update_linux_pte(ptep, 1);
+hva, &hugepage_shift);
+   if (ptep) {
+   pte = kvmppc_read_update_linux_pte(ptep, 1,
+  hugepage_shift);
if (pte_write(pte))
write_ok = 1;
}
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c 
b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index dcf892d..fc25689 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -139,20 +139,18 @@ static pte_t look

[PATCH -V11 08/15] powerpc: Update gup_pmd_range to handle transparent hugepages

2013-06-20 Thread Aneesh Kumar K.V
From: "Aneesh Kumar K.V" 

Reviewed-by: David Gibson 
Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/mm/gup.c | 10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c
index 4b921af..223a255 100644
--- a/arch/powerpc/mm/gup.c
+++ b/arch/powerpc/mm/gup.c
@@ -66,9 +66,15 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, 
unsigned long end,
pmd_t pmd = *pmdp;
 
next = pmd_addr_end(addr, end);
-   if (pmd_none(pmd))
+   /*
+* If we find a splitting transparent hugepage we
+* return zero. That will result in taking the slow
+* path which will call wait_split_huge_page()
+* if the pmd is still in splitting state
+*/
+   if (pmd_none(pmd) || pmd_trans_splitting(pmd))
return 0;
-   if (pmd_huge(pmd)) {
+   if (pmd_huge(pmd) || pmd_large(pmd)) {
if (!gup_hugepte((pte_t *)pmdp, PMD_SIZE, addr, next,
 write, pages, nr))
return 0;
-- 
1.8.1.2



[PATCH -V11 09/15] powerpc/THP: Add code to handle HPTE faults for hugepages

2013-06-20 Thread Aneesh Kumar K.V
From: "Aneesh Kumar K.V" 

The deposited PTE page in the second half of the PMD table is used to
track the state of the hash PTEs. After updating the HPTE, we mark the
corresponding slot in the deposited PTE page valid.
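
To make the indexing concrete, here is a small userspace sketch (illustrative
only; the slot encoding follows patch 03/15, while the address and hidx values
are made up):

#include <stdio.h>

/* same one-byte encoding as in patch 03/15: [4-bit hash index | valid | 000] */
static void mark_hpte_slot_valid(unsigned char *slots, unsigned int index,
				 unsigned int hidx)
{
	slots[index] = (hidx << 4) | (1 << 3);
}

int main(void)
{
	unsigned char hpte_slot_array[256] = { 0 };	/* 16MB / 64K subpages */
	unsigned long ea = 0x0a5f0000UL;		/* faulting address (made up) */
	unsigned long hpage_mask = (1UL << 24) - 1;	/* 16MB huge page */
	unsigned int shift = 16;			/* 64K actual page size */

	/* which 64K subpage of the 16MB huge page did we just hash? */
	unsigned int index = (ea & hpage_mask) >> shift;

	mark_hpte_slot_valid(hpte_slot_array, index, 0x3 /* made-up hidx */);
	printf("marked slot %u valid\n", index);
	return 0;
}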

Reviewed-by: David Gibson 
Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/mmu-hash64.h |  13 +++
 arch/powerpc/mm/Makefile  |   1 +
 arch/powerpc/mm/hash_utils_64.c   |  21 -
 arch/powerpc/mm/hugepage-hash64.c | 172 ++
 4 files changed, 203 insertions(+), 4 deletions(-)
 create mode 100644 arch/powerpc/mm/hugepage-hash64.c

diff --git a/arch/powerpc/include/asm/mmu-hash64.h 
b/arch/powerpc/include/asm/mmu-hash64.h
index 2accc96..3d6fbb0 100644
--- a/arch/powerpc/include/asm/mmu-hash64.h
+++ b/arch/powerpc/include/asm/mmu-hash64.h
@@ -340,6 +340,19 @@ extern int hash_page(unsigned long ea, unsigned long 
access, unsigned long trap)
 int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long 
vsid,
 pte_t *ptep, unsigned long trap, int local, int ssize,
 unsigned int shift, unsigned int mmu_psize);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+extern int __hash_page_thp(unsigned long ea, unsigned long access,
+  unsigned long vsid, pmd_t *pmdp, unsigned long trap,
+  int local, int ssize, unsigned int psize);
+#else
+static inline int __hash_page_thp(unsigned long ea, unsigned long access,
+ unsigned long vsid, pmd_t *pmdp,
+ unsigned long trap, int local,
+ int ssize, unsigned int psize)
+{
+   BUG();
+}
+#endif
 extern void hash_failure_debug(unsigned long ea, unsigned long access,
   unsigned long vsid, unsigned long trap,
   int ssize, int psize, int lpsize,
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index fde36e6..87671eb 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -33,6 +33,7 @@ ifeq ($(CONFIG_HUGETLB_PAGE),y)
 obj-$(CONFIG_PPC_STD_MMU_64)   += hugetlbpage-hash64.o
 obj-$(CONFIG_PPC_BOOK3E_MMU)   += hugetlbpage-book3e.o
 endif
+obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += hugepage-hash64.o
 obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage-prot.o
 obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o
 obj-$(CONFIG_HIGHMEM)  += highmem.o
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index e8434ca..7a81e86 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -1050,13 +1050,26 @@ int hash_page(unsigned long ea, unsigned long access, 
unsigned long trap)
goto bail;
}
 
-#ifdef CONFIG_HUGETLB_PAGE
if (hugeshift) {
-   rc = __hash_page_huge(ea, access, vsid, ptep, trap, local,
-   ssize, hugeshift, psize);
+   if (pmd_trans_huge(*(pmd_t *)ptep))
+   rc = __hash_page_thp(ea, access, vsid, (pmd_t *)ptep,
+trap, local, ssize, psize);
+#ifdef CONFIG_HUGETLB_PAGE
+   else
+   rc = __hash_page_huge(ea, access, vsid, ptep, trap,
+ local, ssize, hugeshift, psize);
+#else
+   else {
+   /*
+* if we have hugeshift, and is not transhuge with
+* hugetlb disabled, something is really wrong.
+*/
+   rc = 1;
+   WARN_ON(1);
+   }
+#endif
goto bail;
}
-#endif /* CONFIG_HUGETLB_PAGE */
 
 #ifndef CONFIG_PPC_64K_PAGES
DBG_LOW(" i-pte: %016lx\n", pte_val(*ptep));
diff --git a/arch/powerpc/mm/hugepage-hash64.c 
b/arch/powerpc/mm/hugepage-hash64.c
new file mode 100644
index 000..3c22fa3
--- /dev/null
+++ b/arch/powerpc/mm/hugepage-hash64.c
@@ -0,0 +1,172 @@
+/*
+ * Copyright IBM Corporation, 2013
+ * Author Aneesh Kumar K.V 
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+
+/*
+ * PPC64 THP Support for hash based MMUs
+ */
+#include 
+#include 
+
+int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
+   pmd_t *pmdp, unsigned long trap, int local, int ssize,
+   unsigned int psize)
+{
+   unsigned int index, valid;
+   unsigned char *hpte_slot_array;
+   unsigned long rflags, pa, hidx;
+   unsigned long old_pmd, new_pmd;
+   int ret, lpsize = MM

[PATCH -V11 13/15] powerpc: split hugepage when using subpage protection

2013-06-20 Thread Aneesh Kumar K.V
From: "Aneesh Kumar K.V" 

We find all the overlapping vmas and mark them such that we don't allocate
hugepages in that range. We also split any existing huge pages so that the
normal page hash entries can be invalidated and new pages faulted in with
the new protection bits.
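
For context, the interface being protected here is the powerpc subpage_prot
syscall. A minimal userspace sketch of driving it is below; this is an
assumption about the userspace ABI and not part of the patch itself
(__NR_subpage_prot is assumed to come from the powerpc asm/unistd.h, and the
map layout is 2 bits per 4K subpage, as described in sys_subpage_prot()):

#include <stdint.h>
#include <unistd.h>
#include <sys/syscall.h>

/* Hypothetical wrapper: apply a subpage protection map to [addr, addr + len) */
static long subpage_prot(unsigned long addr, unsigned long len, uint32_t *map)
{
        return syscall(__NR_subpage_prot, addr, len, map);
}

Splitting any transparent hugepage in that range (as this patch does) is what
makes the per-4K protection bits in the map enforceable again.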

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/mm/subpage-prot.c | 48 ++
 1 file changed, 48 insertions(+)

diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c
index 7c415dd..aa74acb 100644
--- a/arch/powerpc/mm/subpage-prot.c
+++ b/arch/powerpc/mm/subpage-prot.c
@@ -130,6 +130,53 @@ static void subpage_prot_clear(unsigned long addr, 
unsigned long len)
up_write(&mm->mmap_sem);
 }
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static int subpage_walk_pmd_entry(pmd_t *pmd, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
+{
+   struct vm_area_struct *vma = walk->private;
+   split_huge_page_pmd(vma, addr, pmd);
+   return 0;
+}
+
+static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr,
+   unsigned long len)
+{
+   struct vm_area_struct *vma;
+   struct mm_walk subpage_proto_walk = {
+   .mm = mm,
+   .pmd_entry = subpage_walk_pmd_entry,
+   };
+
+   /*
+* We don't try too hard, we just mark all the vma in that range
+* VM_NOHUGEPAGE and split them.
+*/
+   vma = find_vma(mm, addr);
+   /*
+* If the range is in unmapped range, just return
+*/
+   if (vma && ((addr + len) <= vma->vm_start))
+   return;
+
+   while (vma) {
+   if (vma->vm_start >= (addr + len))
+   break;
+   vma->vm_flags |= VM_NOHUGEPAGE;
+   subpage_proto_walk.private = vma;
+   walk_page_range(vma->vm_start, vma->vm_end,
+   &subpage_proto_walk);
+   vma = vma->vm_next;
+   }
+}
+#else
+static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr,
+   unsigned long len)
+{
+   return;
+}
+#endif
+
 /*
  * Copy in a subpage protection map for an address range.
  * The map has 2 bits per 4k subpage, so 32 bits per 64k page.
@@ -168,6 +215,7 @@ long sys_subpage_prot(unsigned long addr, unsigned long 
len, u32 __user *map)
return -EFAULT;
 
down_write(&mm->mmap_sem);
+   subpage_mark_vma_nohuge(mm, addr, len);
for (limit = addr + len; addr < limit; addr = next) {
next = pmd_addr_end(addr, limit);
err = -ENOMEM;
-- 
1.8.1.2

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH -V11 11/15] powerpc: Prevent gcc to re-read the pagetables

2013-06-20 Thread Aneesh Kumar K.V
From: "Aneesh Kumar K.V" 

GCC is very likely to read the pagetables just once and cache them in
the local stack or in a register, but it can also decide to re-read
the pagetables. The problem is that the pagetables in those places can
change from under gcc.

With THP/hugetlbfs the pmd (and the pgd for hugetlbfs giga pages) can
change under gup_fast. The pages won't be freed until we finish
gup_fast because we have irqs disabled, and we free these pages via an
rcu callback.
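
For readers skimming the hunks below: ACCESS_ONCE() is the standard kernel
idiom for forcing exactly one load of a memory location. A rough sketch of the
idea follows (the real definition lives in include/linux/compiler.h); the
pmd_none() check is only an illustration of the usage pattern:

/* Cast through volatile so the compiler must emit a single load and cannot
 * legally re-read the location later behind our back. */
#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))

/* Usage pattern in the walkers below: take one snapshot, then test and use
 * only that snapshot. */
pmd_t pmd = ACCESS_ONCE(*pmdp);
if (pmd_none(pmd))
        return 0;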

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/mm/gup.c | 8 
 arch/powerpc/mm/hugetlbpage.c | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c
index 223a255..49822d9 100644
--- a/arch/powerpc/mm/gup.c
+++ b/arch/powerpc/mm/gup.c
@@ -34,7 +34,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long 
addr,
 
ptep = pte_offset_kernel(&pmd, addr);
do {
-   pte_t pte = *ptep;
+   pte_t pte = ACCESS_ONCE(*ptep);
struct page *page;
 
if ((pte_val(pte) & mask) != result)
@@ -63,7 +63,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, 
unsigned long end,
 
pmdp = pmd_offset(&pud, addr);
do {
-   pmd_t pmd = *pmdp;
+   pmd_t pmd = ACCESS_ONCE(*pmdp);
 
next = pmd_addr_end(addr, end);
/*
@@ -97,7 +97,7 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, 
unsigned long end,
 
pudp = pud_offset(&pgd, addr);
do {
-   pud_t pud = *pudp;
+   pud_t pud = ACCESS_ONCE(*pudp);
 
next = pud_addr_end(addr, end);
if (pud_none(pud))
@@ -160,7 +160,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, 
int write,
 
pgdp = pgd_offset(mm, addr);
do {
-   pgd_t pgd = *pgdp;
+   pgd_t pgd = ACCESS_ONCE(*pgdp);
 
pr_devel("  %016lx: normal pgd %p\n", addr,
 (void *)pgd_val(pgd));
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index e9e6882..f2f01fd 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -1024,7 +1024,7 @@ int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned 
long addr,
if (pte_end < end)
end = pte_end;
 
-   pte = *ptep;
+   pte = ACCESS_ONCE(*ptep);
mask = _PAGE_PRESENT | _PAGE_USER;
if (write)
mask |= _PAGE_RW;
-- 
1.8.1.2

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH -V11 10/15] powerpc: Make linux pagetable walk safe with THP enabled

2013-06-20 Thread Aneesh Kumar K.V
From: "Aneesh Kumar K.V" 

We need to have irqs disabled to handle all the possible parallel updates to
the Linux page table without holding locks.

Events that we are interested in while walking page tables are:
1) Page fault
2) umap
3) THP split
4) THP collapse

A) local_irq_disabled:

1) page fault:
A none to valid transition via a page fault is not an issue because we
would either see a none or a valid entry. If it is none, we error out of
the page table walk. We may need to use on-stack values when checking the
type of page table elements, because if we do

if (!is_hugepd()) {
	if (!pmd_none()) {
	   if (pmd_bad()) {

We could take that bad condition because the pmd got converted to a hugepd
after the !is_hugepd check via a hugetlb fault.

The right way would be to check for pmd_none higher up or to use an on-stack
value.

2) A valid to none conversion via unmap:
We can safely walk the upper level table, because we don't remove the
page table entries until the RCU grace period. So even if we followed a
wrong pointer, that pointer stays valid until the grace period ends.

A returned PTE pointer needs to be atomically checked for _PAGE_PRESENT and
_PAGE_BUSY. A valid pointer returned could become none later. To prevent
pte_clear we take _PAGE_BUSY.

3) THP split:
A valid transparent hugepage is converted to normal pages. Before we split we
do pmd_splitting_flush, which sets the hugepage PTE to _PAGE_SPLITTING.
So when walking the page table we need to check for pmd_trans_splitting and
handle that. The returned pte also needs to be checked for
_PAGE_SPLITTING before setting _PAGE_BUSY, similar to _PAGE_PRESENT. We save
the value of the PTE on the stack and check for the flag in the local pte
value. If we don't have the flag set we can safely operate on the local pte
value and atomically set _PAGE_BUSY.

4) THP collapse:
A normal page gets converted to a hugepage. In the collapse path, we
mark the pmd none early (pmdp_clear_flush). With irqs disabled, if we
are already walking the page table we would see the pmd_none and won't
continue. If we see a valid PMD, we should still check for _PAGE_PRESENT
before setting _PAGE_BUSY, to make sure we didn't collapse the PTE to a
Huge PTE.
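
Putting the four cases together, the shape of a safe walk looks roughly like
the sketch below. This is illustrative only and is not the code added by this
patch; it just reuses find_linux_pte_or_hugepte() and the _PAGE_* flags
discussed above:

static int sketch_safe_walk(pgd_t *pgdir, unsigned long ea)
{
        unsigned long flags;
        unsigned int hugepage_shift;
        pte_t *ptep, pte;
        int usable = 0;

        local_irq_save(flags);          /* holds off THP split/collapse, unmap */
        ptep = find_linux_pte_or_hugepte(pgdir, ea, &hugepage_shift);
        if (ptep) {
                pte = ACCESS_ONCE(*ptep);               /* one on-stack snapshot */
                if ((pte_val(pte) & _PAGE_PRESENT) &&   /* not unmapped/collapsed */
                    !(pte_val(pte) & _PAGE_SPLITTING))  /* not a splitting THP */
                        usable = 1;     /* now safe to take _PAGE_BUSY and use it */
        }
        local_irq_restore(flags);
        return usable;
}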

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/mm/hash_utils_64.c   | 27 ---
 arch/powerpc/mm/hugepage-hash64.c |  3 ++
 arch/powerpc/mm/hugetlbpage.c | 72 +--
 arch/powerpc/mm/mem.c |  4 +++
 4 files changed, 68 insertions(+), 38 deletions(-)

diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 7a81e86..8452316 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -1180,13 +1180,25 @@ void hash_preload(struct mm_struct *mm, unsigned long 
ea,
pgdir = mm->pgd;
if (pgdir == NULL)
return;
+
+   /* Get VSID */
+   ssize = user_segment_size(ea);
+   vsid = get_vsid(mm->context.id, ea, ssize);
+   if (!vsid)
+   return;
+   /*
+* Hash doesn't like irqs. Walking linux page table with irq disabled
+* saves us from holding multiple locks.
+*/
+   local_irq_save(flags);
+
/*
 * THP pages use update_mmu_cache_pmd. We don't do
 * hash preload there. Hence can ignore THP here
 */
ptep = find_linux_pte_or_hugepte(pgdir, ea, &hugepage_shift);
if (!ptep)
-   return;
+   goto out_exit;
 
WARN_ON(hugepage_shift);
 #ifdef CONFIG_PPC_64K_PAGES
@@ -1197,18 +1209,9 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
 * page size demotion here
 */
if (pte_val(*ptep) & (_PAGE_4K_PFN | _PAGE_NO_CACHE))
-   return;
+   goto out_exit;
 #endif /* CONFIG_PPC_64K_PAGES */
 
-   /* Get VSID */
-   ssize = user_segment_size(ea);
-   vsid = get_vsid(mm->context.id, ea, ssize);
-   if (!vsid)
-   return;
-
-   /* Hash doesn't like irqs */
-   local_irq_save(flags);
-
/* Is that local to this CPU ? */
if (cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id(
local = 1;
@@ -1230,7 +1233,7 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
   mm->context.user_psize,
   mm->context.user_psize,
   pte_val(*ptep));
-
+out_exit:
local_irq_restore(flags);
 }
 
diff --git a/arch/powerpc/mm/hugepage-hash64.c 
b/arch/powerpc/mm/hugepage-hash64.c
index 3c22fa3..34de9e0 100644
--- a/arch/powerpc/mm/hugepage-hash64.c
+++ b/arch/powerpc/mm/hugepage-hash64.c
@@ -37,6 +37,9 @@ int __hash_page_thp(unsigned long ea, unsigned long access, 
unsigned long vsid,
/* If PMD busy, retry the access */
if (unlikely(old_pmd & _PAGE_BUSY))
return 0;
+   /* If PMD is trans splitting retr

[PATCH -V11 12/15] powerpc: disable assert_pte_locked for collapse_huge_page

2013-06-20 Thread Aneesh Kumar K.V
From: "Aneesh Kumar K.V" 

With THP we set the pmd to none before we do pte_clear. Hence we can't
walk the page table to get the pte lock pointer and verify whether it is
locked. THP does take the pte lock before calling pte_clear, so we don't
change the locking rules here; it is just that we can't use page table
walking to check whether pte locks are held with THP.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/mm/pgtable.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index 214130a..edda589 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -235,6 +235,14 @@ void assert_pte_locked(struct mm_struct *mm, unsigned long 
addr)
pud = pud_offset(pgd, addr);
BUG_ON(pud_none(*pud));
pmd = pmd_offset(pud, addr);
+   /*
+* khugepaged to collapse normal pages to hugepage, first set
+* pmd to none to force page fault/gup to take mmap_sem. After
+* pmd is set to none, we do a pte_clear which does this assertion
+* so if we find pmd none, return.
+*/
+   if (pmd_none(*pmd))
+   return;
BUG_ON(!pmd_present(*pmd));
assert_spin_locked(pte_lockptr(mm, pmd));
 }
-- 
1.8.1.2

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH -V11 14/15] powerpc/THP: Enable THP on PPC64

2013-06-20 Thread Aneesh Kumar K.V
From: "Aneesh Kumar K.V" 

We enable it only if we support the 16MB page size.

Reviewed-by: David Gibson 
Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/pgtable-ppc64.h |  3 +--
 arch/powerpc/mm/pgtable_64.c | 29 +
 2 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h 
b/arch/powerpc/include/asm/pgtable-ppc64.h
index e71bd25..46db094 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -444,8 +444,7 @@ static inline int pmd_trans_splitting(pmd_t pmd)
return 0;
 }
 
-/* We will enable it in the last patch */
-#define has_transparent_hugepage() 0
+extern int has_transparent_hugepage(void);
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 static inline pte_t pmd_pte(pmd_t pmd)
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index e4d3e9f..074a4a2 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -831,4 +831,33 @@ pmd_t pmdp_get_and_clear(struct mm_struct *mm,
memset(pgtable, 0, PTE_FRAG_SIZE);
return old_pmd;
 }
+
+int has_transparent_hugepage(void)
+{
+   if (!mmu_has_feature(MMU_FTR_16M_PAGE))
+   return 0;
+   /*
+* We support THP only if PMD_SIZE is 16MB.
+*/
+   if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT)
+   return 0;
+   /*
+* We need to make sure that we support 16MB hugepage in a segement
+* with base page size 64K or 4K. We only enable THP with a PAGE_SIZE
+* of 64K.
+*/
+   /*
+* If we have 64K HPTE, we will be using that by default
+*/
+   if (mmu_psize_defs[MMU_PAGE_64K].shift &&
+   (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1))
+   return 0;
+   /*
+* Ok we only have 4K HPTE
+*/
+   if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1)
+   return 0;
+
+   return 1;
+}
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-- 
1.8.1.2

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH -V11 15/15] powerpc: Optimize hugepage invalidate

2013-06-20 Thread Aneesh Kumar K.V
From: "Aneesh Kumar K.V" 

Hugepage invalidation involves invalidating multiple hpte entries.
Optimize the operation using H_BULK_REMOVE on lpar platforms.
On native, reduce the number of tlb flushes.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/machdep.h|   3 +
 arch/powerpc/mm/hash_native_64.c  |  73 
 arch/powerpc/mm/pgtable_64.c  |  12 +++-
 arch/powerpc/platforms/pseries/lpar.c | 122 --
 4 files changed, 201 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/include/asm/machdep.h 
b/arch/powerpc/include/asm/machdep.h
index 801e3c6..8b48090 100644
--- a/arch/powerpc/include/asm/machdep.h
+++ b/arch/powerpc/include/asm/machdep.h
@@ -57,6 +57,9 @@ struct machdep_calls {
void(*hpte_removebolted)(unsigned long ea,
 int psize, int ssize);
void(*flush_hash_range)(unsigned long number, int local);
+   void(*hugepage_invalidate)(struct mm_struct *mm,
+  unsigned char *hpte_slot_array,
+  unsigned long addr, int psize);
 
/* special for kexec, to be called in real mode, linear mapping is
 * destroyed as well */
diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
index 6d152bc..3f0c30a 100644
--- a/arch/powerpc/mm/hash_native_64.c
+++ b/arch/powerpc/mm/hash_native_64.c
@@ -407,6 +407,78 @@ static void native_hpte_invalidate(unsigned long slot, 
unsigned long vpn,
local_irq_restore(flags);
 }
 
+static void native_hugepage_invalidate(struct mm_struct *mm,
+  unsigned char *hpte_slot_array,
+  unsigned long addr, int psize)
+{
+   int ssize = 0, i;
+   int lock_tlbie;
+   struct hash_pte *hptep;
+   int actual_psize = MMU_PAGE_16M;
+   unsigned int max_hpte_count, valid;
+   unsigned long flags, s_addr = addr;
+   unsigned long hpte_v, want_v, shift;
+   unsigned long hidx, vpn = 0, vsid, hash, slot;
+
+   shift = mmu_psize_defs[psize].shift;
+   max_hpte_count = 1U << (PMD_SHIFT - shift);
+
+   local_irq_save(flags);
+   for (i = 0; i < max_hpte_count; i++) {
+   valid = hpte_valid(hpte_slot_array, i);
+   if (!valid)
+   continue;
+   hidx =  hpte_hash_index(hpte_slot_array, i);
+
+   /* get the vpn */
+   addr = s_addr + (i * (1ul << shift));
+   if (!is_kernel_addr(addr)) {
+   ssize = user_segment_size(addr);
+   vsid = get_vsid(mm->context.id, addr, ssize);
+   WARN_ON(vsid == 0);
+   } else {
+   vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
+   ssize = mmu_kernel_ssize;
+   }
+
+   vpn = hpt_vpn(addr, vsid, ssize);
+   hash = hpt_hash(vpn, shift, ssize);
+   if (hidx & _PTEIDX_SECONDARY)
+   hash = ~hash;
+
+   slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+   slot += hidx & _PTEIDX_GROUP_IX;
+
+   hptep = htab_address + slot;
+   want_v = hpte_encode_avpn(vpn, psize, ssize);
+   native_lock_hpte(hptep);
+   hpte_v = hptep->v;
+
+   /* Even if we miss, we need to invalidate the TLB */
+   if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID))
+   native_unlock_hpte(hptep);
+   else
+   /* Invalidate the hpte. NOTE: this also unlocks it */
+   hptep->v = 0;
+   }
+   /*
+* Since this is a hugepage, we just need a single tlbie.
+* use the last vpn.
+*/
+   lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
+   if (lock_tlbie)
+   raw_spin_lock(&native_tlbie_lock);
+
+   asm volatile("ptesync":::"memory");
+   __tlbie(vpn, psize, actual_psize, ssize);
+   asm volatile("eieio; tlbsync; ptesync":::"memory");
+
+   if (lock_tlbie)
+   raw_spin_unlock(&native_tlbie_lock);
+
+   local_irq_restore(flags);
+}
+
 static inline int __hpte_actual_psize(unsigned int lp, int psize)
 {
int i, shift;
@@ -640,4 +712,5 @@ void __init hpte_init_native(void)
ppc_md.hpte_remove  = native_hpte_remove;
ppc_md.hpte_clear_all   = native_hpte_clear;
ppc_md.flush_hash_range = native_flush_hash_range;
+   ppc_md.hugepage_invalidate   = native_hugepage_invalidate;
 }
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index 074a4a2..536eec72 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -708,6 +708,7 @@ void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned 
long 

[PATCH 2/5] powernv/opal: Disable OPAL notifier upon poweroff

2013-06-20 Thread Gavin Shan
While we're restarting or powering off the system, we no longer need
the OPAL notifier, so disable it.

Signed-off-by: Gavin Shan 
---
 arch/powerpc/platforms/powernv/setup.c |4 
 1 files changed, 4 insertions(+), 0 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/setup.c 
b/arch/powerpc/platforms/powernv/setup.c
index d4459bf..84438af 100644
--- a/arch/powerpc/platforms/powernv/setup.c
+++ b/arch/powerpc/platforms/powernv/setup.c
@@ -93,6 +93,8 @@ static void  __noreturn pnv_restart(char *cmd)
 {
long rc = OPAL_BUSY;
 
+   opal_notifier_disable();
+
while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
rc = opal_cec_reboot();
if (rc == OPAL_BUSY_EVENT)
@@ -108,6 +110,8 @@ static void __noreturn pnv_power_off(void)
 {
long rc = OPAL_BUSY;
 
+   opal_notifier_disable();
+
while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
rc = opal_cec_power_down(0);
if (rc == OPAL_BUSY_EVENT)
-- 
1.7.5.4

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH 1/5] powernv/opal: Notifier for OPAL events

2013-06-20 Thread Gavin Shan
This patch implements a notifier to receive a notification on OPAL
event mask changes. The notifier is only called as a result of an OPAL
interrupt, which will happen upon reception of FSP messages or PCI errors.
Any event mask change detected as a result of opal_poll_events() will not
result in a notifier call.
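
As an illustration, a consumer looks roughly like the EEH user later in this
series. This is a hedged sketch: opal_notifier_register() and
OPAL_EVENT_PCI_ERROR come from the real code, while the example_* names are
placeholders:

static int example_opal_event(struct notifier_block *nb,
                              unsigned long events, void *change)
{
        uint64_t changed = (uint64_t)change;    /* bits that flipped */

        if ((events & OPAL_EVENT_PCI_ERROR) &&
            (changed & OPAL_EVENT_PCI_ERROR))
                pr_info("OPAL reported a PCI error event\n");

        return 0;
}

static struct notifier_block example_nb = {
        .notifier_call  = example_opal_event,
};

/* in platform init code: */
opal_notifier_register(&example_nb);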

[benh: changelog]
Signed-off-by: Gavin Shan 
---
 arch/powerpc/include/asm/opal.h   |5 ++
 arch/powerpc/platforms/powernv/opal.c |   69 -
 2 files changed, 73 insertions(+), 1 deletions(-)

diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index 2880797..029fe85 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -644,6 +644,11 @@ extern void hvc_opal_init_early(void);
 extern int early_init_dt_scan_opal(unsigned long node, const char *uname,
   int depth, void *data);
 
+extern int opal_notifier_register(struct notifier_block *nb);
+extern void opal_notifier_enable(void);
+extern void opal_notifier_disable(void);
+extern void opal_notifier_update_evt(uint64_t evt_mask, uint64_t evt_val);
+
 extern int opal_get_chars(uint32_t vtermno, char *buf, int count);
 extern int opal_put_chars(uint32_t vtermno, const char *buf, int total_len);
 
diff --git a/arch/powerpc/platforms/powernv/opal.c 
b/arch/powerpc/platforms/powernv/opal.c
index 628c564..106301f 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -15,6 +15,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -31,6 +32,10 @@ static DEFINE_SPINLOCK(opal_write_lock);
 extern u64 opal_mc_secondary_handler[];
 static unsigned int *opal_irqs;
 static unsigned int opal_irq_count;
+static ATOMIC_NOTIFIER_HEAD(opal_notifier_head);
+static DEFINE_SPINLOCK(opal_notifier_lock);
+static uint64_t last_notified_mask = 0x0ul;
+static atomic_t opal_notifier_hold = ATOMIC_INIT(0);
 
 int __init early_init_dt_scan_opal(unsigned long node,
   const char *uname, int depth, void *data)
@@ -95,6 +100,68 @@ static int __init opal_register_exception_handlers(void)
 
 early_initcall(opal_register_exception_handlers);
 
+int opal_notifier_register(struct notifier_block *nb)
+{
+   if (!nb) {
+   pr_warning("%s: Invalid argument (%p)\n",
+  __func__, nb);
+   return -EINVAL;
+   }
+
+   atomic_notifier_chain_register(&opal_notifier_head, nb);
+   return 0;
+}
+
+static void opal_do_notifier(uint64_t events)
+{
+   unsigned long flags;
+   uint64_t changed_mask;
+
+   if (atomic_read(&opal_notifier_hold))
+   return;
+
+   spin_lock_irqsave(&opal_notifier_lock, flags);
+   changed_mask = last_notified_mask ^ events;
+   last_notified_mask = events;
+   spin_unlock_irqrestore(&opal_notifier_lock, flags);
+
+   /*
+* We feed with the event bits and changed bits for
+* enough information to the callback.
+*/
+   atomic_notifier_call_chain(&opal_notifier_head,
+  events, (void *)changed_mask);
+}
+
+void opal_notifier_update_evt(uint64_t evt_mask,
+ uint64_t evt_val)
+{
+   unsigned long flags;
+
+   spin_lock_irqsave(&opal_notifier_lock, flags);
+   last_notified_mask &= ~evt_mask;
+   last_notified_mask |= evt_val;
+   spin_unlock_irqrestore(&opal_notifier_lock, flags);
+}
+
+void opal_notifier_enable(void)
+{
+   int64_t rc;
+   uint64_t evt = 0;
+
+   atomic_set(&opal_notifier_hold, 0);
+
+   /* Process pending events */
+   rc = opal_poll_events(&evt);
+   if (rc == OPAL_SUCCESS && evt)
+   opal_do_notifier(evt);
+}
+
+void opal_notifier_disable(void)
+{
+   atomic_set(&opal_notifier_hold, 1);
+}
+
 int opal_get_chars(uint32_t vtermno, char *buf, int count)
 {
s64 len, rc;
@@ -297,7 +364,7 @@ static irqreturn_t opal_interrupt(int irq, void *data)
 
opal_handle_interrupt(virq_to_hw(irq), &events);
 
-   /* XXX TODO: Do something with the events */
+   opal_do_notifier(events);
 
return IRQ_HANDLED;
 }
-- 
1.7.5.4

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH 5/5] powerpc/eeh: Debugfs for error injection

2013-06-20 Thread Gavin Shan
The patch creates debugfs entries (powerpc/PCI/err_injct) for
injecting EEH errors for testing purposes.

Signed-off-by: Gavin Shan 
---
 arch/powerpc/platforms/powernv/eeh-ioda.c |   31 +
 1 files changed, 31 insertions(+), 0 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c 
b/arch/powerpc/platforms/powernv/eeh-ioda.c
index 2b7689e..84f3036 100644
--- a/arch/powerpc/platforms/powernv/eeh-ioda.c
+++ b/arch/powerpc/platforms/powernv/eeh-ioda.c
@@ -12,6 +12,7 @@
  */
 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -64,6 +65,29 @@ static struct notifier_block ioda_eeh_nb = {
.priority   = 0
 };
 
+#ifdef CONFIG_DEBUG_FS
+static int ioda_eeh_dbgfs_set(void *data, u64 val)
+{
+   struct pci_controller *hose = data;
+   struct pnv_phb *phb = hose->private_data;
+
+   out_be64(phb->regs + 0xD10, val);
+   return 0;
+}
+
+static int ioda_eeh_dbgfs_get(void *data, u64 *val)
+{
+   struct pci_controller *hose = data;
+   struct pnv_phb *phb = hose->private_data;
+
+   *val = in_be64(phb->regs + 0xD10);
+   return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(ioda_eeh_dbgfs_ops, ioda_eeh_dbgfs_get,
+   ioda_eeh_dbgfs_set, "0x%llx\n");
+#endif /* CONFIG_DEBUG_FS */
+
 /**
  * ioda_eeh_post_init - Chip dependent post initialization
  * @hose: PCI controller
@@ -101,6 +125,13 @@ static int ioda_eeh_post_init(struct pci_controller *hose)
}
}
 
+#ifdef CONFIG_DEBUG_FS
+   if (phb->dbgfs)
+   debugfs_create_file("err_injct", 0600,
+   phb->dbgfs, hose,
+   &ioda_eeh_dbgfs_ops);
+#endif
+
phb->eeh_enabled = 1;
}
 
-- 
1.7.5.4

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH v7 0/5] EEH Support for PowerNV platform

2013-06-20 Thread Gavin Shan

This resends the last 5 patches ([27/31] - [31/31]) of the series to support
EEH on the PowerNV platform.

v6 -> v7:
* Use atomic notifier to replace the original OPAL notifier according
  to Ben's suggestion.
* Avoid registering duplicated notifiers for OPAL_EVENT_PCI_ERROR.

---

arch/powerpc/include/asm/opal.h   |5 ++
arch/powerpc/platforms/powernv/eeh-ioda.c |   72 -
arch/powerpc/platforms/powernv/opal.c |   69 +++-
arch/powerpc/platforms/powernv/pci-ioda.c |   23 +
arch/powerpc/platforms/powernv/pci.h  |4 ++
arch/powerpc/platforms/powernv/setup.c|4 ++
6 files changed, 175 insertions(+), 2 deletions(-)

Thanks,
Gavin

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[v5][PATCH 0/6] powerpc/book3e: powerpc/book3e: make kgdb to work well

2013-06-20 Thread Tiejun Chen
Ben,

As you mentioned just now, I'm resending this other pending patch set (v5)
used to support kgdb/gdb on book3e.

v5:

* rebase on merge branch.

Note the original patch, [PATCH 5/7] kgdb/kgdbts: support ppc64, has already
been merged by Jason.

v4:

* use DEFINE_PER_CPU to allocate kgdb's thread_info
* add patch 7 to make sure we copy thread_info only when !__check_irq_replay
* leave "andi.   r14,r11,MSR_PR" out of "#ifndef CONFIG_KGDB"
  since cr0 is still used later.
* retest

v3:

* make work when enable CONFIG_RELOCATABLE
* fix one typo in patch,
  "powerpc/book3e: store critical/machine/debug exception thread info": 
ld  r1,PACAKSAVE(r13);
->  ld  r14,PACAKSAVE(r13);
* remove copying the thread_info since booke and book3e always copy
  the thread_info now when we enter the debug exception, and so drop
  the v2 patch, "book3e/kgdb: Fix a single stgep case of lazy IRQ"

v2:

* Make sure we cover CONFIG_PPC_BOOK3E_64 safely
* Use LOAD_REG_IMMEDIATE() to properly load
the value of the constant expression when loading the debug exception stack
* Copy thread info from the kernel stack when coming from user
* Rebase latest powerpc git tree

v1:
* Copy thread info only when we are from !user mode since we'll get the kernel
  stack when coming from user directly.
* remove save/restore EX_R14/EX_R15 since DBG_EXCEPTION_PROLOG already covered
  this.
* use CURRENT_THREAD_INFO() conveniently to get thread.
* fix some typos
* add a patch to make sure gdb can generate a single step properly to invoke a
  kgdb state.
* add a patch so that if we need to replay an interrupt, we don't restore the
  previous backup thread info, to make sure we can replay the interrupt later
  with a proper thread info.
* rebase latest powerpc git tree

v0:
This patchset is used to support kgdb for book3e.

--
Tiejun Chen (6):
  powerpc/book3e: load critical/machine/debug exception stack
  powerpc/book3e: store critical/machine/debug exception thread info
  book3e/kgdb: update thread's dbcr0
  powerpc/book3e: support kgdb for kernel space
  powerpc/kgdb: use DEFINE_PER_CPU to allocate kgdb's thread_info
  book3e/kgdb: Fix a single stgep case of lazy IRQ

 arch/powerpc/kernel/exceptions-64e.S |   68 --
 arch/powerpc/kernel/irq.c|   10 +
 arch/powerpc/kernel/kgdb.c   |   21 +++
 3 files changed, 88 insertions(+), 11 deletions(-)

Tiejun
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[v5][PATCH 2/6] powerpc/book3e: store critical/machine/debug exception thread info

2013-06-20 Thread Tiejun Chen
We need to store the thread info into these exception stacks' thread info,
like we already did for PPC32.

Signed-off-by: Tiejun Chen 
---
 arch/powerpc/kernel/exceptions-64e.S |   15 +++
 1 file changed, 15 insertions(+)

diff --git a/arch/powerpc/kernel/exceptions-64e.S 
b/arch/powerpc/kernel/exceptions-64e.S
index 4d8e57f..07cf657 100644
--- a/arch/powerpc/kernel/exceptions-64e.S
+++ b/arch/powerpc/kernel/exceptions-64e.S
@@ -67,6 +67,18 @@
std r10,PACA_##level##_STACK(r13);
 #endif
 
+/* Store something to exception thread info */
+#defineBOOK3E_STORE_EXC_LEVEL_THEAD_INFO(type) 
\
+   ld  r14,PACAKSAVE(r13); 
\
+   CURRENT_THREAD_INFO(r14, r14);  
\
+   CURRENT_THREAD_INFO(r15, r1);   
\
+   ld  r10,TI_FLAGS(r14);  
\
+   std r10,TI_FLAGS(r15);  
\
+   ld  r10,TI_PREEMPT(r14);
\
+   std r10,TI_PREEMPT(r15);
\
+   ld  r10,TI_TASK(r14);   
\
+   std r10,TI_TASK(r15);
+
 /* Exception prolog code for all exceptions */
 #define EXCEPTION_PROLOG(n, intnum, type, addition)\
mtspr   SPRN_SPRG_##type##_SCRATCH,r13; /* get spare registers */   \
@@ -104,6 +116,7 @@
BOOK3E_LOAD_EXC_LEVEL_STACK(CRIT);  
\
ld  r1,PACA_CRIT_STACK(r13);\
subir1,r1,SPECIAL_EXC_FRAME_SIZE;   
\
+   BOOK3E_STORE_EXC_LEVEL_THEAD_INFO(CRIT);
\
 1:
 #define SPRN_CRIT_SRR0 SPRN_CSRR0
 #define SPRN_CRIT_SRR1 SPRN_CSRR1
@@ -114,6 +127,7 @@
BOOK3E_LOAD_EXC_LEVEL_STACK(DBG);   
\
ld  r1,PACA_DBG_STACK(r13); \
subir1,r1,SPECIAL_EXC_FRAME_SIZE;   
\
+   BOOK3E_STORE_EXC_LEVEL_THEAD_INFO(DBG); 
\
 1:
 #define SPRN_DBG_SRR0  SPRN_DSRR0
 #define SPRN_DBG_SRR1  SPRN_DSRR1
@@ -124,6 +138,7 @@
BOOK3E_LOAD_EXC_LEVEL_STACK(MC);
\
ld  r1,PACA_MC_STACK(r13);  \
subir1,r1,SPECIAL_EXC_FRAME_SIZE;   
\
+   BOOK3E_STORE_EXC_LEVEL_THEAD_INFO(MC);  
\
 1:
 #define SPRN_MC_SRR0   SPRN_MCSRR0
 #define SPRN_MC_SRR1   SPRN_MCSRR1
-- 
1.7.9.5

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[v5][PATCH 1/6] powerpc/book3e: load critical/machine/debug exception stack

2013-06-20 Thread Tiejun Chen
We always allocate stacks for the critical/machine check/debug exceptions.
This is different from the normal exceptions, so we should load these
exception stacks properly like we did for booke.

Signed-off-by: Tiejun Chen 
---
 arch/powerpc/kernel/exceptions-64e.S |   49 +++---
 1 file changed, 46 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64e.S 
b/arch/powerpc/kernel/exceptions-64e.S
index 4b23119..4d8e57f 100644
--- a/arch/powerpc/kernel/exceptions-64e.S
+++ b/arch/powerpc/kernel/exceptions-64e.S
@@ -36,6 +36,37 @@
  */
 #defineSPECIAL_EXC_FRAME_SIZE  INT_FRAME_SIZE
 
+/* only on book3e */
+#define DBG_STACK_BASE dbgirq_ctx
+#define MC_STACK_BASE  mcheckirq_ctx
+#define CRIT_STACK_BASEcritirq_ctx
+
+#ifdef CONFIG_RELOCATABLE
+#define LOAD_STACK_BASE(reg, level)\
+   tovirt(r2,r2);  \
+   LOAD_REG_ADDR(reg, level##_STACK_BASE);
+#else
+#define LOAD_STACK_BASE(reg, level)\
+   LOAD_REG_IMMEDIATE(reg, level##_STACK_BASE);
+#endif
+
+#ifdef CONFIG_SMP
+#define BOOK3E_LOAD_EXC_LEVEL_STACK(level) \
+   mfspr   r14,SPRN_PIR;   \
+   slwir14,r14,3;  \
+   LOAD_STACK_BASE(r10, level);\
+   add r10,r10,r14;\
+   ld  r10,0(r10); \
+   addir10,r10,THREAD_SIZE;\
+   std r10,PACA_##level##_STACK(r13);
+#else
+#define BOOK3E_LOAD_EXC_LEVEL_STACK(level) \
+   LOAD_STACK_BASE(r10, level);\
+   ld  r10,0(r10); \
+   addir10,r10,THREAD_SIZE;\
+   std r10,PACA_##level##_STACK(r13);
+#endif
+
 /* Exception prolog code for all exceptions */
 #define EXCEPTION_PROLOG(n, intnum, type, addition)\
mtspr   SPRN_SPRG_##type##_SCRATCH,r13; /* get spare registers */   \
@@ -68,20 +99,32 @@
 #define SPRN_GDBELL_SRR1   SPRN_GSRR1
 
 #define CRIT_SET_KSTACK
\
+   andi.   r10,r11,MSR_PR; 
\
+   bne 1f; 
\
+   BOOK3E_LOAD_EXC_LEVEL_STACK(CRIT);  
\
ld  r1,PACA_CRIT_STACK(r13);\
-   subir1,r1,SPECIAL_EXC_FRAME_SIZE;
+   subir1,r1,SPECIAL_EXC_FRAME_SIZE;   
\
+1:
 #define SPRN_CRIT_SRR0 SPRN_CSRR0
 #define SPRN_CRIT_SRR1 SPRN_CSRR1
 
 #define DBG_SET_KSTACK \
+   andi.   r10,r11,MSR_PR; 
\
+   bne 1f; 
\
+   BOOK3E_LOAD_EXC_LEVEL_STACK(DBG);   
\
ld  r1,PACA_DBG_STACK(r13); \
-   subir1,r1,SPECIAL_EXC_FRAME_SIZE;
+   subir1,r1,SPECIAL_EXC_FRAME_SIZE;   
\
+1:
 #define SPRN_DBG_SRR0  SPRN_DSRR0
 #define SPRN_DBG_SRR1  SPRN_DSRR1
 
 #define MC_SET_KSTACK  \
+   andi.   r10,r11,MSR_PR; 
\
+   bne 1f; 
\
+   BOOK3E_LOAD_EXC_LEVEL_STACK(MC);
\
ld  r1,PACA_MC_STACK(r13);  \
-   subir1,r1,SPECIAL_EXC_FRAME_SIZE;
+   subir1,r1,SPECIAL_EXC_FRAME_SIZE;   
\
+1:
 #define SPRN_MC_SRR0   SPRN_MCSRR0
 #define SPRN_MC_SRR1   SPRN_MCSRR1
 
-- 
1.7.9.5

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[v5][PATCH 4/6] powerpc/book3e: support kgdb for kernel space

2013-06-20 Thread Tiejun Chen
Currently the normal debug exception path only handles exceptions coming from
userspace; we need to skip that restriction to support KGDB in kernel space.

Signed-off-by: Tiejun Chen 
---
 arch/powerpc/kernel/exceptions-64e.S |4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/exceptions-64e.S 
b/arch/powerpc/kernel/exceptions-64e.S
index 07cf657..a286b51 100644
--- a/arch/powerpc/kernel/exceptions-64e.S
+++ b/arch/powerpc/kernel/exceptions-64e.S
@@ -639,11 +639,13 @@ kernel_dbg_exc:
rfdi
 
/* Normal debug exception */
+1: andi.   r14,r11,MSR_PR; /* check for userspace again */
+#ifndef CONFIG_KGDB
/* XXX We only handle coming from userspace for now since we can't
 * quite save properly an interrupted kernel state yet
 */
-1: andi.   r14,r11,MSR_PR; /* check for userspace again */
beq kernel_dbg_exc; /* if from kernel mode */
+#endif
 
/* Now we mash up things to make it look like we are coming on a
 * normal exception
-- 
1.7.9.5

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[v5][PATCH 3/6] book3e/kgdb: update thread's dbcr0

2013-06-20 Thread Tiejun Chen
gdb always needs to generate a single step properly to invoke a kgdb
state. But with lazy interrupts, book3e can't always trigger a debug
exception with a single step, since the current task is blocked handling
those pending exceptions, and we then miss the expected dbcr
configuration needed to generate a debug exception.

So here we also update the thread's dbcr0 to make sure the current task
can go back with that otherwise-missed dbcr0 configuration.

Signed-off-by: Tiejun Chen 
---
 arch/powerpc/kernel/kgdb.c |   13 ++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kernel/kgdb.c b/arch/powerpc/kernel/kgdb.c
index c1eef24..55409ac 100644
--- a/arch/powerpc/kernel/kgdb.c
+++ b/arch/powerpc/kernel/kgdb.c
@@ -410,7 +410,7 @@ int kgdb_arch_handle_exception(int vector, int signo, int 
err_code,
   struct pt_regs *linux_regs)
 {
char *ptr = &remcom_in_buffer[1];
-   unsigned long addr;
+   unsigned long addr, dbcr0;
 
switch (remcom_in_buffer[0]) {
/*
@@ -427,8 +427,15 @@ int kgdb_arch_handle_exception(int vector, int signo, int 
err_code,
/* set the trace bit if we're stepping */
if (remcom_in_buffer[0] == 's') {
 #ifdef CONFIG_PPC_ADV_DEBUG_REGS
-   mtspr(SPRN_DBCR0,
- mfspr(SPRN_DBCR0) | DBCR0_IC | DBCR0_IDM);
+   dbcr0 = mfspr(SPRN_DBCR0) | DBCR0_IC | DBCR0_IDM;
+   mtspr(SPRN_DBCR0, dbcr0);
+#ifdef CONFIG_PPC_BOOK3E_64
+   /* With lazy interrut we have to update thread dbcr0 
here
+* to make sure we can set debug properly at last to 
invoke
+* kgdb again to work well.
+*/
+   current->thread.dbcr0 = dbcr0;
+#endif
linux_regs->msr |= MSR_DE;
 #else
linux_regs->msr |= MSR_SE;
-- 
1.7.9.5

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[v5][PATCH 5/6] powerpc/kgdb: use DEFINE_PER_CPU to allocate kgdb's thread_info

2013-06-20 Thread Tiejun Chen
Use DEFINE_PER_CPU to allocate the thread_info statically instead of with
kmalloc(). This avoids introducing more memory allocation error checking code.

Signed-off-by: Tiejun Chen 
---
 arch/powerpc/kernel/kgdb.c |5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kernel/kgdb.c b/arch/powerpc/kernel/kgdb.c
index 55409ac..cde7818 100644
--- a/arch/powerpc/kernel/kgdb.c
+++ b/arch/powerpc/kernel/kgdb.c
@@ -151,15 +151,15 @@ static int kgdb_handle_breakpoint(struct pt_regs *regs)
return 1;
 }
 
+static DEFINE_PER_CPU(struct thread_info, kgdb_thread_info);
 static int kgdb_singlestep(struct pt_regs *regs)
 {
struct thread_info *thread_info, *exception_thread_info;
-   struct thread_info *backup_current_thread_info;
+   struct thread_info *backup_current_thread_info = 
&__get_cpu_var(kgdb_thread_info);
 
if (user_mode(regs))
return 0;
 
-   backup_current_thread_info = kmalloc(sizeof(struct thread_info), 
GFP_KERNEL);
/*
 * On Book E and perhaps other processors, singlestep is handled on
 * the critical exception stack.  This causes current_thread_info()
@@ -185,7 +185,6 @@ static int kgdb_singlestep(struct pt_regs *regs)
/* Restore current_thread_info lastly. */
memcpy(exception_thread_info, backup_current_thread_info, 
sizeof *thread_info);
 
-   kfree(backup_current_thread_info);
return 1;
 }
 
-- 
1.7.9.5

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH v7 0/5] EEH Support for PowerNV platform

2013-06-20 Thread Gavin Shan
On Thu, Jun 20, 2013 at 06:13:21PM +0800, Gavin Shan wrote:
>
>It resends the last 5 patches ([27/31] - [31/31]) of the series to support EEH
>for PowerNV platform.
>
>v6 -> v7:
>   * Use atomic notifier to replace the original OPAL notifier according
> to Ben's suggestion.
>   * Avoid registering duplicated notifiers for OPAL_EVENT_PCI_ERROR.
>

It also fixes the problem pointed out by Mikey:

* Included  in pci-ioda.c for correct reference to
  "powerpc_debugfs_root".

Thanks,
Gavin

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[v5][PATCH 6/6] book3e/kgdb: Fix a single stgep case of lazy IRQ

2013-06-20 Thread Tiejun Chen
When we're in kgdb_singlestep(), we have to work around to get
thread_info by copying from the kernel stack before calling
kgdb_handle_exception(), then copying it back afterwards.

But for PPC64, we have a lazy interrupt implementation. So after
copying the thread info from the kernel stack, if we need to replay an
interrupt, we shouldn't restore that previous backup thread info, to
make sure we can replay the interrupt later with a proper
thread info.

This patch uses __check_irq_replay() to guarantee this.

Signed-off-by: Tiejun Chen 
---
 arch/powerpc/kernel/irq.c  |   10 ++
 arch/powerpc/kernel/kgdb.c |3 ++-
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index ea185e0..3625453 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -339,7 +339,17 @@ bool prep_irq_for_idle(void)
return true;
 }
 
+notrace unsigned int check_irq_replay(void)
+{
+   return __check_irq_replay();
+}
+#else
+notrace unsigned int check_irq_replay(void)
+{
+   return 0;
+}
 #endif /* CONFIG_PPC64 */
+EXPORT_SYMBOL(check_irq_replay);
 
 int arch_show_interrupts(struct seq_file *p, int prec)
 {
diff --git a/arch/powerpc/kernel/kgdb.c b/arch/powerpc/kernel/kgdb.c
index cde7818..5b30408 100644
--- a/arch/powerpc/kernel/kgdb.c
+++ b/arch/powerpc/kernel/kgdb.c
@@ -152,6 +152,7 @@ static int kgdb_handle_breakpoint(struct pt_regs *regs)
 }
 
 static DEFINE_PER_CPU(struct thread_info, kgdb_thread_info);
+extern notrace unsigned int check_irq_replay(void);
 static int kgdb_singlestep(struct pt_regs *regs)
 {
struct thread_info *thread_info, *exception_thread_info;
@@ -181,7 +182,7 @@ static int kgdb_singlestep(struct pt_regs *regs)
 
kgdb_handle_exception(0, SIGTRAP, 0, regs);
 
-   if (thread_info != exception_thread_info)
+   if ((thread_info != exception_thread_info) && (!check_irq_replay()))
/* Restore current_thread_info lastly. */
memcpy(exception_thread_info, backup_current_thread_info, 
sizeof *thread_info);
 
-- 
1.7.9.5

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH 4/5] powerpc/powernv: Debugfs directory for PHB

2013-06-20 Thread Gavin Shan
The patch creates one debugfs directory ("powerpc/PCI") for
each PHB so that we can hook the EEH error injection debugfs entry
there in the following patch.

Signed-off-by: Gavin Shan 
---
 arch/powerpc/platforms/powernv/pci-ioda.c |   23 +++
 arch/powerpc/platforms/powernv/pci.h  |4 
 2 files changed, 27 insertions(+), 0 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 48b0940..3e5c3d5 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -13,6 +13,7 @@
 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -32,6 +33,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "powernv.h"
 #include "pci.h"
@@ -968,12 +970,33 @@ static void pnv_pci_ioda_setup_DMA(void)
}
 }
 
+static void pnv_pci_ioda_create_dbgfs(void)
+{
+#ifdef CONFIG_DEBUG_FS
+   struct pci_controller *hose, *tmp;
+   struct pnv_phb *phb;
+   char name[16];
+
+   list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
+   phb = hose->private_data;
+
+   sprintf(name, "PCI%04x", hose->global_number);
+   phb->dbgfs = debugfs_create_dir(name, powerpc_debugfs_root);
+   if (!phb->dbgfs)
+   pr_warning("%s: Error on creating debugfs on PHB#%x\n",
+   __func__, hose->global_number);
+   }
+#endif /* CONFIG_DEBUG_FS */
+}
+
 static void pnv_pci_ioda_fixup(void)
 {
pnv_pci_ioda_setup_PEs();
pnv_pci_ioda_setup_seg();
pnv_pci_ioda_setup_DMA();
 
+   pnv_pci_ioda_create_dbgfs();
+
 #ifdef CONFIG_EEH
eeh_addr_cache_build();
eeh_init();
diff --git a/arch/powerpc/platforms/powernv/pci.h 
b/arch/powerpc/platforms/powernv/pci.h
index 3656a240..43906e3 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -96,6 +96,10 @@ struct pnv_phb {
int removed;
 #endif
 
+#ifdef CONFIG_DEBUG_FS
+   struct dentry   *dbgfs;
+#endif
+
 #ifdef CONFIG_PCI_MSI
unsigned intmsi_base;
unsigned intmsi32_support;
-- 
1.7.5.4

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH 3/5] powerpc/eeh: Register OPAL notifier for PCI error

2013-06-20 Thread Gavin Shan
The patch registers an OPAL event notifier and processes the PCI errors
from firmware. If we have pending PCI errors, a special EEH event
(without a bound PE) will be sent to the EEH core for processing.

Signed-off-by: Gavin Shan 
---
 arch/powerpc/platforms/powernv/eeh-ioda.c |   41 -
 1 files changed, 40 insertions(+), 1 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c 
b/arch/powerpc/platforms/powernv/eeh-ioda.c
index a3eebd1..2b7689e 100644
--- a/arch/powerpc/platforms/powernv/eeh-ioda.c
+++ b/arch/powerpc/platforms/powernv/eeh-ioda.c
@@ -18,6 +18,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -42,6 +43,26 @@
 #endif
 
 static char *hub_diag = NULL;
+static int ioda_eeh_nb_init = 0;
+
+static int ioda_eeh_event(struct notifier_block *nb,
+ unsigned long events, void *change)
+{
+   uint64_t changed_evts = (uint64_t)change;
+
+   /* We simply send special EEH event */
+   if ((changed_evts & OPAL_EVENT_PCI_ERROR) &&
+   (events & OPAL_EVENT_PCI_ERROR))
+   eeh_send_failure_event(NULL);
+
+   return 0;
+}
+
+static struct notifier_block ioda_eeh_nb = {
+   .notifier_call  = ioda_eeh_event,
+   .next   = NULL,
+   .priority   = 0
+};
 
 /**
  * ioda_eeh_post_init - Chip dependent post initialization
@@ -54,6 +75,19 @@ static char *hub_diag = NULL;
 static int ioda_eeh_post_init(struct pci_controller *hose)
 {
struct pnv_phb *phb = hose->private_data;
+   int ret;
+
+   /* Register OPAL event notifier */
+   if (!ioda_eeh_nb_init) {
+   ret = opal_notifier_register(&ioda_eeh_nb);
+   if (ret) {
+   pr_err("%s: Can't register OPAL event notifier (%d)\n",
+  __func__, ret);
+   return ret;
+   }
+
+   ioda_eeh_nb_init = 1;
+   }
 
/* FIXME: Enable it for PHB3 later */
if (phb->type == PNV_PHB_IODA1) {
@@ -736,8 +770,13 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
long rc;
int ret = 1;
 
-   /* While running here, it's safe to purge the event queue */
+   /*
+* While running here, it's safe to purge the event queue.
+* And we should keep the cached OPAL notifier event sychronized
+* between the kernel and firmware.
+*/
eeh_remove_event(NULL);
+   opal_notifier_update_evt(OPAL_EVENT_PCI_ERROR, 0x0ul);
 
list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
/*
-- 
1.7.5.4

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: DEBUG_PAGEALLOC on PPC not working (kernels 2.6-25, 3.0-34)

2013-06-20 Thread perth1415
Hi Scott,

Thanks for the reply, though a bit disheartening :-)
My understanding of the e500 MMU is not clear. It'd be nice if I could find
some way (maybe ad hoc) to debug some use-after-free page corruptions. SLAB
debugging tells me the page was modified by someone after it was freed, but
DEBUG_PAGEALLOC would have been more specific, telling me where exactly
it was getting modified.
Any debugging clues will be much appreciated.

Regards,
Partha



___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


RE: [PATCH 2/2] perf tools: Make Power7 events available for perf

2013-06-20 Thread David Laight
> I think we should be able to do something better using the C
> preprocessor, this is exactly the sort of thing it's good at.
> 
> What I mean is something like we do with arch/powerpc/include/asm/systbl.h,
> where we define the list of syscalls once, and then include it in
> multiple places, using different macro definitions to get different
> outputs.

There is a 'neat' trick - you can pass a #define macro the
name of another #define - which is then expanded after the
initial expansion. A description I happen to have is pasted below.

David

Consider what happens when #define foo(x) x(args) is expanded: foo(bar)
clearly becomes bar(args). If we also have #define bar(args) then bar()
is expanded AFTER foo() allowing us to generate any text including args.
So we have passed the name of one #define as a parameter to a different #define.

If we replace the definition of foo with
#define foo(x) x(args1) x(args2) then foo(bar) is equivalent to
bar(args1) bar(args2).
This is useful because foo(baz) expands to baz(args1) baz(args2)
allowing us to feed the same set of arguments to more than one #define.

A simple example:
#define lights(x) x(red) x(orange) x(green)
#define xx(colour) LIGHT_##colour,
enum { lights(xx) NUM_LIGHTS };
#undef xx
#define xx(colour) #colour,
static const char light_names[] = { lights(xx) };
#undef xx

This expands to:
enum { LIGHT_red, LIGHT_orange, LIGHT_green, NUM_LIGHTS };
static const char light_names[] = { ”red”, ”orange”, ”green”, };
(We needed to add NUM_LIGHTS because a trailing comma isn’t valid in a C++ 
enum.)


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH 1/2 v15] iommu/fsl: Add additional iommu attributes required by the PAMU driver.

2013-06-20 Thread Joerg Roedel
Varun,

On Wed, Apr 24, 2013 at 05:05:50PM +0530, Varun Sethi wrote:
> Added the following domain attributes for the FSL PAMU driver:
> 1. Added new iommu stash attribute, which allows setting of the
>LIODN specific stash id parameter through IOMMU API.
> 2. Added an attribute for enabling/disabling DMA to a particular
>memory window.
> 3. Added domain attribute to check for PAMUV1 specific constraints.
> 
> Signed-off-by: Varun Sethi 

Can you please rebase the driver tp v3.10-rc6 and resend asap? I will
give it another review then and request AlexW to look over the
iommu-group stuff. So we can probably get this merged for v3.11.


Thank,

Joerg


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


RE: [PATCH 2/5 v11] powerpc: Add iommu domain pointer to device archdata

2013-06-20 Thread Sethi Varun-B16395
Hi Joerg,
My PAMU driver patches depend on this patch, which was Acked by Kumar. Should I
resubmit this patch as well?

Regards
Varun

> -Original Message-
> From: Kumar Gala [mailto:ga...@kernel.crashing.org]
> Sent: Thursday, April 11, 2013 11:46 PM
> To: Sethi Varun-B16395
> Cc: j...@8bytes.org; Yoder Stuart-B08248; Wood Scott-B07421;
> io...@lists.linux-foundation.org; linuxppc-dev@lists.ozlabs.org; linux-
> ker...@vger.kernel.org; b...@kernel.crashing.org
> Subject: Re: [PATCH 2/5 v11] powerpc: Add iommu domain pointer to device
> archdata
> 
> 
> On Mar 28, 2013, at 2:53 PM, Varun Sethi wrote:
> 
> > Add an iommu domain pointer to device (powerpc) archdata.  Devices are
> > attached to iommu domains and this pointer provides a mechanism to
> > correlate between a device and the associated iommu domain.  This
> > field is set when a device is attached to a domain.
> >
> > Signed-off-by: Varun Sethi 
> > ---
> > - no change in v11.
> > - no change in v10.
> > - Added CONFIG_IOMMU_API in v9.
> > arch/powerpc/include/asm/device.h |6 ++
> > 1 files changed, 6 insertions(+), 0 deletions(-)
> 
> Acked-by: Kumar Gala 
> 
> - k


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH 2/5 v11] powerpc: Add iommu domain pointer to device archdata

2013-06-20 Thread j...@8bytes.org
On Thu, Jun 20, 2013 at 02:29:30PM +, Sethi Varun-B16395 wrote:
> Hi Joerg,
> My PAMU driver patches depend on this patch which was Ack by Kumar. Should I 
> resubmit this patch as well?

Yes, please. Add the collected Acked-bys and submit everything that is
missing in v3.10-rc6.


Joerg


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH 3/4] KVM: PPC: Add support for IOMMU in-kernel handling

2013-06-20 Thread Alex Williamson
On Thu, 2013-06-20 at 18:48 +1000, Alexey Kardashevskiy wrote:
> On 06/20/2013 05:47 PM, Benjamin Herrenschmidt wrote:
> > On Thu, 2013-06-20 at 15:28 +1000, David Gibson wrote:
> >>> Just out of curiosity - would not get_file() and fput_atomic() on a
> >> group's
> >>> file* do the right job instead of vfio_group_add_external_user() and
> >>> vfio_group_del_external_user()?
> >>
> >> I was thinking that too.  Grabbing a file reference would certainly be
> >> the usual way of handling this sort of thing.
> > 
> > But that wouldn't prevent the group ownership to be returned to
> > the kernel or another user would it ?
> 
> 
> Holding the file pointer does not let the group->container_users counter go
> to zero

How so?  Holding the file pointer means the file won't go away, which
means the group release function won't be called.  That means the group
won't go away, but that doesn't mean it's attached to an IOMMU.  A user
could call UNSET_CONTAINER.

>  and this is exactly what vfio_group_add_external_user() and
> vfio_group_del_external_user() do. The difference is only in absolute value
> - 2 vs. 3.
> 
> No change in behaviour whether I use new vfio API or simply hold file* till
> KVM closes fd created when IOMMU was connected to LIOBN.

By that notion you could open(/dev/vfio/$GROUP) and you're safe, right?
But what about SET_CONTAINER & SET_IOMMU?  All that you guarantee
holding the file pointer is that the vfio_group exists.

> And while this counter is not zero, QEMU cannot take ownership over the group.
>
> I am definitely still missing the bigger picture...

The bigger picture is that the group needs to exist AND it needs to be
setup and maintained to have IOMMU protection.  Actually, my first stab
at add_external_user doesn't look sufficient, it needs to look more like
vfio_group_get_device_fd, checking group->container->iommu and
group_viable().  As written it would allow an external user after
SET_CONTAINER without SET_IOMMU.  It should also be part of the API that
the external user must hold the file reference between add_external_user
and del_external_user and do cleanup on any exit paths.  Thanks,

Alex
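
For readers following along, a hedged sketch of the usage pattern being
described: the external user (e.g. KVM) pins the group file AND registers
itself as an external user, keeps both until it is done, and undoes both on
every exit path. The vfio_group_*_external_user() calls are the API still
under discussion in this thread, so their exact signatures below are an
assumption, not an existing interface:

struct file *filp = fget(group_fd);     /* keep the vfio group file alive */
if (!filp)
        return -EBADF;

ret = vfio_group_add_external_user(filp);       /* should fail without SET_IOMMU */
if (ret) {
        fput(filp);
        return ret;
}

/* ... make use of the group ... */

vfio_group_del_external_user(filp);
fput(filp);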

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH 2/3 v16] iommu/fsl: Add additional iommu attributes required by the PAMU driver.

2013-06-20 Thread Varun Sethi
Added the following domain attributes for the FSL PAMU driver:
1. Added a new iommu stash attribute, which allows setting of the
   LIODN-specific stash id parameter through the IOMMU API.
2. Added an attribute for enabling/disabling DMA to a particular
   memory window.
3. Added a domain attribute to check for PAMUV1-specific constraints.
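
As a hedged illustration of attributes 1 and 2 above (not part of this patch),
a caller that already holds an iommu_domain could configure them as follows,
assuming iommu_domain_set_attr() as the entry point:

struct pamu_stash_attribute stash = {
        .cpu    = 0,                    /* stash into CPU 0's cache */
        .cache  = PAMU_ATTR_CACHE_L1,   /* target the L1 cache */
};
int enable = 1;

if (iommu_domain_set_attr(domain, DOMAIN_ATTR_FSL_PAMU_STASH, &stash))
        pr_err("setting the PAMU stash attribute failed\n");

if (iommu_domain_set_attr(domain, DOMAIN_ATTR_FSL_PAMU_ENABLE, &enable))
        pr_err("enabling DMA on the window failed\n");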

Signed-off-by: Varun Sethi 
---
v16 changes:
- rebased to 3.10-rc6
v15 changes:
- Moved fsl_pamu_stash.h under arch/powerpc/include/asm.
v14 changes:
- Add FSL prefix to PAMU attributes.
v13 changes:
- created a new file include/linux/fsl_pamu_stash.h for stash
attributes.
v12 changes:
- Moved PAMU specifc stash ids and structures to PAMU header file.
- no change in v11.
- no change in v10.
 arch/powerpc/include/asm/fsl_pamu_stash.h |   39 +
 include/linux/iommu.h |   16 
 2 files changed, 55 insertions(+), 0 deletions(-)
 create mode 100644 arch/powerpc/include/asm/fsl_pamu_stash.h

diff --git a/arch/powerpc/include/asm/fsl_pamu_stash.h 
b/arch/powerpc/include/asm/fsl_pamu_stash.h
new file mode 100644
index 000..caa1b21
--- /dev/null
+++ b/arch/powerpc/include/asm/fsl_pamu_stash.h
@@ -0,0 +1,39 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright (C) 2013 Freescale Semiconductor, Inc.
+ *
+ */
+
+#ifndef __FSL_PAMU_STASH_H
+#define __FSL_PAMU_STASH_H
+
+/* cache stash targets */
+enum pamu_stash_target {
+   PAMU_ATTR_CACHE_L1 = 1,
+   PAMU_ATTR_CACHE_L2,
+   PAMU_ATTR_CACHE_L3,
+};
+
+/*
+ * This attribute allows configuring stashing specific parameters
+ * in the PAMU hardware.
+ */
+
+struct pamu_stash_attribute {
+   u32 cpu;/* cpu number */
+   u32 cache;  /* cache to stash to: L1,L2,L3 */
+};
+
+#endif  /* __FSL_PAMU_STASH_H */
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 3aeb730..7ea319e 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -58,10 +58,26 @@ struct iommu_domain {
 #define IOMMU_CAP_CACHE_COHERENCY  0x1
 #define IOMMU_CAP_INTR_REMAP   0x2 /* isolates device intrs */
 
+/*
+ * Following constraints are specific to FSL_PAMUV1:
+ *  -aperture must be power of 2, and naturally aligned
+ *  -number of windows must be power of 2, and address space size
+ *   of each window is determined by aperture size / # of windows
+ *  -the actual size of the mapped region of a window must be power
+ *   of 2 starting with 4KB and physical address must be naturally
+ *   aligned.
+ * DOMAIN_ATTR_FSL_PAMUV1 corresponds to the above mentioned constraints.
+ * The caller can invoke iommu_domain_get_attr to check if the underlying
+ * iommu implementation supports these constraints.
+ */
+
 enum iommu_attr {
DOMAIN_ATTR_GEOMETRY,
DOMAIN_ATTR_PAGING,
DOMAIN_ATTR_WINDOWS,
+   DOMAIN_ATTR_FSL_PAMU_STASH,
+   DOMAIN_ATTR_FSL_PAMU_ENABLE,
+   DOMAIN_ATTR_FSL_PAMUV1,
DOMAIN_ATTR_MAX,
 };
 
-- 
1.7.4.1




[PATCH 1/3 v11] powerpc: Add iommu domain pointer to device archdata

2013-06-20 Thread Varun Sethi
Add an iommu domain pointer to device (powerpc) archdata.  Devices
are attached to iommu domains and this pointer provides a mechanism
to correlate between a device and the associated iommu domain.  This
field is set when a device is attached to a domain.
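
As a rough sketch of how the field gets used (illustrative only; the real
assignments live in the IOMMU driver, not in this patch, and the function
names below are made up):

#include <linux/device.h>
#include <linux/iommu.h>

/* In the IOMMU driver's attach callback: */
static int example_attach_dev(struct iommu_domain *domain, struct device *dev)
{
	dev->archdata.iommu_domain = domain;	/* remember the owning domain */
	/* ... program the hardware window/LIODN setup ... */
	return 0;
}

/* Later, the domain can be recovered from the device: */
static struct iommu_domain *example_dev_domain(struct device *dev)
{
	return dev->archdata.iommu_domain;	/* NULL if never attached */
}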

Signed-off-by: Varun Sethi 
Acked-by: Kumar Gala 
---
- no change in v11.
- no change in v10.
- Added CONFIG_IOMMU_API in v9.
 arch/powerpc/include/asm/device.h |3 +++
 1 files changed, 3 insertions(+), 0 deletions(-)

diff --git a/arch/powerpc/include/asm/device.h b/arch/powerpc/include/asm/device.h
index 77e97dd..38faede 100644
--- a/arch/powerpc/include/asm/device.h
+++ b/arch/powerpc/include/asm/device.h
@@ -28,6 +28,9 @@ struct dev_archdata {
void*iommu_table_base;
} dma_data;
 
+#ifdef CONFIG_IOMMU_API
+   void*iommu_domain;
+#endif
 #ifdef CONFIG_SWIOTLB
dma_addr_t  max_direct_dma_addr;
 #endif
-- 
1.7.4.1




Re: DEBUG_PAGEALLOC on PPC not working (kernels 2.6-25, 3.0-34)

2013-06-20 Thread Scott Wood

On 06/20/2013 05:42:40 AM, perth1415 wrote:

Hi Scott,

Thanks for the reply, though a bit disheartening :-)
My understanding of the e500 MMU is not clear. It'd be nice if I could
find some way (maybe ad-hoc) to debug some use-after-free page
corruptions. SLAB debug tells me the page was modified by someone after
it was freed, but DEBUG_PAGEALLOC would have been more specific, as it
would tell me exactly where it was getting modified.
Any debugging clues will be much appreciated.


If you know an exact address that's being corrupted, you could set a
data breakpoint (by manually setting the registers, and making sure
that the exception handler will produce a dump and not ignore it as a
spurious event).  You could add code to periodically check for
corruption (from a timer, from codepaths which you suspect,
before/after IRQ handlers, etc).  If you have specific code that you
suspect may be responsible, you can have it check for poison values
before writing.  I'm not sure if slab debugging already does this, but
if not you could have it record the address of the code that last
allocated and freed the corrupted memory chunk.
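
If it helps, here is a very rough sketch of the ad-hoc "check for
corruption" idea, assuming you already know the corrupted object's address
and size; with slab debugging enabled, freed objects are filled with
POISON_FREE (0x6b) apart from the trailing POISON_END byte, which the loop
below skips.

#include <linux/kernel.h>
#include <linux/poison.h>
#include <linux/bug.h>

/* Call from suspect code paths, IRQ entry/exit, a timer, etc.
 * 'obj' and 'len' are whatever slab debug reported as corrupted. */
static void check_freed_object(const unsigned char *obj, size_t len)
{
	size_t i;

	for (i = 0; i + 1 < len; i++)
		WARN(obj[i] != POISON_FREE,
		     "freed object %p dirtied at offset %zu\n", obj, i);
}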


If you have access to a tool such as Virtutech Simics, you could use
reverse execution to find the corruption.


Or you could find a way to separate the code/data needed by exceptions
(including page tables, kernel stacks, etc) from everything else, and
only pin the former, but that's probably a lot of work.


-Scott


[PATCH] of: Specify initrd location using 64-bit

2013-06-20 Thread Santosh Shilimkar
On some PAE architectures, the entire range of physical memory could reside
outside the 32-bit limit.  These systems need the ability to specify the
initrd location using 64-bit numbers.

This patch globally modifies the early_init_dt_setup_initrd_arch() function to
use 64-bit numbers instead of the current unsigned long.

This patch is a refreshed version of Cyril's original patch against 3.10-rcx,
which now covers a few more arches that need to be patched. I had to drop
Cyril's email id since it is broken now.

Boot tested on an ARM platform and only build tested for x86.
Technically it should not break anything, but please check whether
it creates any issue for your arch if you are on the cc list.

Cc: Vineet Gupta 
Cc: Russell King 
Cc: Catalin Marinas 
Cc: Will Deacon 
Cc: Mark Salter 
Cc: Aurelien Jacquiot 
Cc: James Hogan 
Cc: Michal Simek 
Cc: Ralf Baechle 
Cc: Jonas Bonn 
Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: x...@kernel.org
Cc: a...@kernel.org
Cc: Chris Zankel 
Cc: Max Filippov 
Cc: Grant Likely 
Cc: Rob Herring 
Cc: bige...@linutronix.de
Cc: robherri...@gmail.com
Cc: Nicolas Pitre 

Cc: linux-arm-ker...@lists.infradead.org
Cc: linux-c6x-...@linux-c6x.org
Cc: linux-m...@linux-mips.org
Cc: linuxppc-dev@lists.ozlabs.org
Cc: linux-xte...@linux-xtensa.org
Cc: devicetree-disc...@lists.ozlabs.org

Signed-off-by: Santosh Shilimkar 
---
 arch/arc/mm/init.c|3 +--
 arch/arm/mm/init.c|2 +-
 arch/arm64/mm/init.c  |3 +--
 arch/c6x/kernel/devicetree.c  |3 +--
 arch/metag/mm/init.c  |3 +--
 arch/microblaze/kernel/prom.c |3 +--
 arch/mips/kernel/prom.c   |3 +--
 arch/openrisc/kernel/prom.c   |3 +--
 arch/powerpc/kernel/prom.c|3 +--
 arch/x86/kernel/devicetree.c  |3 +--
 arch/xtensa/kernel/setup.c|3 +--
 drivers/of/fdt.c  |   10 ++
 include/linux/of_fdt.h|3 +--
 13 files changed, 18 insertions(+), 27 deletions(-)

diff --git a/arch/arc/mm/init.c b/arch/arc/mm/init.c
index 4a17736..3640c74 100644
--- a/arch/arc/mm/init.c
+++ b/arch/arc/mm/init.c
@@ -157,8 +157,7 @@ void __init free_initrd_mem(unsigned long start, unsigned long end)
 #endif
 
 #ifdef CONFIG_OF_FLATTREE
-void __init early_init_dt_setup_initrd_arch(unsigned long start,
-   unsigned long end)
+void __init early_init_dt_setup_initrd_arch(u64 start, u64 end)
 {
pr_err("%s(%lx, %lx)\n", __func__, start, end);
 }
diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
index 9a5cdc0..afeaef7 100644
--- a/arch/arm/mm/init.c
+++ b/arch/arm/mm/init.c
@@ -76,7 +76,7 @@ static int __init parse_tag_initrd2(const struct tag *tag)
 __tagtable(ATAG_INITRD2, parse_tag_initrd2);
 
 #ifdef CONFIG_OF_FLATTREE
-void __init early_init_dt_setup_initrd_arch(unsigned long start, unsigned long end)
+void __init early_init_dt_setup_initrd_arch(u64 start, u64 end)
 {
phys_initrd_start = start;
phys_initrd_size = end - start;
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index f497ca7..7047708 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -44,8 +44,7 @@ static unsigned long phys_initrd_size __initdata = 0;
 
 phys_addr_t memstart_addr __read_mostly = 0;
 
-void __init early_init_dt_setup_initrd_arch(unsigned long start,
-   unsigned long end)
+void __init early_init_dt_setup_initrd_arch(u64 start, u64 end)
 {
phys_initrd_start = start;
phys_initrd_size = end - start;
diff --git a/arch/c6x/kernel/devicetree.c b/arch/c6x/kernel/devicetree.c
index bdb56f0..287d0e6 100644
--- a/arch/c6x/kernel/devicetree.c
+++ b/arch/c6x/kernel/devicetree.c
@@ -33,8 +33,7 @@ void __init early_init_devtree(void *params)
 
 
 #ifdef CONFIG_BLK_DEV_INITRD
-void __init early_init_dt_setup_initrd_arch(unsigned long start,
-   unsigned long end)
+void __init early_init_dt_setup_initrd_arch(u64 start, u64 end)
 {
initrd_start = (unsigned long)__va(start);
initrd_end = (unsigned long)__va(end);
diff --git a/arch/metag/mm/init.c b/arch/metag/mm/init.c
index d05b845..07b4412 100644
--- a/arch/metag/mm/init.c
+++ b/arch/metag/mm/init.c
@@ -419,8 +419,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
 #endif
 
 #ifdef CONFIG_OF_FLATTREE
-void __init early_init_dt_setup_initrd_arch(unsigned long start,
-   unsigned long end)
+void __init early_init_dt_setup_initrd_arch(u64 start, u64 end)
 {
pr_err("%s(%lx, %lx)\n",
   __func__, start, end);
diff --git a/arch/microblaze/kernel/prom.c b/arch/microblaze/kernel/prom.c
index 0a2c68f..62e2e8f 100644
--- a/arch/microblaze/kernel/prom.c
+++ b/arch/microblaze/kernel/prom.c
@@ -136,8 +136,7 @@ void __init early_init_devtree(void *params)
 }
 
 #ifdef CONFIG_BLK_DEV_INITRD
-void __init early_init_dt_setup_initrd_arch(unsigned long start,
-   unsigned long end)
+void __init early_init_dt_setup_initrd_arch(u64 start,

Re: [PATCH] of: Specify initrd location using 64-bit

2013-06-20 Thread Vineet Gupta
Hi Santosh,

On 06/21/2013 06:22 AM, Santosh Shilimkar wrote:
> Cc: Vineet Gupta 
> Cc: Russell King 
> Cc: Catalin Marinas 
> Cc: Will Deacon 
> Cc: Mark Salter 
> Cc: Aurelien Jacquiot 
> Cc: James Hogan 
> Cc: Michal Simek 
> Cc: Ralf Baechle 
> Cc: Jonas Bonn 
> Cc: Benjamin Herrenschmidt 
> Cc: Paul Mackerras 
> Cc: x...@kernel.org
> Cc: a...@kernel.org
> Cc: Chris Zankel 
> Cc: Max Filippov 
> Cc: Grant Likely 
> Cc: Rob Herring 
> Cc: bige...@linutronix.de
> Cc: robherri...@gmail.com
> Cc: Nicolas Pitre 
>
> Cc: linux-arm-ker...@lists.infradead.org
> Cc: linux-c6x-...@linux-c6x.org
> Cc: linux-m...@linux-mips.org
> Cc: linuxppc-dev@lists.ozlabs.org
> Cc: linux-xte...@linux-xtensa.org
> Cc: devicetree-disc...@lists.ozlabs.org
>
> Signed-off-by: Santosh Shilimkar 
> ---
>  arch/arc/mm/init.c|3 +--
>  arch/arm/mm/init.c|2 +-
>  arch/arm64/mm/init.c  |3 +--
>  arch/c6x/kernel/devicetree.c  |3 +--
>  arch/metag/mm/init.c  |3 +--
>  arch/microblaze/kernel/prom.c |3 +--
>  arch/mips/kernel/prom.c   |3 +--
>  arch/openrisc/kernel/prom.c   |3 +--
>  arch/powerpc/kernel/prom.c|3 +--
>  arch/x86/kernel/devicetree.c  |3 +--
>  arch/xtensa/kernel/setup.c|3 +--
>  drivers/of/fdt.c  |   10 ++
>  include/linux/of_fdt.h|3 +--
>  13 files changed, 18 insertions(+), 27 deletions(-)
>
> diff --git a/arch/arc/mm/init.c b/arch/arc/mm/init.c
> index 4a17736..3640c74 100644
> --- a/arch/arc/mm/init.c
> +++ b/arch/arc/mm/init.c
> @@ -157,8 +157,7 @@ void __init free_initrd_mem(unsigned long start, unsigned long end)
>  #endif
>  
>  #ifdef CONFIG_OF_FLATTREE
> -void __init early_init_dt_setup_initrd_arch(unsigned long start,
> - unsigned long end)
> +void __init early_init_dt_setup_initrd_arch(u64 start, u64 end)
>  {
>   pr_err("%s(%lx, %lx)\n", __func__, start, end);
>  }

To avoid gcc warnings, you need to fix the print format specifiers too.
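
Something along these lines should keep gcc quiet once the arguments are
u64 (a sketch of the kind of fix meant here, not a tested patch):

void __init early_init_dt_setup_initrd_arch(u64 start, u64 end)
{
	pr_err("%s(%llx, %llx)\n", __func__,
	       (unsigned long long)start, (unsigned long long)end);
}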

Thx,
-Vineet