[PATCH v2 2/6] powerpc/perf: Add PM_INST_DISP event to Power9 event list

2017-02-12 Thread Madhavan Srinivasan
Signed-off-by: Madhavan Srinivasan 
---
Changelog v1:
Fix the event code.

 arch/powerpc/perf/power9-events-list.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/powerpc/perf/power9-events-list.h 
b/arch/powerpc/perf/power9-events-list.h
index 929b56d47ad9..71a6bfee5c02 100644
--- a/arch/powerpc/perf/power9-events-list.h
+++ b/arch/powerpc/perf/power9-events-list.h
@@ -53,3 +53,6 @@ EVENT(PM_ITLB_MISS,   0x400fc)
 EVENT(PM_RUN_INST_CMPL,0x500fa)
 /* Run_cycles */
 EVENT(PM_RUN_CYC,  0x600f4)
+/* Instruction Dispatched */
+EVENT(PM_INST_DISP,0x200f2)
+EVENT(PM_INST_DISP_ALT,0x300f2)
-- 
2.7.4



[PATCH v2 1/6] powerpc/perf: Factor of event_alternative function

2017-02-12 Thread Madhavan Srinivasan
Factor out the power8 event_alternative function to share
the code with power9.
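
As a rough illustration of the shared helper (a sketch, not part of the
patch; my_event_alternatives and my_get_alternatives are made-up names),
a PMU backend built on isa207-common would wire it up along these lines,
passing its own alternatives table sorted by column 0:

static const unsigned int my_event_alternatives[][MAX_ALT] = {
	{ PM_RUN_INST_CMPL_ALT,	PM_RUN_INST_CMPL },
};

static int my_get_alternatives(u64 event, unsigned int flags, u64 alt[])
{
	return isa207_get_alternatives(event, alt, my_event_alternatives,
				       (int)ARRAY_SIZE(my_event_alternatives));
}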

Signed-off-by: Madhavan Srinivasan 
---
Changelog v1:
No changes to this patch, just a rebase

 arch/powerpc/perf/isa207-common.c | 36 
 arch/powerpc/perf/isa207-common.h |  3 +++
 arch/powerpc/perf/power8-pmu.c| 35 ++-
 3 files changed, 41 insertions(+), 33 deletions(-)

diff --git a/arch/powerpc/perf/isa207-common.c 
b/arch/powerpc/perf/isa207-common.c
index 50e598cf644b..a86fadee352b 100644
--- a/arch/powerpc/perf/isa207-common.c
+++ b/arch/powerpc/perf/isa207-common.c
@@ -338,3 +338,39 @@ void isa207_disable_pmc(unsigned int pmc, unsigned long 
mmcr[])
if (pmc <= 3)
mmcr[1] &= ~(0xffUL << MMCR1_PMCSEL_SHIFT(pmc + 1));
 }
+
+static int find_alternative(u64 event, const unsigned int ev_alt[][MAX_ALT], 
int size)
+{
+   int i, j;
+
+   for (i = 0; i < size; ++i) {
+   if (event < ev_alt[i][0])
+   break;
+
+   for (j = 0; j < MAX_ALT && ev_alt[i][j]; ++j)
+   if (event == ev_alt[i][j])
+   return i;
+   }
+
+   return -1;
+}
+
+int isa207_get_alternatives(u64 event, u64 alt[],
+   const unsigned int ev_alt[][MAX_ALT], int size)
+{
+   int i, j, num_alt = 0;
+   u64 alt_event;
+
+   alt[num_alt++] = event;
+   i = find_alternative(event, ev_alt, size);
+   if (i >= 0) {
+   /* Filter out the original event, it's already in alt[0] */
+   for (j = 0; j < MAX_ALT; ++j) {
+   alt_event = ev_alt[i][j];
+   if (alt_event && alt_event != event)
+   alt[num_alt++] = alt_event;
+   }
+   }
+
+   return num_alt;
+}
diff --git a/arch/powerpc/perf/isa207-common.h 
b/arch/powerpc/perf/isa207-common.h
index 90495f1580c7..3e9150f6690a 100644
--- a/arch/powerpc/perf/isa207-common.h
+++ b/arch/powerpc/perf/isa207-common.h
@@ -260,5 +260,8 @@ int isa207_compute_mmcr(u64 event[], int n_ev,
unsigned int hwc[], unsigned long mmcr[],
struct perf_event *pevents[]);
 void isa207_disable_pmc(unsigned int pmc, unsigned long mmcr[]);
+int isa207_get_alternatives(u64 event, u64 alt[],
+   const unsigned int ev_alt[][MAX_ALT], int size);
+
 
 #endif
diff --git a/arch/powerpc/perf/power8-pmu.c b/arch/powerpc/perf/power8-pmu.c
index d07186382f3a..ce15b19a7962 100644
--- a/arch/powerpc/perf/power8-pmu.c
+++ b/arch/powerpc/perf/power8-pmu.c
@@ -48,43 +48,12 @@ static const unsigned int event_alternatives[][MAX_ALT] = {
{ PM_RUN_INST_CMPL_ALT, PM_RUN_INST_CMPL },
 };
 
-/*
- * Scan the alternatives table for a match and return the
- * index into the alternatives table if found, else -1.
- */
-static int find_alternative(u64 event)
-{
-   int i, j;
-
-   for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) {
-   if (event < event_alternatives[i][0])
-   break;
-
-   for (j = 0; j < MAX_ALT && event_alternatives[i][j]; ++j)
-   if (event == event_alternatives[i][j])
-   return i;
-   }
-
-   return -1;
-}
-
 static int power8_get_alternatives(u64 event, unsigned int flags, u64 alt[])
 {
int i, j, num_alt = 0;
-   u64 alt_event;
-
-   alt[num_alt++] = event;
-
-   i = find_alternative(event);
-   if (i >= 0) {
-   /* Filter out the original event, it's already in alt[0] */
-   for (j = 0; j < MAX_ALT; ++j) {
-   alt_event = event_alternatives[i][j];
-   if (alt_event && alt_event != event)
-   alt[num_alt++] = alt_event;
-   }
-   }
 
+   num_alt = isa207_get_alternatives(event, alt, event_alternatives,
+   (int)ARRAY_SIZE(event_alternatives));
if (flags & PPMU_ONLY_COUNT_RUN) {
/*
 * We're only counting in RUN state, so PM_CYC is equivalent to
-- 
2.7.4



[PATCH v2 3/6] powerpc/perf: Add alternative event table and function for power9

2017-02-12 Thread Madhavan Srinivasan
Signed-off-by: Madhavan Srinivasan 
---
Changelog v1:
No changes, just a rebase

 arch/powerpc/perf/power9-pmu.c | 17 +
 1 file changed, 17 insertions(+)

diff --git a/arch/powerpc/perf/power9-pmu.c b/arch/powerpc/perf/power9-pmu.c
index 7332634e18c9..b38acff8a791 100644
--- a/arch/powerpc/perf/power9-pmu.c
+++ b/arch/powerpc/perf/power9-pmu.c
@@ -106,6 +106,21 @@ enum {
 /* PowerISA v2.07 format attribute structure*/
 extern struct attribute_group isa207_pmu_format_group;
 
+/* Table of alternatives, sorted by column 0 */
+static const unsigned int power9_event_alternatives[][MAX_ALT] = {
+   { PM_INST_DISP, PM_INST_DISP_ALT },
+};
+
+static int power9_get_alternatives(u64 event, unsigned int flags, u64 alt[])
+{
+   int num_alt = 0;
+
+   num_alt = isa207_get_alternatives(event, alt, power9_event_alternatives,
+   (int)ARRAY_SIZE(power9_event_alternatives));
+
+   return num_alt;
+}
+
 GENERIC_EVENT_ATTR(cpu-cycles, PM_CYC);
 GENERIC_EVENT_ATTR(stalled-cycles-frontend,PM_ICT_NOSLOT_CYC);
 GENERIC_EVENT_ATTR(stalled-cycles-backend, PM_CMPLU_STALL);
@@ -383,6 +398,7 @@ static struct power_pmu power9_isa207_pmu = {
.config_bhrb= power9_config_bhrb,
.bhrb_filter_map= power9_bhrb_filter_map,
.get_constraint = isa207_get_constraint,
+   .get_alternatives   = power9_get_alternatives,
.disable_pmc= isa207_disable_pmc,
.flags  = PPMU_NO_SIAR | PPMU_ARCH_207S,
.n_generic  = ARRAY_SIZE(power9_generic_events),
@@ -401,6 +417,7 @@ static struct power_pmu power9_pmu = {
.config_bhrb= power9_config_bhrb,
.bhrb_filter_map= power9_bhrb_filter_map,
.get_constraint = isa207_get_constraint,
+   .get_alternatives   = power9_get_alternatives,
.disable_pmc= isa207_disable_pmc,
.flags  = PPMU_HAS_SIER | PPMU_ARCH_207S,
.n_generic  = ARRAY_SIZE(power9_generic_events),
-- 
2.7.4



[PATCH v2 4/6] powerpc/perf: Use PM_INST_DISP for generic instructions sample

2017-02-12 Thread Madhavan Srinivasan
Since PM_INST_CMPL may not provide correct counts in all
sampling scenarios on power9 DD1, use PM_INST_DISP instead.
The patch also updates the generic instruction sampling event
accordingly.

Signed-off-by: Madhavan Srinivasan 
---
Changelog v1:
Based on the DD1 check, modified the event code used for "instructions"

 arch/powerpc/perf/power9-pmu.c | 20 ++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/perf/power9-pmu.c b/arch/powerpc/perf/power9-pmu.c
index b38acff8a791..454e9f70894f 100644
--- a/arch/powerpc/perf/power9-pmu.c
+++ b/arch/powerpc/perf/power9-pmu.c
@@ -228,6 +228,17 @@ static const struct attribute_group 
*power9_pmu_attr_groups[] = {
NULL,
 };
 
+static int power9_generic_events_dd1[] = {
+   [PERF_COUNT_HW_CPU_CYCLES] =PM_CYC,
+   [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =   PM_ICT_NOSLOT_CYC,
+   [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =PM_CMPLU_STALL,
+   [PERF_COUNT_HW_INSTRUCTIONS] =  PM_INST_DISP,
+   [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] =   PM_BRU_CMPL,
+   [PERF_COUNT_HW_BRANCH_MISSES] = PM_BR_MPRED_CMPL,
+   [PERF_COUNT_HW_CACHE_REFERENCES] =  PM_LD_REF_L1,
+   [PERF_COUNT_HW_CACHE_MISSES] =  PM_LD_MISS_L1_FIN,
+};
+
 static int power9_generic_events[] = {
[PERF_COUNT_HW_CPU_CYCLES] =PM_CYC,
[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =   PM_ICT_NOSLOT_CYC,
@@ -401,8 +412,8 @@ static struct power_pmu power9_isa207_pmu = {
.get_alternatives   = power9_get_alternatives,
.disable_pmc= isa207_disable_pmc,
.flags  = PPMU_NO_SIAR | PPMU_ARCH_207S,
-   .n_generic  = ARRAY_SIZE(power9_generic_events),
-   .generic_events = power9_generic_events,
+   .n_generic  = ARRAY_SIZE(power9_generic_events_dd1),
+   .generic_events = power9_generic_events_dd1,
.cache_events   = &power9_cache_events,
.attr_groups= power9_isa207_pmu_attr_groups,
.bhrb_nr= 32,
@@ -437,6 +448,11 @@ static int __init init_power9_pmu(void)
return -ENODEV;
 
if (cpu_has_feature(CPU_FTR_POWER9_DD1)) {
+   /*
+* Since PM_INST_CMPL may not provide right counts in all
+* sampling scenarios in power9 DD1, instead use PM_INST_DISP.
+*/
+   EVENT_VAR(PM_INST_CMPL, _g).id = PM_INST_DISP;
rc = register_power_pmu(&power9_isa207_pmu);
} else {
rc = register_power_pmu(&power9_pmu);
-- 
2.7.4



[PATCH v2 5/6] powerpc/perf: Use Instruction Counter value

2017-02-12 Thread Madhavan Srinivasan
Since PM_INST_DISP includes speculative instructions, the
dispatch count can vary considerably with the workload.
Hence, as an alternative for counting completed
instructions, program the PM_INST_DISP event into the
MMCR* registers but report the Instruction Counter
register value.
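
A minimal sketch of the delta computation the patch performs against
the saved ic_init snapshot (ic_delta is a made-up helper name; the
unsigned 64-bit arithmetic makes the wrap-around case work out):

static u64 ic_delta(u64 ic_now, u64 ic_init)
{
	if (ic_now > ic_init)
		return ic_now - ic_init;
	/* Counter wrapped since the snapshot was taken. */
	return ic_now + (0 - ic_init);
}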

Signed-off-by: Madhavan Srinivasan 
---
Changelog v1:
1)Removed the #ifdef and added changes for EBB count updates
 
 arch/powerpc/perf/core-book3s.c | 36 
 1 file changed, 36 insertions(+)

diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 270eb9b74e2e..87d17a1f7168 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -57,6 +57,7 @@ struct cpu_hw_events {
void*bhrb_context;
struct  perf_branch_stack   bhrb_stack;
struct  perf_branch_entry   bhrb_entries[BHRB_MAX_ENTRIES];
+   u64 ic_init;
 };
 
 static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events);
@@ -127,6 +128,10 @@ static inline void power_pmu_bhrb_disable(struct 
perf_event *event) {}
 static void power_pmu_sched_task(struct perf_event_context *ctx, bool 
sched_in) {}
 static inline void power_pmu_bhrb_read(struct cpu_hw_events *cpuhw) {}
 static void pmao_restore_workaround(bool ebb) { }
+static bool use_ic(u64 event)
+{
+   return false;
+}
 #endif /* CONFIG_PPC32 */
 
 static bool regs_use_siar(struct pt_regs *regs)
@@ -688,6 +693,15 @@ static void pmao_restore_workaround(bool ebb)
mtspr(SPRN_PMC5, pmcs[4]);
mtspr(SPRN_PMC6, pmcs[5]);
 }
+
+static bool use_ic(u64 event)
+{
+   if (cpu_has_feature(CPU_FTR_POWER9_DD1) &&
+   (event == 0x200f2 || event == 0x300f2))
+   return true;
+
+   return false;
+}
 #endif /* CONFIG_PPC64 */
 
 static void perf_event_interrupt(struct pt_regs *regs);
@@ -1007,6 +1021,7 @@ static u64 check_and_compute_delta(u64 prev, u64 val)
 static void power_pmu_read(struct perf_event *event)
 {
s64 val, delta, prev;
+   struct cpu_hw_events *cpuhw = this_cpu_ptr(&cpu_hw_events);
 
if (event->hw.state & PERF_HES_STOPPED)
return;
@@ -1016,6 +1031,13 @@ static void power_pmu_read(struct perf_event *event)
 
if (is_ebb_event(event)) {
val = read_pmc(event->hw.idx);
+   if (use_ic(event->attr.config)) {
+   val = mfspr(SPRN_IC);
+   if (val > cpuhw->ic_init)
+   val = val - cpuhw->ic_init;
+   else
+   val = val + (0 - cpuhw->ic_init);
+   }
local64_set(&event->hw.prev_count, val);
return;
}
@@ -1029,6 +1051,13 @@ static void power_pmu_read(struct perf_event *event)
prev = local64_read(&event->hw.prev_count);
barrier();
val = read_pmc(event->hw.idx);
+   if (use_ic(event->attr.config)) {
+   val = mfspr(SPRN_IC);
+   if (val > cpuhw->ic_init)
+   val = val - cpuhw->ic_init;
+   else
+   val = val + (0 - cpuhw->ic_init);
+   }
delta = check_and_compute_delta(prev, val);
if (!delta)
return;
@@ -1466,6 +1495,13 @@ static int power_pmu_add(struct perf_event *event, int 
ef_flags)
event->attr.branch_sample_type);
}
 
+   /*
+* Workaround for POWER9 DD1 to use the Instruction Counter
+* register value for instruction counting
+*/
+   if (use_ic(event->attr.config))
+   cpuhw->ic_init = mfspr(SPRN_IC);
+
perf_pmu_enable(event->pmu);
local_irq_restore(flags);
return ret;
-- 
2.7.4



[PATCH v2 6/6] powerpc/perf: Add restrictions to PMC5 in power9 DD1

2017-02-12 Thread Madhavan Srinivasan
PMC5 on POWER9 DD1 may not provide correct counts in all
sampling scenarios, so prefer the PM_INST_DISP event in
PMC2 or PMC3 instead.

Signed-off-by: Madhavan Srinivasan 
---
Changelog v1:
No changes, just a rebase

 arch/powerpc/perf/isa207-common.h | 4 
 arch/powerpc/perf/power9-pmu.c| 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/perf/isa207-common.h 
b/arch/powerpc/perf/isa207-common.h
index 3e9150f6690a..cf9bd8990159 100644
--- a/arch/powerpc/perf/isa207-common.h
+++ b/arch/powerpc/perf/isa207-common.h
@@ -222,6 +222,10 @@
CNST_PMC_VAL(1) | CNST_PMC_VAL(2) | CNST_PMC_VAL(3) | \
CNST_PMC_VAL(4) | CNST_PMC_VAL(5) | CNST_PMC_VAL(6) | CNST_NC_VAL
 
+/*
+ * Lets restrict use of PMC5 for instruction counting.
+ */
+#define P9_DD1_TEST_ADDER  (ISA207_TEST_ADDER | CNST_PMC_VAL(5))
 
 /* Bits in MMCR1 for PowerISA v2.07 */
 #define MMCR1_UNIT_SHIFT(pmc)  (60 - (4 * ((pmc) - 1)))
diff --git a/arch/powerpc/perf/power9-pmu.c b/arch/powerpc/perf/power9-pmu.c
index 454e9f70894f..5fe9cb1dc3b6 100644
--- a/arch/powerpc/perf/power9-pmu.c
+++ b/arch/powerpc/perf/power9-pmu.c
@@ -423,7 +423,7 @@ static struct power_pmu power9_pmu = {
.name   = "POWER9",
.n_counter  = MAX_PMU_COUNTERS,
.add_fields = ISA207_ADD_FIELDS,
-   .test_adder = ISA207_TEST_ADDER,
+   .test_adder = P9_DD1_TEST_ADDER,
.compute_mmcr   = isa207_compute_mmcr,
.config_bhrb= power9_config_bhrb,
.bhrb_filter_map= power9_bhrb_filter_map,
-- 
2.7.4



[PATCH v2] powerpc/perf: Add constraints for power9 l2/l3 bus events

2017-02-12 Thread Madhavan Srinivasan
In Power9, L2/L3 bus events are always available as a
"bank" of 4 events. To obtain the counts for any of the
L2/L3 bus events in a given bank, the user has to
program PMC4 with the corresponding L2/L3 bus event for
that bank. The patch adds a mask and a new pass that
updates the mask for each PMC used by an L2/L3 bus event
and checks the mask to enforce this rule.
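
To illustrate the rule the new pass enforces, here is a minimal sketch
built on the PMC/unit decode used in the diff below (check_l2l3_bank is
a made-up name, not part of the patch): a group that uses any L2/L3
unit (6-9) must also place one of the bank's events on PMC4.

static int check_l2l3_bank(u64 event[], int n_ev)
{
	unsigned int pmc, unit, mask = 0;
	int i;

	for (i = 0; i < n_ev; ++i) {
		pmc  = (event[i] >> EVENT_PMC_SHIFT) & EVENT_PMC_MASK;
		unit = (event[i] >> EVENT_UNIT_SHIFT) & EVENT_UNIT_MASK;
		if (unit >= 6 && unit <= 9)
			mask |= 1 << (pmc - 1);		/* PMCn -> bit n-1 */
	}
	/* Bit 3 (0x8) set means PMC4 carries one of the bank's events. */
	return (mask && (mask & 0xf) < 0x8) ? -1 : 0;
}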

Signed-off-by: Madhavan Srinivasan 
---
Changelog v1:
Removed the callback and added a new pass

 arch/powerpc/perf/isa207-common.c | 20 +++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/perf/isa207-common.c 
b/arch/powerpc/perf/isa207-common.c
index 50e598cf644b..ce12e93dbd16 100644
--- a/arch/powerpc/perf/isa207-common.c
+++ b/arch/powerpc/perf/isa207-common.c
@@ -219,7 +219,7 @@ int isa207_compute_mmcr(u64 event[], int n_ev,
   struct perf_event *pevents[])
 {
unsigned long mmcra, mmcr1, mmcr2, unit, combine, psel, cache, val;
-   unsigned int pmc, pmc_inuse;
+   unsigned int pmc, pmc_inuse, mask=0;
int i;
 
pmc_inuse = 0;
@@ -312,6 +312,24 @@ int isa207_compute_mmcr(u64 event[], int n_ev,
hwc[i] = pmc - 1;
}
 
+/*
+ * Pass 3: to Check for l2/l3 bus event rule. PMC4
+ * must be programmed to use L2/L3 bus events in any other PMC[1/2/3]s
+ */
+   if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+   for (i = 0; i < n_ev; ++i) {
+   pmc = (event[i] >> EVENT_PMC_SHIFT) & 
EVENT_PMC_MASK;
+   unit= (event[i] >> EVENT_UNIT_SHIFT) & 
EVENT_UNIT_MASK;
+   if (unit >= 6 && unit <= 9)
+   mask |= 1 << (pmc - 1);
+   }
+
+   if ((mask) && ((mask & 0xf) < 0x8)) {
+   printk(KERN_ERR "Missing PMC4 L2/L3 Bus event\n");
+   return -1;
+   }
+   }
+
/* Return MMCRx values */
mmcr[0] = 0;
 
-- 
2.7.4



Re: [PATCH] powerpc: Blacklist GCC 5.4 6.1 and 6.2

2017-02-12 Thread Cyril Bur
On Fri, 2017-02-10 at 08:48 +0100, Christophe LEROY wrote:
> 
> > On 10/02/2017 at 06:31, Cyril Bur wrote:
> > A bug in the -O2 optimisation of GCC 5.4, 6.1 and 6.2 causes
> > setup_command_line() to not pass the correct first argument to strcpy
> > and therefore not actually copy the command_line.
> > 
> > A workaround patch was proposed: http://patchwork.ozlabs.org/patch/673130/
> > some discussion ensued.
> > 
> > A GCC bug was raised: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=71709
> > The bug has been fixed in 7.0 and backported to GCC 5 and GCC 6.
> > 
> > At the time of writing GCC 5.4 is the most recent and is affected. GCC
> > 6.3 contains the backported fix, has been tested and appears safe to
> > use.
> > 
> > Heavy-lifting-by: Akshay Adiga 
> > Signed-off-by: Cyril Bur 
> > ---
> >  arch/powerpc/Makefile | 9 +
> >  1 file changed, 9 insertions(+)
> > 
> > diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
> > index 31286fa7873c..a4b886694391 100644
> > --- a/arch/powerpc/Makefile
> > +++ b/arch/powerpc/Makefile
> > @@ -414,6 +414,15 @@ checkbin:
> > echo -n '*** Please use a different binutils version.' ; \
> > false ; \
> > fi
> > +   @if test "$(cc-version)" = "0504" \
> > +   || test "$(cc-version)" = "0601" \
> > +   || test "$(cc-version)" = "0602" ; then \
> > +   echo -n '*** GCC-5.4 6.1 6.2 have a bad -O2 optimisation ' ; \
> > +   echo 'which will cause lost commandline options (at least).' ; \
> > +   echo '*** Please use a different GCC version.' ; \
> > +   false ; \
> > +   fi
> > +
> 
> According to the GCC bug report, this bug applies to powerpc64le
> Why force all targets to not use those versions of GCC ?
> 

True, I'll double check on BE; these days BE does suffer from "oops, I
didn't check that". If it is only LE, I'll add another condition.

Thanks,

Cyril

> Christophe


Re: [PATCH] powernv/opal: Handle OPAL_WRONG_STATE error from OPAL fails

2017-02-12 Thread Michael Ellerman
Vipin K Parashar  writes:

> OPAL returns OPAL_WRONG_STATE for XSCOM operations
> done to read any core FIR which is sleeping, offline.

OK.

Do we know why Linux is causing that to happen?

It's also returned from many of the XIVE routines if we're in the wrong
xive mode, all of which would indicate a fairly bad Linux bug.

Also the skiboot patch which added WRONG_STATE for XSCOM ops did so
explicitly so we could differentiate from other errors:

commit 9c2d82394fd2303847cac4a665dee62556ca528a
Author: Russell Currey 
AuthorDate: Mon Mar 21 12:00:00 2016 +1100

xscom: Return OPAL_WRONG_STATE on XSCOM ops if CPU is asleep

xscom_read and xscom_write return OPAL_SUCCESS if they worked, and
OPAL_HARDWARE if they didn't.  This doesn't provide information about why
the operation failed, such as if the CPU happens to be asleep.

This is specifically useful in error scanning, so if every CPU is being
scanned for errors, sleeping CPUs likely aren't the cause of failures.

So, return OPAL_WRONG_STATE in xscom_read and xscom_write if the CPU is
sleeping.

Signed-off-by: Russell Currey 
Reviewed-by: Alistair Popple 
Signed-off-by: Stewart Smith 



So I'm still not convinced that quietly swallowing this error and
mapping it to -EIO along with several of the other error codes is the
right thing to do.

cheers


[PATCH v2] powerpc: Blacklist GCC 5.4 6.1 and 6.2

2017-02-12 Thread Cyril Bur
A bug in the -O2 optimisation of GCC 5.4, 6.1 and 6.2 causes
setup_command_line() to not pass the correct first argument to strcpy
and therefore not actually copy the command_line.

A workaround patch was proposed: http://patchwork.ozlabs.org/patch/673130/
some discussion ensued.

A GCC bug was raised: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=71709
The bug has been fixed in 7.0 and backported to GCC 5 and GCC 6.

At the time of writing GCC 5.4 is the most recent and is affected. GCC
6.3 contains the backported fix, has been tested and appears safe to
use.

Heavy-lifting-by: Akshay Adiga 
Signed-off-by: Cyril Bur 
---
v2: Added check to only blacklist compilers on little-endian

 arch/powerpc/Makefile | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
index 31286fa7873c..db5d8dabf1ca 100644
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -381,6 +381,7 @@ TOUT:= .tmp_gas_check
 # - gcc-3.4 and binutils-2.14 are a fatal combination
 # - Require gcc 4.0 or above on 64-bit
 # - gcc-4.2.0 has issues compiling modules on 64-bit
+# - gcc-5.4, 6.1, 6.2 don't copy the command_line around correctly
 checkbin:
@if test "$(cc-name)" != "clang" \
&& test "$(cc-version)" = "0304" ; then \
@@ -414,6 +415,16 @@ checkbin:
echo -n '*** Please use a different binutils version.' ; \
false ; \
fi
+   @if test "x${CONFIG_CPU_LITTLE_ENDIAN}" = "xy" \
+   && { test "$(cc-version)" = "0504" \
+   || test "$(cc-version)" = "0601" \
+   || test "$(cc-version)" = "0602" ; } ; then \
+   echo -n '*** GCC-5.4 6.1 6.2 have a bad -O2 optimisation ' ; \
+   echo 'which will cause lost command_line options (at least).' ; 
\
+   echo '*** Please use a different GCC version.' ; \
+   false ; \
+   fi
+
 
 
 CLEAN_FILES += $(TOUT)
-- 
2.11.1



[PATCH V2 0/7] powerpc/mm/ppc64: Add 128TB support

2017-02-12 Thread Aneesh Kumar K.V
This patch series increases the effective virtual address range of
applications from 64TB to 128TB. We do that by supporting a
68 bit virtual address. On platforms that can only do a 65 bit
virtual address we limit the max number of contexts to a 16 bit
value instead of 19 bits.

The patch series also switches the page table layout such that we can
map a 512TB effective address range. But we still limit TASK_SIZE to
128TB. This was done to make sure we don't break applications
that make assumptions about the max address returned by the
OS. We can switch to 128TB without a Linux personality value because
other architectures already use 128TB as the max address.
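
A back-of-the-envelope view of where those numbers come from, using
the constants defined later in the series (a sketch of the arithmetic,
not code from the patches):

/* SID_SHIFT = 28 (256MB segments), VA_BITS = 68, CONTEXT_BITS = 19  */
/*   ESID_BITS    = VA_BITS - (SID_SHIFT + CONTEXT_BITS)             */
/*                = 68 - (28 + 19) = 21                              */
/*   per context  = 2^(ESID_BITS + SID_SHIFT) = 2^49 = 512TB         */
/* On 65-bit-VA platforms (p4/p5):                                   */
/*   context bits = 65 - (SID_SHIFT + ESID_BITS) = 65 - 49 = 16      */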

Aneesh Kumar K.V (7):
  powerpc/mm/slice: Convert slice_mask high slice to a bitmap
  powerpc/mm/slice: Update the function prototype
  powerpc/mm/hash: Move kernel context to the starting of context range
  powerpc/mm/hash: Support 68 bit VA
  powerpc/mm: Move copy_mm_to_paca to paca.c
  powerpc/mm: Remove redundant TASK_SIZE_USER64 checks
  powerpc/mm/hash: Increase VA range to 128TB

 arch/powerpc/include/asm/book3s/64/hash-4k.h  |   2 +-
 arch/powerpc/include/asm/book3s/64/hash-64k.h |   2 +-
 arch/powerpc/include/asm/book3s/64/mmu-hash.h | 160 +++---
 arch/powerpc/include/asm/mmu.h|  19 ++-
 arch/powerpc/include/asm/mmu_context.h|   2 +-
 arch/powerpc/include/asm/paca.h   |  18 +--
 arch/powerpc/include/asm/page_64.h|  15 +--
 arch/powerpc/include/asm/processor.h  |  22 +++-
 arch/powerpc/kernel/paca.c|  26 +
 arch/powerpc/kvm/book3s_64_mmu_host.c |  10 +-
 arch/powerpc/mm/hash_utils_64.c   |   9 +-
 arch/powerpc/mm/init_64.c |   4 -
 arch/powerpc/mm/mmu_context_book3s64.c|  96 +++-
 arch/powerpc/mm/pgtable_64.c  |   5 -
 arch/powerpc/mm/slb.c |   2 +-
 arch/powerpc/mm/slb_low.S |  74 
 arch/powerpc/mm/slice.c   | 149 ++--
 17 files changed, 379 insertions(+), 236 deletions(-)

-- 
2.7.4



[PATCH V2 1/7] powerpc/mm/slice: Convert slice_mask high slice to a bitmap

2017-02-12 Thread Aneesh Kumar K.V
In a followup patch we want to increase the VA range, which will
require high_slices to have more than 64 bits. To enable this,
convert high_slices to a bitmap. We keep the number of bits the
same in this patch and later change it to a higher value.
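
Roughly, the conversion swaps direct bit arithmetic on a u64 for the
generic bitmap helpers, as in this sketch (sketch_mark_high_slice is a
made-up name, not from the patch):

static bool sketch_mark_high_slice(struct slice_mask *m, unsigned long i)
{
	bitmap_zero(m->high_slices, SLICE_NUM_HIGH);	/* was: m->high_slices = 0          */
	__set_bit(i, m->high_slices);			/* was: m->high_slices |= 1ul << i  */
	return test_bit(i, m->high_slices);		/* was: m->high_slices & (1ul << i) */
}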

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/page_64.h |  15 +++---
 arch/powerpc/mm/slice.c| 106 -
 2 files changed, 76 insertions(+), 45 deletions(-)

diff --git a/arch/powerpc/include/asm/page_64.h 
b/arch/powerpc/include/asm/page_64.h
index dd5f0712afa2..7f72659b7999 100644
--- a/arch/powerpc/include/asm/page_64.h
+++ b/arch/powerpc/include/asm/page_64.h
@@ -98,19 +98,16 @@ extern u64 ppc64_pft_size;
 #define GET_LOW_SLICE_INDEX(addr)  ((addr) >> SLICE_LOW_SHIFT)
 #define GET_HIGH_SLICE_INDEX(addr) ((addr) >> SLICE_HIGH_SHIFT)
 
+#ifndef __ASSEMBLY__
 /*
- * 1 bit per slice and we have one slice per 1TB
- * Right now we support only 64TB.
- * IF we change this we will have to change the type
- * of high_slices
+ * One bit per slice. We have lower slices which cover 256MB segments
+ * upto 4G range. That gets us 16 low slices. For the rest we track slices
+ * in 1TB size.
+ * 64 below is actually SLICE_NUM_HIGH to fixup complie errros
  */
-#define SLICE_MASK_SIZE 8
-
-#ifndef __ASSEMBLY__
-
 struct slice_mask {
u16 low_slices;
-   u64 high_slices;
+   DECLARE_BITMAP(high_slices, 64);
 };
 
 struct mm_struct;
diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
index 2b27458902ee..84dfb2b58870 100644
--- a/arch/powerpc/mm/slice.c
+++ b/arch/powerpc/mm/slice.c
@@ -36,11 +36,6 @@
 #include 
 #include 
 
-/* some sanity checks */
-#if (H_PGTABLE_RANGE >> 43) > SLICE_MASK_SIZE
-#error H_PGTABLE_RANGE exceeds slice_mask high_slices size
-#endif
-
 static DEFINE_SPINLOCK(slice_convert_lock);
 
 
@@ -49,7 +44,7 @@ int _slice_debug = 1;
 
 static void slice_print_mask(const char *label, struct slice_mask mask)
 {
-   char*p, buf[16 + 3 + 64 + 1];
+   char*p, buf[SLICE_NUM_LOW + 3 + SLICE_NUM_HIGH + 1];
int i;
 
if (!_slice_debug)
@@ -60,8 +55,12 @@ static void slice_print_mask(const char *label, struct 
slice_mask mask)
*(p++) = ' ';
*(p++) = '-';
*(p++) = ' ';
-   for (i = 0; i < SLICE_NUM_HIGH; i++)
-   *(p++) = (mask.high_slices & (1ul << i)) ? '1' : '0';
+   for (i = 0; i < SLICE_NUM_HIGH; i++) {
+   if (test_bit(i, mask.high_slices))
+   *(p++) = '1';
+   else
+   *(p++) = '0';
+   }
*(p++) = 0;
 
printk(KERN_DEBUG "%s:%s\n", label, buf);
@@ -80,7 +79,10 @@ static struct slice_mask slice_range_to_mask(unsigned long 
start,
 unsigned long len)
 {
unsigned long end = start + len - 1;
-   struct slice_mask ret = { 0, 0 };
+   struct slice_mask ret;
+
+   ret.low_slices = 0;
+   bitmap_zero(ret.high_slices, SLICE_NUM_HIGH);
 
if (start < SLICE_LOW_TOP) {
unsigned long mend = min(end, SLICE_LOW_TOP);
@@ -91,9 +93,8 @@ static struct slice_mask slice_range_to_mask(unsigned long 
start,
}
 
if ((start + len) > SLICE_LOW_TOP)
-   ret.high_slices = (1ul << (GET_HIGH_SLICE_INDEX(end) + 1))
-   - (1ul << GET_HIGH_SLICE_INDEX(start));
-
+   bitmap_set(ret.high_slices, GET_HIGH_SLICE_INDEX(start),
+  GET_HIGH_SLICE_INDEX(len));
return ret;
 }
 
@@ -130,9 +131,12 @@ static int slice_high_has_vma(struct mm_struct *mm, 
unsigned long slice)
 
 static struct slice_mask slice_mask_for_free(struct mm_struct *mm)
 {
-   struct slice_mask ret = { 0, 0 };
+   struct slice_mask ret;
unsigned long i;
 
+   ret.low_slices = 0;
+   bitmap_zero(ret.high_slices, SLICE_NUM_HIGH);
+
for (i = 0; i < SLICE_NUM_LOW; i++)
if (!slice_low_has_vma(mm, i))
ret.low_slices |= 1u << i;
@@ -142,7 +146,7 @@ static struct slice_mask slice_mask_for_free(struct 
mm_struct *mm)
 
for (i = 0; i < SLICE_NUM_HIGH; i++)
if (!slice_high_has_vma(mm, i))
-   ret.high_slices |= 1ul << i;
+   __set_bit(i, ret.high_slices);
 
return ret;
 }
@@ -151,10 +155,13 @@ static struct slice_mask slice_mask_for_size(struct 
mm_struct *mm, int psize)
 {
unsigned char *hpsizes;
int index, mask_index;
-   struct slice_mask ret = { 0, 0 };
+   struct slice_mask ret;
unsigned long i;
u64 lpsizes;
 
+   ret.low_slices = 0;
+   bitmap_zero(ret.high_slices, SLICE_NUM_HIGH);
+
lpsizes = mm->context.low_slices_psize;
for (i = 0; i < SLICE_NUM_LOW; i++)
if (((lpsizes >> (i * 4)) & 0xf) == psize)
@@ -165,7 +172,7 @@ static struct slice_mask slice_mask_for_size(struct 
mm_struct 

[PATCH V2 2/7] powerpc/mm/slice: Update the function prototype

2017-02-12 Thread Aneesh Kumar K.V
This avoids copying the slice_mask struct as a function return value.
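
The change is purely one of calling convention; callers now declare
the mask locally and pass its address, roughly as in this sketch
(caller_sketch is a made-up name):

static void caller_sketch(struct mm_struct *mm, int psize)
{
	struct slice_mask good_mask;

	/* was: good_mask = slice_mask_for_size(mm, psize); */
	slice_mask_for_size(mm, psize, &good_mask);
}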

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/mm/slice.c | 63 +++--
 1 file changed, 29 insertions(+), 34 deletions(-)

diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
index 84dfb2b58870..0ec750adbe7f 100644
--- a/arch/powerpc/mm/slice.c
+++ b/arch/powerpc/mm/slice.c
@@ -75,27 +75,26 @@ static void slice_print_mask(const char *label, struct 
slice_mask mask) {}
 
 #endif
 
-static struct slice_mask slice_range_to_mask(unsigned long start,
-unsigned long len)
+static void slice_range_to_mask(unsigned long start, unsigned long len,
+   struct slice_mask *ret)
 {
unsigned long end = start + len - 1;
-   struct slice_mask ret;
 
-   ret.low_slices = 0;
-   bitmap_zero(ret.high_slices, SLICE_NUM_HIGH);
+   ret->low_slices = 0;
+   bitmap_zero(ret->high_slices, SLICE_NUM_HIGH);
 
if (start < SLICE_LOW_TOP) {
unsigned long mend = min(end, SLICE_LOW_TOP);
unsigned long mstart = min(start, SLICE_LOW_TOP);
 
-   ret.low_slices = (1u << (GET_LOW_SLICE_INDEX(mend) + 1))
+   ret->low_slices = (1u << (GET_LOW_SLICE_INDEX(mend) + 1))
- (1u << GET_LOW_SLICE_INDEX(mstart));
}
 
if ((start + len) > SLICE_LOW_TOP)
-   bitmap_set(ret.high_slices, GET_HIGH_SLICE_INDEX(start),
+   bitmap_set(ret->high_slices, GET_HIGH_SLICE_INDEX(start),
   GET_HIGH_SLICE_INDEX(len));
-   return ret;
+   return;
 }
 
 static int slice_area_is_free(struct mm_struct *mm, unsigned long addr,
@@ -129,53 +128,47 @@ static int slice_high_has_vma(struct mm_struct *mm, 
unsigned long slice)
return !slice_area_is_free(mm, start, end - start);
 }
 
-static struct slice_mask slice_mask_for_free(struct mm_struct *mm)
+static void slice_mask_for_free(struct mm_struct *mm, struct slice_mask *ret)
 {
-   struct slice_mask ret;
unsigned long i;
 
-   ret.low_slices = 0;
-   bitmap_zero(ret.high_slices, SLICE_NUM_HIGH);
+   ret->low_slices = 0;
+   bitmap_zero(ret->high_slices, SLICE_NUM_HIGH);
 
for (i = 0; i < SLICE_NUM_LOW; i++)
if (!slice_low_has_vma(mm, i))
-   ret.low_slices |= 1u << i;
+   ret->low_slices |= 1u << i;
 
if (mm->task_size <= SLICE_LOW_TOP)
-   return ret;
+   return;
 
for (i = 0; i < SLICE_NUM_HIGH; i++)
if (!slice_high_has_vma(mm, i))
-   __set_bit(i, ret.high_slices);
-
-   return ret;
+   __set_bit(i, ret->high_slices);
 }
 
-static struct slice_mask slice_mask_for_size(struct mm_struct *mm, int psize)
+static void slice_mask_for_size(struct mm_struct *mm, int psize, struct 
slice_mask *ret)
 {
unsigned char *hpsizes;
int index, mask_index;
-   struct slice_mask ret;
unsigned long i;
u64 lpsizes;
 
-   ret.low_slices = 0;
-   bitmap_zero(ret.high_slices, SLICE_NUM_HIGH);
+   ret->low_slices = 0;
+   bitmap_zero(ret->high_slices, SLICE_NUM_HIGH);
 
lpsizes = mm->context.low_slices_psize;
for (i = 0; i < SLICE_NUM_LOW; i++)
if (((lpsizes >> (i * 4)) & 0xf) == psize)
-   ret.low_slices |= 1u << i;
+   ret->low_slices |= 1u << i;
 
hpsizes = mm->context.high_slices_psize;
for (i = 0; i < SLICE_NUM_HIGH; i++) {
mask_index = i & 0x1;
index = i >> 1;
if (((hpsizes[index] >> (mask_index * 4)) & 0xf) == psize)
-   __set_bit(i, ret.high_slices);
+   __set_bit(i, ret->high_slices);
}
-
-   return ret;
 }
 
 static int slice_check_fit(struct slice_mask mask, struct slice_mask available)
@@ -457,7 +450,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, 
unsigned long len,
/* First make up a "good" mask of slices that have the right size
 * already
 */
-   good_mask = slice_mask_for_size(mm, psize);
+   slice_mask_for_size(mm, psize, &good_mask);
slice_print_mask(" good_mask", good_mask);
 
/*
@@ -482,7 +475,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, 
unsigned long len,
 #ifdef CONFIG_PPC_64K_PAGES
/* If we support combo pages, we can allow 64k pages in 4k slices */
if (psize == MMU_PAGE_64K) {
-   compat_mask = slice_mask_for_size(mm, MMU_PAGE_4K);
+   slice_mask_for_size(mm, MMU_PAGE_4K, &compat_mask);
if (fixed)
slice_or_mask(&good_mask, &compat_mask);
}
@@ -491,7 +484,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, 
unsigned long len,
/* First

[PATCH V2 3/7] powerpc/mm/hash: Move kernel context to the starting of context range

2017-02-12 Thread Aneesh Kumar K.V
With the current kernel, we use the top 4 contexts for the kernel. Kernel VSIDs
are built using these top context values and the effective segment ID. In the
following patches, we want to increase the max effective address to 512TB. We
achieve that by increasing the effective segment IDs, thereby increasing the
virtual address range.

We will be switching to a 68 bit virtual address in the following patch. But for
platforms like p4 and p5, which only support a 65 bit VA, we want to limit the
virtual address to a 65 bit value. We do that by limiting the context bits to 16
instead of 19. That means we will have different max context values on different
platforms.

To make this simpler, we move the kernel contexts to the start of the range.
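
In effect the kernel context id then follows directly from the top
nibble of the effective address, as the updated comment shows
(0xc... -> 0, 0xd... -> 1, 0xe... -> 2, 0xf... -> 3). A sketch of that
mapping (kernel_context_sketch is a made-up name):

static unsigned long kernel_context_sketch(unsigned long ea)
{
	return (ea >> 60) - 0xc;
}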

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/book3s/64/mmu-hash.h | 39 ++--
 arch/powerpc/include/asm/mmu_context.h|  2 +-
 arch/powerpc/kvm/book3s_64_mmu_host.c |  2 +-
 arch/powerpc/mm/hash_utils_64.c   |  5 --
 arch/powerpc/mm/mmu_context_book3s64.c| 88 ++-
 arch/powerpc/mm/slb_low.S | 20 ++
 6 files changed, 84 insertions(+), 72 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h 
b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
index 8720a406bbbe..0a86cb2022d2 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
@@ -496,10 +496,10 @@ extern void slb_set_size(u16 size);
  * For user processes max context id is limited to ((1ul << 19) - 5)
  * for kernel space, we use the top 4 context ids to map address as below
  * NOTE: each context only support 64TB now.
- * 0x7fffc -  [ 0xc000000000000000 - 0xc0003fffffffffff ]
- * 0x7fffd -  [ 0xd000000000000000 - 0xd0003fffffffffff ]
- * 0x7fffe -  [ 0xe000000000000000 - 0xe0003fffffffffff ]
- * 0x7ffff -  [ 0xf000000000000000 - 0xf0003fffffffffff ]
+ * 0x0 -  [ 0xc000000000000000 - 0xc0003fffffffffff ]
+ * 0x1 -  [ 0xd000000000000000 - 0xd0003fffffffffff ]
+ * 0x2 -  [ 0xe000000000000000 - 0xe0003fffffffffff ]
+ * 0x3 -  [ 0xf000000000000000 - 0xf0003fffffffffff ]
  *
  * The proto-VSIDs are then scrambled into real VSIDs with the
  * multiplicative hash:
@@ -513,15 +513,9 @@ extern void slb_set_size(u16 size);
  * robust scattering in the hash table (at least based on some initial
  * results).
  *
- * We also consider VSID 0 special. We use VSID 0 for slb entries mapping
- * bad address. This enables us to consolidate bad address handling in
- * hash_page.
- *
  * We also need to avoid the last segment of the last context, because that
 * would give a protovsid of 0x1fffff. That will result in a VSID 0
- * because of the modulo operation in vsid scramble. But the vmemmap
- * (which is what uses region 0xf) will never be close to 64TB in size
- * (it's 56 bytes per page of system memory).
+ * because of the modulo operation in vsid scramble.
  */
 
 #define CONTEXT_BITS   19
@@ -533,12 +527,15 @@ extern void slb_set_size(u16 size);
 /*
  * 256MB segment
  * The proto-VSID space has 2^(CONTEX_BITS + ESID_BITS) - 1 segments
- * available for user + kernel mapping. The top 4 contexts are used for
+ * available for user + kernel mapping. The bottom 4 contexts are used for
  * kernel mapping. Each segment contains 2^28 bytes. Each
- * context maps 2^46 bytes (64TB) so we can support 2^19-1 contexts
- * (19 == 37 + 28 - 46).
+ * context maps 2^46 bytes (64TB).
+ *
+ * We also need to avoid the last segment of the last context, because that
+ * would give a protovsid of 0x1fffff. That will result in a VSID 0
+ * because of the modulo operation in vsid scramble.
  */
-#define MAX_USER_CONTEXT   ((ASM_CONST(1) << CONTEXT_BITS) - 5)
+#define MAX_USER_CONTEXT   ((ASM_CONST(1) << CONTEXT_BITS) - 2)
 
 /*
  * This should be computed such that protovosid * vsid_mulitplier
@@ -674,19 +671,19 @@ static inline unsigned long get_vsid(unsigned long 
context, unsigned long ea,
  * This is only valid for addresses >= PAGE_OFFSET
  *
  * For kernel space, we use the top 4 context ids to map address as below
- * 0x7fffc -  [ 0xc000000000000000 - 0xc0003fffffffffff ]
- * 0x7fffd -  [ 0xd000000000000000 - 0xd0003fffffffffff ]
- * 0x7fffe -  [ 0xe000000000000000 - 0xe0003fffffffffff ]
- * 0x7ffff -  [ 0xf000000000000000 - 0xf0003fffffffffff ]
+ * 0x0 -  [ 0xc000000000000000 - 0xc0003fffffffffff ]
+ * 0x1 -  [ 0xd000000000000000 - 0xd0003fffffffffff ]
+ * 0x2 -  [ 0xe000000000000000 - 0xe0003fffffffffff ]
+ * 0x3 -  [ 0xf000000000000000 - 0xf0003fffffffffff ]
  */
 static inline unsigned long get_kernel_vsid(unsigned long ea, int ssize)
 {
unsigned long context;
 
/*
-* kernel take the top 4 context from the available range
+* kernel take the first 4 context from the available range
 */
-   context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1;
+   context = (ea >> 60) - 

[PATCH V2 4/7] powerpc/mm/hash: Support 68 bit VA

2017-02-12 Thread Aneesh Kumar K.V
Inorder to support large effective address range (512TB), we want to increase
the virtual address bits to 68. But we do have platforms like p4 and p5 that can
only do 65 bit VA. We support those platforms by limiting context bits on them
to 16.

The protovsid -> vsid conversion is verified to work with both 65 and 68 bit
va values. I also documented the restrictions in a table format as part of code
comments.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/book3s/64/mmu-hash.h | 123 --
 arch/powerpc/include/asm/mmu.h|  19 ++--
 arch/powerpc/kvm/book3s_64_mmu_host.c |   8 +-
 arch/powerpc/mm/mmu_context_book3s64.c|   8 +-
 arch/powerpc/mm/slb_low.S |  54 +--
 5 files changed, 150 insertions(+), 62 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h 
b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
index 0a86cb2022d2..c7df4d60744e 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
@@ -42,6 +42,7 @@
 
 /* Bits in the SLB VSID word */
 #define SLB_VSID_SHIFT 12
+#define SLB_VSID_SHIFT_256M	12
 #define SLB_VSID_SHIFT_1T  24
 #define SLB_VSID_SSIZE_SHIFT   62
 #define SLB_VSID_B ASM_CONST(0xc000000000000000)
@@ -518,9 +519,19 @@ extern void slb_set_size(u16 size);
  * because of the modulo operation in vsid scramble.
  */
 
+/*
+ * Max Va bits we support as of now is 68 bits. We want 19 bit
+ * context ID.
+ * Restrictions:
+ * GPU has restrictions of not able to access beyond 128TB
+ * (47 bit effective address). We also cannot do more than 20bit PID.
+ * For p4 and p5 which can only do 65 bit VA, we restrict our CONTEXT_BITS
+ * to 16 bits (ie, we can only have 2^16 pids at the same time).
+ */
+#define VA_BITS68
 #define CONTEXT_BITS   19
-#define ESID_BITS  18
-#define ESID_BITS_1T   6
+#define ESID_BITS  (VA_BITS - (SID_SHIFT + CONTEXT_BITS))
+#define ESID_BITS_1T   (VA_BITS - (SID_SHIFT_1T + CONTEXT_BITS))
 
 #define ESID_BITS_MASK ((1 << ESID_BITS) - 1)
 #define ESID_BITS_1T_MASK  ((1 << ESID_BITS_1T) - 1)
@@ -529,62 +540,54 @@ extern void slb_set_size(u16 size);
  * The proto-VSID space has 2^(CONTEX_BITS + ESID_BITS) - 1 segments
  * available for user + kernel mapping. The bottom 4 contexts are used for
  * kernel mapping. Each segment contains 2^28 bytes. Each
- * context maps 2^46 bytes (64TB).
+ * context maps 2^49 bytes (512TB).
  *
  * We also need to avoid the last segment of the last context, because that
 * would give a protovsid of 0x1fffff. That will result in a VSID 0
  * because of the modulo operation in vsid scramble.
  */
 #define MAX_USER_CONTEXT   ((ASM_CONST(1) << CONTEXT_BITS) - 2)
+/*
+ * For platforms that support on 65bit VA we limit the context bits
+ */
+#define MAX_USER_CONTEXT_65BIT_VA ((ASM_CONST(1) << (65 - (SID_SHIFT + 
ESID_BITS))) - 2)
 
 /*
  * This should be computed such that protovosid * vsid_mulitplier
  * doesn't overflow 64 bits. It should also be co-prime to vsid_modulus
+ * We also need to make sure that number of bits in divisor is less
+ * than twice the number of protovsid bits for our modulus optmization to work.
+ * The below table shows the current values used.
+ *
+ * |---++++--|
+ * |   | Prime Bits | VSID_BITS_65VA | Total Bits | 2* VSID_BITS |
+ * |---++++--|
+ * | 1T| 24 | 25 | 49 |   50 |
+ * |---++++--|
+ * | 256MB | 24 | 37 | 61 |   74 |
+ * |---++++--|
+ *
+ * |---++++--|
+ * |   | Prime Bits | VSID_BITS_68VA | Total Bits | 2* VSID_BITS |
+ * |---++++--|
+ * | 1T| 24 | 28 | 52 |   56 |
+ * |---++++--|
+ * | 256MB | 24 | 40 | 64 |   80 |
+ * |---++++--|
+ *
  */
 #define VSID_MULTIPLIER_256M   ASM_CONST(12538073) /* 24-bit prime */
-#define VSID_BITS_256M (CONTEXT_BITS + ESID_BITS)
+#define VSID_BITS_256M (VA_BITS - SID_SHIFT)
 #define VSID_MODULUS_256M  ((1UL<<VSID_BITS_256M)-1)
-* 2^36-1, then r3+1 has the 2^36 bit set.  So, if r3+1 has \
-* the bit clear, r3 already has the answer we want, if it  \
-* doesn't, the answer is the low 36 bits of r3+1.  So in all   \
-* cases the answer is the low 36 bits of (r3 + ((r3+1) >> 36))*/\
-   addirx,rt,1;\
-

[PATCH V2 5/7] powerpc/mm: Move copy_mm_to_paca to paca.c

2017-02-12 Thread Aneesh Kumar K.V
We will be updating this later to use struct mm_struct. Move this so that the
function finds the definition of struct mm_struct.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/paca.h | 18 +-
 arch/powerpc/kernel/paca.c  | 19 +++
 arch/powerpc/mm/hash_utils_64.c |  4 ++--
 arch/powerpc/mm/slb.c   |  2 +-
 arch/powerpc/mm/slice.c |  2 +-
 5 files changed, 24 insertions(+), 21 deletions(-)

diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 6a6792bb39fb..f25d3c93a30f 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -207,23 +207,7 @@ struct paca_struct {
 #endif
 };
 
-#ifdef CONFIG_PPC_BOOK3S
-static inline void copy_mm_to_paca(mm_context_t *context)
-{
-   get_paca()->mm_ctx_id = context->id;
-#ifdef CONFIG_PPC_MM_SLICES
-   get_paca()->mm_ctx_low_slices_psize = context->low_slices_psize;
-   memcpy(&get_paca()->mm_ctx_high_slices_psize,
-  &context->high_slices_psize, SLICE_ARRAY_SIZE);
-#else
-   get_paca()->mm_ctx_user_psize = context->user_psize;
-   get_paca()->mm_ctx_sllp = context->sllp;
-#endif
-}
-#else
-static inline void copy_mm_to_paca(mm_context_t *context){}
-#endif
-
+extern void copy_mm_to_paca(struct mm_struct *mm);
 extern struct paca_struct *paca;
 extern void initialise_paca(struct paca_struct *new_paca, int cpu);
 extern void setup_paca(struct paca_struct *new_paca);
diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c
index fa20060ff7a5..b64daf124fee 100644
--- a/arch/powerpc/kernel/paca.c
+++ b/arch/powerpc/kernel/paca.c
@@ -244,3 +244,22 @@ void __init free_unused_pacas(void)
 
free_lppacas();
 }
+
+void copy_mm_to_paca(struct mm_struct *mm)
+{
+#ifdef CONFIG_PPC_BOOK3S
+   mm_context_t *context = &mm->context;
+
+   get_paca()->mm_ctx_id = context->id;
+#ifdef CONFIG_PPC_MM_SLICES
+   get_paca()->mm_ctx_low_slices_psize = context->low_slices_psize;
+   memcpy(&get_paca()->mm_ctx_high_slices_psize,
+  &context->high_slices_psize, SLICE_ARRAY_SIZE);
+#else /* CONFIG_PPC_MM_SLICES */
+   get_paca()->mm_ctx_user_psize = context->user_psize;
+   get_paca()->mm_ctx_sllp = context->sllp;
+#endif
+#else /* CONFIG_PPC_BOOK3S */
+   return;
+#endif
+}
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 978314b6b8d7..67937a6eb541 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -1084,7 +1084,7 @@ void demote_segment_4k(struct mm_struct *mm, unsigned 
long addr)
copro_flush_all_slbs(mm);
if ((get_paca_psize(addr) != MMU_PAGE_4K) && (current->mm == mm)) {
 
-   copy_mm_to_paca(&mm->context);
+   copy_mm_to_paca(mm);
slb_flush_and_rebolt();
}
 }
@@ -1156,7 +1156,7 @@ static void check_paca_psize(unsigned long ea, struct 
mm_struct *mm,
 {
if (user_region) {
if (psize != get_paca_psize(ea)) {
-   copy_mm_to_paca(&mm->context);
+   copy_mm_to_paca(mm);
slb_flush_and_rebolt();
}
} else if (get_paca()->vmalloc_sllp !=
diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c
index 48fc28bab544..15157b14b0b6 100644
--- a/arch/powerpc/mm/slb.c
+++ b/arch/powerpc/mm/slb.c
@@ -227,7 +227,7 @@ void switch_slb(struct task_struct *tsk, struct mm_struct 
*mm)
asm volatile("slbie %0" : : "r" (slbie_data));
 
get_paca()->slb_cache_ptr = 0;
-   copy_mm_to_paca(&mm->context);
+   copy_mm_to_paca(mm);
 
/*
 * preload some userspace segments into the SLB.
diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
index 0ec750adbe7f..116868bb91f5 100644
--- a/arch/powerpc/mm/slice.c
+++ b/arch/powerpc/mm/slice.c
@@ -190,7 +190,7 @@ static void slice_flush_segments(void *parm)
if (mm != current->active_mm)
return;
 
-   copy_mm_to_paca(&current->active_mm);
+   copy_mm_to_paca(current->active_mm);
 
local_irq_save(flags);
slb_flush_and_rebolt();
-- 
2.7.4



[PATCH V2 7/7] powerpc/mm/hash: Increase VA range to 128TB

2017-02-12 Thread Aneesh Kumar K.V
We update the hash Linux page table layout such that we can support 512TB. But
we limit TASK_SIZE to 128TB. We can switch to 128TB by default without a
conditional because that is the max virtual address supported by other
architectures. We will later add a mechanism to increase the application's
effective address range to 512TB on demand.

Having the page table layout changed to accommodate 512TB makes testing large
memory configurations easier, with fewer code changes to the kernel.
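
The "128/2 bytes" figure in the new copy_mm_to_paca() comment follows
from the slice bookkeeping (a sketch of the arithmetic, not code from
the patch):

/* 128TB of user address space / 1TB per high slice = 128 slices     */
/* one 4-bit psize field per slice -> 2 slices per byte -> 64 bytes  */
/* which is TASK_SIZE_128TB >> 41, i.e. 2^47 / 2^40 / 2              */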

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/book3s/64/hash-4k.h  |  2 +-
 arch/powerpc/include/asm/book3s/64/hash-64k.h |  2 +-
 arch/powerpc/include/asm/page_64.h|  2 +-
 arch/powerpc/include/asm/processor.h  | 22 ++
 arch/powerpc/kernel/paca.c|  9 -
 arch/powerpc/mm/slice.c   |  2 ++
 6 files changed, 31 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h 
b/arch/powerpc/include/asm/book3s/64/hash-4k.h
index 0c4e470571ca..b4b5e6b671ca 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-4k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h
@@ -8,7 +8,7 @@
 #define H_PTE_INDEX_SIZE  9
 #define H_PMD_INDEX_SIZE  7
 #define H_PUD_INDEX_SIZE  9
-#define H_PGD_INDEX_SIZE  9
+#define H_PGD_INDEX_SIZE  12
 
 #ifndef __ASSEMBLY__
 #define H_PTE_TABLE_SIZE   (sizeof(pte_t) << H_PTE_INDEX_SIZE)
diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h 
b/arch/powerpc/include/asm/book3s/64/hash-64k.h
index b39f0b86405e..682c4eb28fa4 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-64k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h
@@ -4,7 +4,7 @@
 #define H_PTE_INDEX_SIZE  8
 #define H_PMD_INDEX_SIZE  5
 #define H_PUD_INDEX_SIZE  5
-#define H_PGD_INDEX_SIZE  12
+#define H_PGD_INDEX_SIZE  15
 
 /*
  * 64k aligned address free up few of the lower bits of RPN for us
diff --git a/arch/powerpc/include/asm/page_64.h 
b/arch/powerpc/include/asm/page_64.h
index 7f72659b7999..9b60e9455c6e 100644
--- a/arch/powerpc/include/asm/page_64.h
+++ b/arch/powerpc/include/asm/page_64.h
@@ -107,7 +107,7 @@ extern u64 ppc64_pft_size;
  */
 struct slice_mask {
u16 low_slices;
-   DECLARE_BITMAP(high_slices, 64);
+   DECLARE_BITMAP(high_slices, 512);
 };
 
 struct mm_struct;
diff --git a/arch/powerpc/include/asm/processor.h 
b/arch/powerpc/include/asm/processor.h
index 1ba814436c73..1d4e34f9004d 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -102,11 +102,25 @@ void release_thread(struct task_struct *);
 #endif
 
 #ifdef CONFIG_PPC64
-/* 64-bit user address space is 46-bits (64TB user VM) */
-#define TASK_SIZE_USER64 (0x0000400000000000UL)
+/*
+ * 64-bit user address space can have multiple limits
+ * For now supported values are:
+ */
+#define TASK_SIZE_64TB  (0x0000400000000000UL)
+#define TASK_SIZE_128TB (0x0000800000000000UL)
+#define TASK_SIZE_512TB (0x0002000000000000UL)
 
-/* 
- * 32-bit user address space is 4GB - 1 page 
+#ifdef CONFIG_PPC_BOOK3S_64
+/*
+ * MAx value currently used:
+ */
+#define TASK_SIZE_USER64 TASK_SIZE_128TB
+#else
+#define TASK_SIZE_USER64 TASK_SIZE_64TB
+#endif
+
+/*
+ * 32-bit user address space is 4GB - 1 page
 * (this 1 page is needed so referencing of 0xFFFFFFFF generates EFAULT
  */
 #define TASK_SIZE_USER32 (0x0000000100000000UL - (1*PAGE_SIZE))
diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c
index b64daf124fee..c7ca70dc3ba5 100644
--- a/arch/powerpc/kernel/paca.c
+++ b/arch/powerpc/kernel/paca.c
@@ -253,8 +253,15 @@ void copy_mm_to_paca(struct mm_struct *mm)
get_paca()->mm_ctx_id = context->id;
 #ifdef CONFIG_PPC_MM_SLICES
get_paca()->mm_ctx_low_slices_psize = context->low_slices_psize;
+   /*
+* We support upto 128TB for now. Hence copy only 128/2 bytes.
+* Later when we support tasks with different max effective
+* address, we can optimize this based on mm->task_size.
+*/
+   BUILD_BUG_ON(TASK_SIZE_USER64 != TASK_SIZE_128TB);
memcpy(&get_paca()->mm_ctx_high_slices_psize,
-  &context->high_slices_psize, SLICE_ARRAY_SIZE);
+  &context->high_slices_psize, TASK_SIZE_128TB >> 41);
+
 #else /* CONFIG_PPC_MM_SLICES */
get_paca()->mm_ctx_user_psize = context->user_psize;
get_paca()->mm_ctx_sllp = context->sllp;
diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
index 116868bb91f5..b3f45e413a60 100644
--- a/arch/powerpc/mm/slice.c
+++ b/arch/powerpc/mm/slice.c
@@ -407,6 +407,8 @@ unsigned long slice_get_unmapped_area(unsigned long addr, 
unsigned long len,
struct mm_struct *mm = current->mm;
unsigned long newaddr;
 
+   /* Make sure high_slices bitmap size is same as we expected */
+   BUILD_BUG_ON(512 != SLICE_NUM_HIGH);
/*
 * init different masks
 */
-- 
2.7.4



[PATCH V2 6/7] powerpc/mm: Remove redundant TASK_SIZE_USER64 checks

2017-02-12 Thread Aneesh Kumar K.V
The check against the VSID range is implied when we check the task size against
the hash and radix pgtable ranges [1], because we make sure the page table range
cannot exceed the VSID range.

[1] BUILD_BUG_ON(TASK_SIZE_USER64 > H_PGTABLE_RANGE);
BUILD_BUG_ON(TASK_SIZE_USER64 > RADIX_PGTABLE_RANGE);

The check for a smaller task size is also removed here, because a follow-up
patch will support a task size smaller than the pgtable range.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/mm/init_64.c| 4 
 arch/powerpc/mm/pgtable_64.c | 5 -
 2 files changed, 9 deletions(-)

diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 93abf8a9813d..f3e856e6ee23 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -69,10 +69,6 @@
 #if H_PGTABLE_RANGE > USER_VSID_RANGE
 #warning Limited user VSID range means pagetable space is wasted
 #endif
-
-#if (TASK_SIZE_USER64 < H_PGTABLE_RANGE) && (TASK_SIZE_USER64 < 
USER_VSID_RANGE)
-#warning TASK_SIZE is smaller than it needs to be.
-#endif
 #endif /* CONFIG_PPC_STD_MMU_64 */
 
 phys_addr_t memstart_addr = ~0;
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index 8bca7f58afc4..06e23e0b1b81 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -55,11 +55,6 @@
 
 #include "mmu_decl.h"
 
-#ifdef CONFIG_PPC_STD_MMU_64
-#if TASK_SIZE_USER64 > (1UL << (ESID_BITS + SID_SHIFT))
-#error TASK_SIZE_USER64 exceeds user VSID range
-#endif
-#endif
 
 #ifdef CONFIG_PPC_BOOK3S_64
 /*
-- 
2.7.4



[PATCH] powerpc/xmon: add turn off xmon option

2017-02-12 Thread Pan Xinhui
Once xmon is triggered, there is no interface to turn it off again.
However, disable/enable xmon code flows already exist. More importantly,
a system reset interrupt on PowerVM will fire an oops to make a dump. At
that time, xmon should not be triggered.

So add a 'z' option after the current 'x|X' exit commands. Turn xmon off
if 'z' follows.

Signed-off-by: Pan Xinhui 
---
 arch/powerpc/xmon/xmon.c | 12 +---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 9c0e17c..2f4e7b1 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -76,6 +76,7 @@ static int xmon_gate;
 #endif /* CONFIG_SMP */
 
 static unsigned long in_xmon __read_mostly = 0;
+static int xmon_off = 0;
 
 static unsigned long adrs;
 static int size = 1;
@@ -255,8 +256,8 @@ Commands:\n\
   Sr # read SPR #\n\
   Sw #v write v to SPR #\n\
   tprint backtrace\n\
-  xexit monitor and recover\n\
-  Xexit monitor and don't recover\n"
+  x[z] exit monitor and recover, turn off xmon with 'z'\n\
+  X[z] exit monitor and don't recover, turn off xmon with 'z'\n"
 #if defined(CONFIG_PPC64) && !defined(CONFIG_PPC_BOOK3E)
 "  u   dump segment table or SLB\n"
 #elif defined(CONFIG_PPC_STD_MMU_32)
@@ -952,6 +953,8 @@ cmds(struct pt_regs *excp)
break;
case 'x':
case 'X':
+   if (inchar() == 'z')
+   xmon_off = 1;
return cmd;
case EOF:
printf(" \n");
@@ -3248,8 +3251,11 @@ static void xmon_init(int enable)
 static void sysrq_handle_xmon(int key)
 {
/* ensure xmon is enabled */
+   xmon_off = 0;
xmon_init(1);
debugger(get_irq_regs());
+   if (xmon_off)
+   xmon_init(0);
 }
 
 static struct sysrq_key_op sysrq_xmon_op = {
@@ -3266,7 +3272,7 @@ static int __init setup_xmon_sysrq(void)
 __initcall(setup_xmon_sysrq);
 #endif /* CONFIG_MAGIC_SYSRQ */
 
-static int __initdata xmon_early, xmon_off;
+static int __initdata xmon_early;
 
 static int __init early_parse_xmon(char *p)
 {
-- 
2.4.11



Re: [PATCH 1/2] powerpc/powernv/opal-dump : Handles opal_dump_info properly

2017-02-12 Thread Mukesh Ojha

Hi Michael,

Can you please look at this patchset?

-Mukesh


On Tuesday 06 December 2016 12:07 PM, Mukesh Ojha wrote:

Hi Michael,

Can you please have a look at this patchset, as there are no
functional changes involved with it?

Thanks,
Mukesh




On Thursday 01 December 2016 02:38 PM, Mukesh Ojha wrote:

Move the return value check of 'opal_dump_info' to the proper place;
previously the code unnecessarily filled in all the dump info even on failure.

Signed-off-by: Mukesh Ojha 
---
  arch/powerpc/platforms/powernv/opal-dump.c | 9 ++---
  1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/opal-dump.c 
b/arch/powerpc/platforms/powernv/opal-dump.c

index 4c82782..ae32212 100644
--- a/arch/powerpc/platforms/powernv/opal-dump.c
+++ b/arch/powerpc/platforms/powernv/opal-dump.c
@@ -225,13 +225,16 @@ static int64_t dump_read_info(uint32_t 
*dump_id, uint32_t *dump_size, uint32_t *

  if (rc == OPAL_PARAMETER)
  rc = opal_dump_info(&id, &size);

+if (rc) {
+pr_warn("%s: Failed to get dump info (%d)\n",
+__func__, rc);
+return rc;
+}
+
  *dump_id = be32_to_cpu(id);
  *dump_size = be32_to_cpu(size);
  *dump_type = be32_to_cpu(type);

-if (rc)
-pr_warn("%s: Failed to get dump info (%d)\n",
-__func__, rc);
  return rc;
  }