[PATCH v2 2/6] powerpc/perf: Add PM_INST_DISP event to Power9 event list
Signed-off-by: Madhavan Srinivasan --- Changelog v1: Fix the event code. arch/powerpc/perf/power9-events-list.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/powerpc/perf/power9-events-list.h b/arch/powerpc/perf/power9-events-list.h index 929b56d47ad9..71a6bfee5c02 100644 --- a/arch/powerpc/perf/power9-events-list.h +++ b/arch/powerpc/perf/power9-events-list.h @@ -53,3 +53,6 @@ EVENT(PM_ITLB_MISS, 0x400fc) EVENT(PM_RUN_INST_CMPL,0x500fa) /* Run_cycles */ EVENT(PM_RUN_CYC, 0x600f4) +/* Instruction Dispatched */ +EVENT(PM_INST_DISP,0x200f2) +EVENT(PM_INST_DISP_ALT,0x300f2) -- 2.7.4
[PATCH v2 1/6] powerpc/perf: Factor out event_alternative function
Factor out the power8 event_alternative function to share the code with power9. Signed-off-by: Madhavan Srinivasan --- Changelog v1: No changes to this patch, just a rebase arch/powerpc/perf/isa207-common.c | 36 arch/powerpc/perf/isa207-common.h | 3 +++ arch/powerpc/perf/power8-pmu.c| 35 ++- 3 files changed, 41 insertions(+), 33 deletions(-) diff --git a/arch/powerpc/perf/isa207-common.c b/arch/powerpc/perf/isa207-common.c index 50e598cf644b..a86fadee352b 100644 --- a/arch/powerpc/perf/isa207-common.c +++ b/arch/powerpc/perf/isa207-common.c @@ -338,3 +338,39 @@ void isa207_disable_pmc(unsigned int pmc, unsigned long mmcr[]) if (pmc <= 3) mmcr[1] &= ~(0xffUL << MMCR1_PMCSEL_SHIFT(pmc + 1)); } + +static int find_alternative(u64 event, const unsigned int ev_alt[][MAX_ALT], int size) +{ + int i, j; + + for (i = 0; i < size; ++i) { + if (event < ev_alt[i][0]) + break; + + for (j = 0; j < MAX_ALT && ev_alt[i][j]; ++j) + if (event == ev_alt[i][j]) + return i; + } + + return -1; +} + +int isa207_get_alternatives(u64 event, u64 alt[], + const unsigned int ev_alt[][MAX_ALT], int size) +{ + int i, j, num_alt = 0; + u64 alt_event; + + alt[num_alt++] = event; + i = find_alternative(event, ev_alt, size); + if (i >= 0) { + /* Filter out the original event, it's already in alt[0] */ + for (j = 0; j < MAX_ALT; ++j) { + alt_event = ev_alt[i][j]; + if (alt_event && alt_event != event) + alt[num_alt++] = alt_event; + } + } + + return num_alt; +} diff --git a/arch/powerpc/perf/isa207-common.h b/arch/powerpc/perf/isa207-common.h index 90495f1580c7..3e9150f6690a 100644 --- a/arch/powerpc/perf/isa207-common.h +++ b/arch/powerpc/perf/isa207-common.h @@ -260,5 +260,8 @@ int isa207_compute_mmcr(u64 event[], int n_ev, unsigned int hwc[], unsigned long mmcr[], struct perf_event *pevents[]); void isa207_disable_pmc(unsigned int pmc, unsigned long mmcr[]); +int isa207_get_alternatives(u64 event, u64 alt[], + const unsigned int ev_alt[][MAX_ALT], int size); + #endif diff --git a/arch/powerpc/perf/power8-pmu.c b/arch/powerpc/perf/power8-pmu.c index d07186382f3a..ce15b19a7962 100644 --- a/arch/powerpc/perf/power8-pmu.c +++ b/arch/powerpc/perf/power8-pmu.c @@ -48,43 +48,12 @@ static const unsigned int event_alternatives[][MAX_ALT] = { { PM_RUN_INST_CMPL_ALT, PM_RUN_INST_CMPL }, }; -/* - * Scan the alternatives table for a match and return the - * index into the alternatives table if found, else -1. - */ -static int find_alternative(u64 event) -{ - int i, j; - - for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) { - if (event < event_alternatives[i][0]) - break; - - for (j = 0; j < MAX_ALT && event_alternatives[i][j]; ++j) - if (event == event_alternatives[i][j]) - return i; - } - - return -1; -} - static int power8_get_alternatives(u64 event, unsigned int flags, u64 alt[]) { int i, j, num_alt = 0; - u64 alt_event; - - alt[num_alt++] = event; - - i = find_alternative(event); - if (i >= 0) { - /* Filter out the original event, it's already in alt[0] */ - for (j = 0; j < MAX_ALT; ++j) { - alt_event = event_alternatives[i][j]; - if (alt_event && alt_event != event) - alt[num_alt++] = alt_event; - } - } + num_alt = isa207_get_alternatives(event, alt, event_alternatives, + (int)ARRAY_SIZE(event_alternatives)); if (flags & PPMU_ONLY_COUNT_RUN) { /* * We're only counting in RUN state, so PM_CYC is equivalent to -- 2.7.4
[PATCH v2 3/6] powerpc/perf: Add alternative event table and function for power9
Signed-off-by: Madhavan Srinivasan --- Change v1: No changes, just a rebase arch/powerpc/perf/power9-pmu.c | 17 + 1 file changed, 17 insertions(+) diff --git a/arch/powerpc/perf/power9-pmu.c b/arch/powerpc/perf/power9-pmu.c index 7332634e18c9..b38acff8a791 100644 --- a/arch/powerpc/perf/power9-pmu.c +++ b/arch/powerpc/perf/power9-pmu.c @@ -106,6 +106,21 @@ enum { /* PowerISA v2.07 format attribute structure*/ extern struct attribute_group isa207_pmu_format_group; +/* Table of alternatives, sorted by column 0 */ +static const unsigned int power9_event_alternatives[][MAX_ALT] = { + { PM_INST_DISP, PM_INST_DISP_ALT }, +}; + +static int power9_get_alternatives(u64 event, unsigned int flags, u64 alt[]) +{ + int num_alt = 0; + + num_alt = isa207_get_alternatives(event, alt, power9_event_alternatives, + (int)ARRAY_SIZE(power9_event_alternatives)); + + return num_alt; +} + GENERIC_EVENT_ATTR(cpu-cycles, PM_CYC); GENERIC_EVENT_ATTR(stalled-cycles-frontend,PM_ICT_NOSLOT_CYC); GENERIC_EVENT_ATTR(stalled-cycles-backend, PM_CMPLU_STALL); @@ -383,6 +398,7 @@ static struct power_pmu power9_isa207_pmu = { .config_bhrb= power9_config_bhrb, .bhrb_filter_map= power9_bhrb_filter_map, .get_constraint = isa207_get_constraint, + .get_alternatives = power9_get_alternatives, .disable_pmc= isa207_disable_pmc, .flags = PPMU_NO_SIAR | PPMU_ARCH_207S, .n_generic = ARRAY_SIZE(power9_generic_events), @@ -401,6 +417,7 @@ static struct power_pmu power9_pmu = { .config_bhrb= power9_config_bhrb, .bhrb_filter_map= power9_bhrb_filter_map, .get_constraint = isa207_get_constraint, + .get_alternatives = power9_get_alternatives, .disable_pmc= isa207_disable_pmc, .flags = PPMU_HAS_SIER | PPMU_ARCH_207S, .n_generic = ARRAY_SIZE(power9_generic_events), -- 2.7.4
[PATCH v2 4/6] powerpc/perf: Use PM_INST_DISP for generic instructions sample
Since PM_INST_CMPL may not provide right counts in all sampling scenarios in power9 DD1, instead use PM_INST_DISP. Patch also update generic instruction sampling with the same. Signed-off-by: Madhavan Srinivasan --- Changelog v1: Based on DD1 check, modified the event code to use for "instructions" arch/powerpc/perf/power9-pmu.c | 20 ++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/perf/power9-pmu.c b/arch/powerpc/perf/power9-pmu.c index b38acff8a791..454e9f70894f 100644 --- a/arch/powerpc/perf/power9-pmu.c +++ b/arch/powerpc/perf/power9-pmu.c @@ -228,6 +228,17 @@ static const struct attribute_group *power9_pmu_attr_groups[] = { NULL, }; +static int power9_generic_events_dd1[] = { + [PERF_COUNT_HW_CPU_CYCLES] =PM_CYC, + [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = PM_ICT_NOSLOT_CYC, + [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =PM_CMPLU_STALL, + [PERF_COUNT_HW_INSTRUCTIONS] = PM_INST_DISP, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = PM_BRU_CMPL, + [PERF_COUNT_HW_BRANCH_MISSES] = PM_BR_MPRED_CMPL, + [PERF_COUNT_HW_CACHE_REFERENCES] = PM_LD_REF_L1, + [PERF_COUNT_HW_CACHE_MISSES] = PM_LD_MISS_L1_FIN, +}; + static int power9_generic_events[] = { [PERF_COUNT_HW_CPU_CYCLES] =PM_CYC, [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = PM_ICT_NOSLOT_CYC, @@ -401,8 +412,8 @@ static struct power_pmu power9_isa207_pmu = { .get_alternatives = power9_get_alternatives, .disable_pmc= isa207_disable_pmc, .flags = PPMU_NO_SIAR | PPMU_ARCH_207S, - .n_generic = ARRAY_SIZE(power9_generic_events), - .generic_events = power9_generic_events, + .n_generic = ARRAY_SIZE(power9_generic_events_dd1), + .generic_events = power9_generic_events_dd1, .cache_events = &power9_cache_events, .attr_groups= power9_isa207_pmu_attr_groups, .bhrb_nr= 32, @@ -437,6 +448,11 @@ static int __init init_power9_pmu(void) return -ENODEV; if (cpu_has_feature(CPU_FTR_POWER9_DD1)) { + /* +* Since PM_INST_CMPL may not provide right counts in all +* sampling scenarios in power9 DD1, instead use PM_INST_DISP. +*/ + EVENT_VAR(PM_INST_CMPL, _g).id = PM_INST_DISP; rc = register_power_pmu(&power9_isa207_pmu); } else { rc = register_power_pmu(&power9_pmu); -- 2.7.4
[PATCH v2 5/6] powerpc/perf: Use Instruction Counter value
Since PM_INST_DISP include speculative instruction, based on the workload the dispatch count could vary considerably. Hence as an alternative, for completed instruction counting, program the PM_INST_DISP event to the MMCR* but use Instruction Counter register value. Signed-off-by: Madhavan Srinivasan --- Changelog v1: 1)Removed the #ifdef and added changes for EBB count updates arch/powerpc/perf/core-book3s.c | 36 1 file changed, 36 insertions(+) diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 270eb9b74e2e..87d17a1f7168 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -57,6 +57,7 @@ struct cpu_hw_events { void*bhrb_context; struct perf_branch_stack bhrb_stack; struct perf_branch_entry bhrb_entries[BHRB_MAX_ENTRIES]; + u64 ic_init; }; static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events); @@ -127,6 +128,10 @@ static inline void power_pmu_bhrb_disable(struct perf_event *event) {} static void power_pmu_sched_task(struct perf_event_context *ctx, bool sched_in) {} static inline void power_pmu_bhrb_read(struct cpu_hw_events *cpuhw) {} static void pmao_restore_workaround(bool ebb) { } +static bool use_ic(u64 event) +{ + return false; +} #endif /* CONFIG_PPC32 */ static bool regs_use_siar(struct pt_regs *regs) @@ -688,6 +693,15 @@ static void pmao_restore_workaround(bool ebb) mtspr(SPRN_PMC5, pmcs[4]); mtspr(SPRN_PMC6, pmcs[5]); } + +static bool use_ic(u64 event) +{ + if (cpu_has_feature(CPU_FTR_POWER9_DD1) && + (event == 0x200f2 || event == 0x300f2)) + return true; + + return false; +} #endif /* CONFIG_PPC64 */ static void perf_event_interrupt(struct pt_regs *regs); @@ -1007,6 +1021,7 @@ static u64 check_and_compute_delta(u64 prev, u64 val) static void power_pmu_read(struct perf_event *event) { s64 val, delta, prev; + struct cpu_hw_events *cpuhw = this_cpu_ptr(&cpu_hw_events); if (event->hw.state & PERF_HES_STOPPED) return; @@ -1016,6 +1031,13 @@ static void power_pmu_read(struct perf_event *event) if (is_ebb_event(event)) { val = read_pmc(event->hw.idx); + if (use_ic(event->attr.config)) { + val = mfspr(SPRN_IC); + if (val > cpuhw->ic_init) + val = val - cpuhw->ic_init; + else + val = val + (0 - cpuhw->ic_init); + } local64_set(&event->hw.prev_count, val); return; } @@ -1029,6 +1051,13 @@ static void power_pmu_read(struct perf_event *event) prev = local64_read(&event->hw.prev_count); barrier(); val = read_pmc(event->hw.idx); + if (use_ic(event->attr.config)) { + val = mfspr(SPRN_IC); + if (val > cpuhw->ic_init) + val = val - cpuhw->ic_init; + else + val = val + (0 - cpuhw->ic_init); + } delta = check_and_compute_delta(prev, val); if (!delta) return; @@ -1466,6 +1495,13 @@ static int power_pmu_add(struct perf_event *event, int ef_flags) event->attr.branch_sample_type); } + /* +* Workaround for POWER9 DD1 to use the Instruction Counter +* register value for instruction counting +*/ + if (use_ic(event->attr.config)) + cpuhw->ic_init = mfspr(SPRN_IC); + perf_pmu_enable(event->pmu); local_irq_restore(flags); return ret; -- 2.7.4
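Not part of the patch, but for readers checking the ic_init arithmetic above, here is a small userspace sketch (illustrative values only, assuming a free-running 64-bit counter) of how the delta computed in power_pmu_read() behaves, including when the Instruction Counter wraps past the value captured at power_pmu_add() time:

/*
 * Standalone sketch (not kernel code): the Instruction Counter delta
 * used in power_pmu_read().  Values below are made up for illustration.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t ic_delta(uint64_t ic_now, uint64_t ic_init)
{
	/* Same arithmetic as the patch: both branches reduce to modular
	 * subtraction on an unsigned 64-bit value. */
	if (ic_now > ic_init)
		return ic_now - ic_init;
	return ic_now + (0 - ic_init);
}

int main(void)
{
	/* normal case: counter moved forward since add time */
	printf("%llu\n", (unsigned long long)ic_delta(1000, 100));          /* 900 */
	/* wrap case: counter rolled over after ic_init was sampled */
	printf("%llu\n", (unsigned long long)ic_delta(5, UINT64_MAX - 4));  /* 10 */
	return 0;
}

Both branches end up as the same modular subtraction on an unsigned 64-bit value, which is why the wrap case still yields the number of instructions executed since the event was added.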
[PATCH v2 6/6] powerpc/perf: Add restrictions to PMC5 in power9 DD1
PMC5 on POWER9 DD1 may not provide correct counts in all sampling
scenarios, so use the PM_INST_DISP event on PMC2 or PMC3 in preference
instead.

Signed-off-by: Madhavan Srinivasan
---
Changelog v1:
No changes, just a rebase

 arch/powerpc/perf/isa207-common.h | 4 ++++
 arch/powerpc/perf/power9-pmu.c    | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/perf/isa207-common.h b/arch/powerpc/perf/isa207-common.h
index 3e9150f6690a..cf9bd8990159 100644
--- a/arch/powerpc/perf/isa207-common.h
+++ b/arch/powerpc/perf/isa207-common.h
@@ -222,6 +222,10 @@
 	CNST_PMC_VAL(1) | CNST_PMC_VAL(2) | CNST_PMC_VAL(3) | \
 	CNST_PMC_VAL(4) | CNST_PMC_VAL(5) | CNST_PMC_VAL(6) | CNST_NC_VAL
 
+/*
+ * Let's restrict use of PMC5 for instruction counting.
+ */
+#define P9_DD1_TEST_ADDER	(ISA207_TEST_ADDER | CNST_PMC_VAL(5))
 
 /* Bits in MMCR1 for PowerISA v2.07 */
 #define MMCR1_UNIT_SHIFT(pmc)	(60 - (4 * ((pmc) - 1)))

diff --git a/arch/powerpc/perf/power9-pmu.c b/arch/powerpc/perf/power9-pmu.c
index 454e9f70894f..5fe9cb1dc3b6 100644
--- a/arch/powerpc/perf/power9-pmu.c
+++ b/arch/powerpc/perf/power9-pmu.c
@@ -423,7 +423,7 @@ static struct power_pmu power9_pmu = {
 	.name			= "POWER9",
 	.n_counter		= MAX_PMU_COUNTERS,
 	.add_fields		= ISA207_ADD_FIELDS,
-	.test_adder		= ISA207_TEST_ADDER,
+	.test_adder		= P9_DD1_TEST_ADDER,
 	.compute_mmcr		= isa207_compute_mmcr,
 	.config_bhrb		= power9_config_bhrb,
 	.bhrb_filter_map	= power9_bhrb_filter_map,
-- 
2.7.4
[PATCH v2] powerpc/perf: Add constraints for power9 l2/l3 bus events
On Power9, L2/L3 bus events are always available as a "bank" of 4
events. To obtain the counts for any of the L2/L3 bus events in a given
bank, the user has to program PMC4 with the corresponding L2/L3 bus
event for that bank.

This patch adds a mask and a new pass which records in that mask each
PMC used by an L2/L3 bus event, and then checks the mask to enforce the
rule.

Signed-off-by: Madhavan Srinivasan
---
Changelog v1:
Removed the callback and added a new pass

 arch/powerpc/perf/isa207-common.c | 20 +++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/perf/isa207-common.c b/arch/powerpc/perf/isa207-common.c
index 50e598cf644b..ce12e93dbd16 100644
--- a/arch/powerpc/perf/isa207-common.c
+++ b/arch/powerpc/perf/isa207-common.c
@@ -219,7 +219,7 @@ int isa207_compute_mmcr(u64 event[], int n_ev,
 			       struct perf_event *pevents[])
 {
 	unsigned long mmcra, mmcr1, mmcr2, unit, combine, psel, cache, val;
-	unsigned int pmc, pmc_inuse;
+	unsigned int pmc, pmc_inuse, mask = 0;
 	int i;
 
 	pmc_inuse = 0;
@@ -312,6 +312,24 @@ int isa207_compute_mmcr(u64 event[], int n_ev,
 		hwc[i] = pmc - 1;
 	}
 
+	/*
+	 * Pass 3: check the L2/L3 bus event rule. PMC4 must carry an
+	 * L2/L3 bus event when any of PMC1/2/3 does.
+	 */
+	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+		for (i = 0; i < n_ev; ++i) {
+			pmc	= (event[i] >> EVENT_PMC_SHIFT) & EVENT_PMC_MASK;
+			unit	= (event[i] >> EVENT_UNIT_SHIFT) & EVENT_UNIT_MASK;
+			if (unit >= 6 && unit <= 9)
+				mask |= 1 << (pmc - 1);
+		}
+
+		if ((mask) && ((mask & 0xf) < 0x8)) {
+			printk(KERN_ERR "Missing PMC4 L2/L3 Bus event\n");
+			return -1;
+		}
+	}
+
 	/* Return MMCRx values */
 	mmcr[0] = 0;
-- 
2.7.4
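To make the new constraint concrete, here is a standalone sketch (not kernel code; the EVENT_* shift values are assumed to mirror isa207-common.h and the sample event codes are made up) of the pass-3 rule added above:

/*
 * Standalone sketch of the "pass 3" rule: for every event whose unit
 * field is 6..9 (an L2/L3 bus event), record its PMC in a mask; the
 * group is only accepted if PMC4 carries one of those bus events.
 * As in the patch, every event is assumed to encode an explicit PMC.
 */
#include <stdint.h>
#include <stdio.h>

#define EVENT_PMC_SHIFT   16	/* assumed to match isa207-common.h */
#define EVENT_PMC_MASK    0xfULL
#define EVENT_UNIT_SHIFT  12
#define EVENT_UNIT_MASK   0xfULL

static int check_l2l3_bus_rule(const uint64_t ev[], int n_ev)
{
	unsigned int mask = 0;

	for (int i = 0; i < n_ev; i++) {
		unsigned int pmc  = (ev[i] >> EVENT_PMC_SHIFT) & EVENT_PMC_MASK;
		unsigned int unit = (ev[i] >> EVENT_UNIT_SHIFT) & EVENT_UNIT_MASK;

		/* units 6..9 are the L2/L3 bus events; remember their PMC */
		if (unit >= 6 && unit <= 9)
			mask |= 1u << (pmc - 1);
	}

	/* bus events present but none of them on PMC4 (bit 3): reject */
	if (mask && (mask & 0xf) < 0x8)
		return -1;
	return 0;
}

int main(void)
{
	/* made-up event codes: PMC in bits 19-16, unit in bits 15-12 */
	uint64_t only_pmc1[] = { 0x16880ULL };             /* bus event on PMC1 */
	uint64_t with_pmc4[] = { 0x16880ULL, 0x46880ULL }; /* ... plus PMC4     */

	printf("%d\n", check_l2l3_bus_rule(only_pmc1, 1)); /* -1: rejected */
	printf("%d\n", check_l2l3_bus_rule(with_pmc4, 2)); /*  0: accepted */
	return 0;
}

For example, a group with an L2/L3 bus event only on PMC1 builds mask = 0b0001, fails the (mask & 0xf) < 0x8 test and is rejected; adding the bank's event on PMC4 gives mask = 0b1001 and the group is accepted.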
Re: [PATCH] powerpc: Blacklist GCC 5.4 6.1 and 6.2
On Fri, 2017-02-10 at 08:48 +0100, Christophe LEROY wrote: > > Le 10/02/2017 à 06:31, Cyril Bur a écrit : > > A bug in the -02 optimisation of GCC 5.4 6.1 and 6.2 causes > > setup_command_line() to not pass the correct first argument to strcpy > > and therefore not actually copy the command_line. > > > > A workaround patch was proposed: http://patchwork.ozlabs.org/patch/673130/ > > some discussion ensued. > > > > A GCC bug was raised: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=71709 > > The bug has been fixed in 7.0 and backported to GCC 5 and GCC 6. > > > > At the time of writing GCC 5.4 is the most recent and is affected. GCC > > 6.3 contains the backported fix, has been tested and appears safe to > > use. > > > > Heavy-lifting-by: Akshay Adiga > > Signed-off-by: Cyril Bur > > --- > > arch/powerpc/Makefile | 9 + > > 1 file changed, 9 insertions(+) > > > > diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile > > index 31286fa7873c..a4b886694391 100644 > > --- a/arch/powerpc/Makefile > > +++ b/arch/powerpc/Makefile > > @@ -414,6 +414,15 @@ checkbin: > > echo -n '*** Please use a different binutils version.' ; \ > > false ; \ > > fi > > + @if test "$(cc-version)" = "0504" \ > > + || test "$(cc-version)" = "0601" \ > > + || test "$(cc-version)" = "0602" ; then \ > > + echo -n '*** GCC-5.4 6.1 6.2 have a bad -O2 optimisation ' ; \ > > + echo 'which will cause lost commandline options (at least).' ; \ > > + echo '*** Please use a different GCC version.' ; \ > > + false ; \ > > + fi > > + > > According to the GCC bug report, this bug applies to powerpc64le > Why force all targets to not use those versions of GCC ? > True, I'll double check on BE, these days BE does suffer from, "oops I didn't check that". If it is only LE, I'll add another condition. Thanks, Cyril > Christophe
Re: [PATCH] powernv/opal: Handle OPAL_WRONG_STATE error from OPAL fails
Vipin K Parashar writes: > OPAL returns OPAL_WRONG_STATE for XSCOM operations > > done to read any core FIR which is sleeping, offline. OK. Do we know why Linux is causing that to happen? It's also returned from many of the XIVE routines if we're in the wrong xive mode, all of which would indicate a fairly bad Linux bug. Also the skiboot patch which added WRONG_STATE for XSCOM ops did so explicitly so we could differentiate from other errors: commit 9c2d82394fd2303847cac4a665dee62556ca528a Author: Russell Currey AuthorDate: Mon Mar 21 12:00:00 2016 +1100 xscom: Return OPAL_WRONG_STATE on XSCOM ops if CPU is asleep xscom_read and xscom_write return OPAL_SUCCESS if they worked, and OPAL_HARDWARE if they didn't. This doesn't provide information about why the operation failed, such as if the CPU happens to be asleep. This is specifically useful in error scanning, so if every CPU is being scanned for errors, sleeping CPUs likely aren't the cause of failures. So, return OPAL_WRONG_STATE in xscom_read and xscom_write if the CPU is sleeping. Signed-off-by: Russell Currey Reviewed-by: Alistair Popple Signed-off-by: Stewart Smith So I'm still not convinced that quietly swallowing this error and mapping it to -EIO along with several of the other error codes is the right thing to do. cheers
[PATCH v2] powerpc: Blacklist GCC 5.4 6.1 and 6.2
A bug in the -O2 optimisation of GCC 5.4, 6.1 and 6.2 causes
setup_command_line() to not pass the correct first argument to strcpy
and therefore not actually copy the command_line.

A workaround patch was proposed
(http://patchwork.ozlabs.org/patch/673130/) and some discussion ensued.

A GCC bug was raised: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=71709
The bug has been fixed in 7.0 and backported to GCC 5 and GCC 6.

At the time of writing GCC 5.4 is the most recent GCC 5 release and is
affected. GCC 6.3 contains the backported fix, has been tested and
appears safe to use.

Heavy-lifting-by: Akshay Adiga
Signed-off-by: Cyril Bur
---
v2: Added check to only blacklist compilers on little-endian

 arch/powerpc/Makefile | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
index 31286fa7873c..db5d8dabf1ca 100644
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -381,6 +381,7 @@ TOUT	:= .tmp_gas_check
 # - gcc-3.4 and binutils-2.14 are a fatal combination
 # - Require gcc 4.0 or above on 64-bit
 # - gcc-4.2.0 has issues compiling modules on 64-bit
+# - gcc-5.4, 6.1, 6.2 don't copy the command_line around correctly
 checkbin:
 	@if test "$(cc-name)" != "clang" \
 	    && test "$(cc-version)" = "0304" ; then \
@@ -414,6 +415,16 @@ checkbin:
 		echo -n '*** Please use a different binutils version.' ; \
 		false ; \
 	fi
+	@if test "x${CONFIG_CPU_LITTLE_ENDIAN}" = "xy" \
+	    && { test "$(cc-version)" = "0504" \
+		 || test "$(cc-version)" = "0601" \
+		 || test "$(cc-version)" = "0602" ; } ; then \
+		echo -n '*** GCC-5.4 6.1 6.2 have a bad -O2 optimisation ' ; \
+		echo 'which will cause lost command_line options (at least).' ; \
+		echo '*** Please use a different GCC version.' ; \
+		false ; \
+	fi
+
 
 CLEAN_FILES += $(TOUT)
-- 
2.11.1
[PATCH V2 0/7] powerpc/mm/ppc64: Add 128TB support
This patch series increases the effective virtual address range of
applications from 64TB to 128TB. We do that by supporting a 68-bit
virtual address. On platforms that can only do a 65-bit virtual address
we limit the max contexts to a 16-bit value instead of 19 bits.

The patch series also switches the page table layout so that we can
cover a 512TB effective address range, but we still limit TASK_SIZE to
128TB. This is done to make sure we don't break applications that make
assumptions about the max address returned by the OS. We can switch to
128TB without a Linux personality value because other architectures
already use 128TB as the max address.

Aneesh Kumar K.V (7):
  powerpc/mm/slice: Convert slice_mask high slices to a bitmap
  powerpc/mm/slice: Update the function prototype
  powerpc/mm/hash: Move kernel context to the start of the context range
  powerpc/mm/hash: Support 68 bit VA
  powerpc/mm: Move copy_mm_to_paca to paca.c
  powerpc/mm: Remove redundant TASK_SIZE_USER64 checks
  powerpc/mm/hash: Increase VA range to 128TB

 arch/powerpc/include/asm/book3s/64/hash-4k.h  |   2 +-
 arch/powerpc/include/asm/book3s/64/hash-64k.h |   2 +-
 arch/powerpc/include/asm/book3s/64/mmu-hash.h | 160 +++---
 arch/powerpc/include/asm/mmu.h                |  19 ++-
 arch/powerpc/include/asm/mmu_context.h        |   2 +-
 arch/powerpc/include/asm/paca.h               |  18 +--
 arch/powerpc/include/asm/page_64.h            |  15 +--
 arch/powerpc/include/asm/processor.h          |  22 +++-
 arch/powerpc/kernel/paca.c                    |  26 +
 arch/powerpc/kvm/book3s_64_mmu_host.c         |  10 +-
 arch/powerpc/mm/hash_utils_64.c               |   9 +-
 arch/powerpc/mm/init_64.c                     |   4 -
 arch/powerpc/mm/mmu_context_book3s64.c        |  96 +++-
 arch/powerpc/mm/pgtable_64.c                  |   5 -
 arch/powerpc/mm/slb.c                         |   2 +-
 arch/powerpc/mm/slb_low.S                     |  74
 arch/powerpc/mm/slice.c                       | 149 ++--
 17 files changed, 379 insertions(+), 236 deletions(-)

-- 
2.7.4
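As a quick sanity check of the numbers in this cover letter (not part of the series, just the arithmetic spelled out in C):

/*
 * Back-of-the-envelope check of the address-space numbers quoted above:
 * 68-bit VA with 19 context bits, 256MB (2^28) segments, TASK_SIZE
 * capped at 2^47.
 */
#include <stdio.h>

int main(void)
{
	unsigned long long tb = 1ULL << 40;

	/* 68-bit VA, 19 context bits: per-context address space */
	printf("per-context: %llu TB\n", (1ULL << (68 - 19)) / tb);	/* 512 */
	/* TASK_SIZE is still capped at 2^47 */
	printf("TASK_SIZE:   %llu TB\n", (1ULL << 47) / tb);		/* 128 */
	/* a 65-bit VA keeps the same 512TB per context by dropping
	 * the context ID to 65 - 49 = 16 bits */
	printf("context bits on 65-bit VA: %d\n", 65 - 49);		/* 16 */
	return 0;
}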
[PATCH V2 1/7] powerpc/mm/slice: Convert slice_mask high slices to a bitmap
In followup patch we want to increase the va range which will result in us requiring high_slices to have more than 64 bits. To enable this convert high_slices to bitmap. We keep the number bits same in this patch and later change that to higher value Signed-off-by: Aneesh Kumar K.V --- arch/powerpc/include/asm/page_64.h | 15 +++--- arch/powerpc/mm/slice.c| 106 - 2 files changed, 76 insertions(+), 45 deletions(-) diff --git a/arch/powerpc/include/asm/page_64.h b/arch/powerpc/include/asm/page_64.h index dd5f0712afa2..7f72659b7999 100644 --- a/arch/powerpc/include/asm/page_64.h +++ b/arch/powerpc/include/asm/page_64.h @@ -98,19 +98,16 @@ extern u64 ppc64_pft_size; #define GET_LOW_SLICE_INDEX(addr) ((addr) >> SLICE_LOW_SHIFT) #define GET_HIGH_SLICE_INDEX(addr) ((addr) >> SLICE_HIGH_SHIFT) +#ifndef __ASSEMBLY__ /* - * 1 bit per slice and we have one slice per 1TB - * Right now we support only 64TB. - * IF we change this we will have to change the type - * of high_slices + * One bit per slice. We have lower slices which cover 256MB segments + * upto 4G range. That gets us 16 low slices. For the rest we track slices + * in 1TB size. + * 64 below is actually SLICE_NUM_HIGH to fixup complie errros */ -#define SLICE_MASK_SIZE 8 - -#ifndef __ASSEMBLY__ - struct slice_mask { u16 low_slices; - u64 high_slices; + DECLARE_BITMAP(high_slices, 64); }; struct mm_struct; diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c index 2b27458902ee..84dfb2b58870 100644 --- a/arch/powerpc/mm/slice.c +++ b/arch/powerpc/mm/slice.c @@ -36,11 +36,6 @@ #include #include -/* some sanity checks */ -#if (H_PGTABLE_RANGE >> 43) > SLICE_MASK_SIZE -#error H_PGTABLE_RANGE exceeds slice_mask high_slices size -#endif - static DEFINE_SPINLOCK(slice_convert_lock); @@ -49,7 +44,7 @@ int _slice_debug = 1; static void slice_print_mask(const char *label, struct slice_mask mask) { - char*p, buf[16 + 3 + 64 + 1]; + char*p, buf[SLICE_NUM_LOW + 3 + SLICE_NUM_HIGH + 1]; int i; if (!_slice_debug) @@ -60,8 +55,12 @@ static void slice_print_mask(const char *label, struct slice_mask mask) *(p++) = ' '; *(p++) = '-'; *(p++) = ' '; - for (i = 0; i < SLICE_NUM_HIGH; i++) - *(p++) = (mask.high_slices & (1ul << i)) ? 
'1' : '0'; + for (i = 0; i < SLICE_NUM_HIGH; i++) { + if (test_bit(i, mask.high_slices)) + *(p++) = '1'; + else + *(p++) = '0'; + } *(p++) = 0; printk(KERN_DEBUG "%s:%s\n", label, buf); @@ -80,7 +79,10 @@ static struct slice_mask slice_range_to_mask(unsigned long start, unsigned long len) { unsigned long end = start + len - 1; - struct slice_mask ret = { 0, 0 }; + struct slice_mask ret; + + ret.low_slices = 0; + bitmap_zero(ret.high_slices, SLICE_NUM_HIGH); if (start < SLICE_LOW_TOP) { unsigned long mend = min(end, SLICE_LOW_TOP); @@ -91,9 +93,8 @@ static struct slice_mask slice_range_to_mask(unsigned long start, } if ((start + len) > SLICE_LOW_TOP) - ret.high_slices = (1ul << (GET_HIGH_SLICE_INDEX(end) + 1)) - - (1ul << GET_HIGH_SLICE_INDEX(start)); - + bitmap_set(ret.high_slices, GET_HIGH_SLICE_INDEX(start), + GET_HIGH_SLICE_INDEX(len)); return ret; } @@ -130,9 +131,12 @@ static int slice_high_has_vma(struct mm_struct *mm, unsigned long slice) static struct slice_mask slice_mask_for_free(struct mm_struct *mm) { - struct slice_mask ret = { 0, 0 }; + struct slice_mask ret; unsigned long i; + ret.low_slices = 0; + bitmap_zero(ret.high_slices, SLICE_NUM_HIGH); + for (i = 0; i < SLICE_NUM_LOW; i++) if (!slice_low_has_vma(mm, i)) ret.low_slices |= 1u << i; @@ -142,7 +146,7 @@ static struct slice_mask slice_mask_for_free(struct mm_struct *mm) for (i = 0; i < SLICE_NUM_HIGH; i++) if (!slice_high_has_vma(mm, i)) - ret.high_slices |= 1ul << i; + __set_bit(i, ret.high_slices); return ret; } @@ -151,10 +155,13 @@ static struct slice_mask slice_mask_for_size(struct mm_struct *mm, int psize) { unsigned char *hpsizes; int index, mask_index; - struct slice_mask ret = { 0, 0 }; + struct slice_mask ret; unsigned long i; u64 lpsizes; + ret.low_slices = 0; + bitmap_zero(ret.high_slices, SLICE_NUM_HIGH); + lpsizes = mm->context.low_slices_psize; for (i = 0; i < SLICE_NUM_LOW; i++) if (((lpsizes >> (i * 4)) & 0xf) == psize) @@ -165,7 +172,7 @@ static struct slice_mask slice_mask_for_size(struct mm_struct
[PATCH V2 2/7] powerpc/mm/slice: Update the function prototype
This avoid copying the slice_mask struct as function return value Signed-off-by: Aneesh Kumar K.V --- arch/powerpc/mm/slice.c | 63 +++-- 1 file changed, 29 insertions(+), 34 deletions(-) diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c index 84dfb2b58870..0ec750adbe7f 100644 --- a/arch/powerpc/mm/slice.c +++ b/arch/powerpc/mm/slice.c @@ -75,27 +75,26 @@ static void slice_print_mask(const char *label, struct slice_mask mask) {} #endif -static struct slice_mask slice_range_to_mask(unsigned long start, -unsigned long len) +static void slice_range_to_mask(unsigned long start, unsigned long len, + struct slice_mask *ret) { unsigned long end = start + len - 1; - struct slice_mask ret; - ret.low_slices = 0; - bitmap_zero(ret.high_slices, SLICE_NUM_HIGH); + ret->low_slices = 0; + bitmap_zero(ret->high_slices, SLICE_NUM_HIGH); if (start < SLICE_LOW_TOP) { unsigned long mend = min(end, SLICE_LOW_TOP); unsigned long mstart = min(start, SLICE_LOW_TOP); - ret.low_slices = (1u << (GET_LOW_SLICE_INDEX(mend) + 1)) + ret->low_slices = (1u << (GET_LOW_SLICE_INDEX(mend) + 1)) - (1u << GET_LOW_SLICE_INDEX(mstart)); } if ((start + len) > SLICE_LOW_TOP) - bitmap_set(ret.high_slices, GET_HIGH_SLICE_INDEX(start), + bitmap_set(ret->high_slices, GET_HIGH_SLICE_INDEX(start), GET_HIGH_SLICE_INDEX(len)); - return ret; + return; } static int slice_area_is_free(struct mm_struct *mm, unsigned long addr, @@ -129,53 +128,47 @@ static int slice_high_has_vma(struct mm_struct *mm, unsigned long slice) return !slice_area_is_free(mm, start, end - start); } -static struct slice_mask slice_mask_for_free(struct mm_struct *mm) +static void slice_mask_for_free(struct mm_struct *mm, struct slice_mask *ret) { - struct slice_mask ret; unsigned long i; - ret.low_slices = 0; - bitmap_zero(ret.high_slices, SLICE_NUM_HIGH); + ret->low_slices = 0; + bitmap_zero(ret->high_slices, SLICE_NUM_HIGH); for (i = 0; i < SLICE_NUM_LOW; i++) if (!slice_low_has_vma(mm, i)) - ret.low_slices |= 1u << i; + ret->low_slices |= 1u << i; if (mm->task_size <= SLICE_LOW_TOP) - return ret; + return; for (i = 0; i < SLICE_NUM_HIGH; i++) if (!slice_high_has_vma(mm, i)) - __set_bit(i, ret.high_slices); - - return ret; + __set_bit(i, ret->high_slices); } -static struct slice_mask slice_mask_for_size(struct mm_struct *mm, int psize) +static void slice_mask_for_size(struct mm_struct *mm, int psize, struct slice_mask *ret) { unsigned char *hpsizes; int index, mask_index; - struct slice_mask ret; unsigned long i; u64 lpsizes; - ret.low_slices = 0; - bitmap_zero(ret.high_slices, SLICE_NUM_HIGH); + ret->low_slices = 0; + bitmap_zero(ret->high_slices, SLICE_NUM_HIGH); lpsizes = mm->context.low_slices_psize; for (i = 0; i < SLICE_NUM_LOW; i++) if (((lpsizes >> (i * 4)) & 0xf) == psize) - ret.low_slices |= 1u << i; + ret->low_slices |= 1u << i; hpsizes = mm->context.high_slices_psize; for (i = 0; i < SLICE_NUM_HIGH; i++) { mask_index = i & 0x1; index = i >> 1; if (((hpsizes[index] >> (mask_index * 4)) & 0xf) == psize) - __set_bit(i, ret.high_slices); + __set_bit(i, ret->high_slices); } - - return ret; } static int slice_check_fit(struct slice_mask mask, struct slice_mask available) @@ -457,7 +450,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, /* First make up a "good" mask of slices that have the right size * already */ - good_mask = slice_mask_for_size(mm, psize); + slice_mask_for_size(mm, psize, &good_mask); slice_print_mask(" good_mask", good_mask); /* @@ -482,7 +475,7 @@ unsigned long slice_get_unmapped_area(unsigned long 
addr, unsigned long len, #ifdef CONFIG_PPC_64K_PAGES /* If we support combo pages, we can allow 64k pages in 4k slices */ if (psize == MMU_PAGE_64K) { - compat_mask = slice_mask_for_size(mm, MMU_PAGE_4K); + slice_mask_for_size(mm, MMU_PAGE_4K, &compat_mask); if (fixed) slice_or_mask(&good_mask, &compat_mask); } @@ -491,7 +484,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, /* First
[PATCH V2 3/7] powerpc/mm/hash: Move kernel context to the start of the context range
With current kernel, we use the top 4 context for the kernel. Kernel VSIDs are built using these top context values and effective segemnt ID. In the following patches, we want to increase the max effective address to 512TB. We achieve that by increasing the effective segments IDs there by increasing virtual address range. We will be switching to a 68bit virtual address in the following patch. But for platforms like p4 and p5, which only support a 65 bit va, we want to limit the virtual addrress to a 65 bit value. We do that by limiting the context bits to 16 instead of 19. That means we will have different max context values on different platforms. To make this simpler. we move the kernel context to the starting of the range. Signed-off-by: Aneesh Kumar K.V --- arch/powerpc/include/asm/book3s/64/mmu-hash.h | 39 ++-- arch/powerpc/include/asm/mmu_context.h| 2 +- arch/powerpc/kvm/book3s_64_mmu_host.c | 2 +- arch/powerpc/mm/hash_utils_64.c | 5 -- arch/powerpc/mm/mmu_context_book3s64.c| 88 ++- arch/powerpc/mm/slb_low.S | 20 ++ 6 files changed, 84 insertions(+), 72 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h index 8720a406bbbe..0a86cb2022d2 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h @@ -496,10 +496,10 @@ extern void slb_set_size(u16 size); * For user processes max context id is limited to ((1ul << 19) - 5) * for kernel space, we use the top 4 context ids to map address as below * NOTE: each context only support 64TB now. - * 0x7fffc - [ 0xc000 - 0xc0003fff ] - * 0x7fffd - [ 0xd000 - 0xd0003fff ] - * 0x7fffe - [ 0xe000 - 0xe0003fff ] - * 0x7 - [ 0xf000 - 0xf0003fff ] + * 0x0 - [ 0xc000 - 0xc0003fff ] + * 0x1 - [ 0xd000 - 0xd0003fff ] + * 0x2 - [ 0xe000 - 0xe0003fff ] + * 0x3 - [ 0xf000 - 0xf0003fff ] * * The proto-VSIDs are then scrambled into real VSIDs with the * multiplicative hash: @@ -513,15 +513,9 @@ extern void slb_set_size(u16 size); * robust scattering in the hash table (at least based on some initial * results). * - * We also consider VSID 0 special. We use VSID 0 for slb entries mapping - * bad address. This enables us to consolidate bad address handling in - * hash_page. - * * We also need to avoid the last segment of the last context, because that * would give a protovsid of 0x1f. That will result in a VSID 0 - * because of the modulo operation in vsid scramble. But the vmemmap - * (which is what uses region 0xf) will never be close to 64TB in size - * (it's 56 bytes per page of system memory). + * because of the modulo operation in vsid scramble. */ #define CONTEXT_BITS 19 @@ -533,12 +527,15 @@ extern void slb_set_size(u16 size); /* * 256MB segment * The proto-VSID space has 2^(CONTEX_BITS + ESID_BITS) - 1 segments - * available for user + kernel mapping. The top 4 contexts are used for + * available for user + kernel mapping. The bottom 4 contexts are used for * kernel mapping. Each segment contains 2^28 bytes. Each - * context maps 2^46 bytes (64TB) so we can support 2^19-1 contexts - * (19 == 37 + 28 - 46). + * context maps 2^46 bytes (64TB). + * + * We also need to avoid the last segment of the last context, because that + * would give a protovsid of 0x1f. That will result in a VSID 0 + * because of the modulo operation in vsid scramble. 
*/ -#define MAX_USER_CONTEXT ((ASM_CONST(1) << CONTEXT_BITS) - 5) +#define MAX_USER_CONTEXT ((ASM_CONST(1) << CONTEXT_BITS) - 2) /* * This should be computed such that protovosid * vsid_mulitplier @@ -674,19 +671,19 @@ static inline unsigned long get_vsid(unsigned long context, unsigned long ea, * This is only valid for addresses >= PAGE_OFFSET * * For kernel space, we use the top 4 context ids to map address as below - * 0x7fffc - [ 0xc000 - 0xc0003fff ] - * 0x7fffd - [ 0xd000 - 0xd0003fff ] - * 0x7fffe - [ 0xe000 - 0xe0003fff ] - * 0x7 - [ 0xf000 - 0xf0003fff ] + * 0x0 - [ 0xc000 - 0xc0003fff ] + * 0x1 - [ 0xd000 - 0xd0003fff ] + * 0x2 - [ 0xe000 - 0xe0003fff ] + * 0x3 - [ 0xf000 - 0xf0003fff ] */ static inline unsigned long get_kernel_vsid(unsigned long ea, int ssize) { unsigned long context; /* -* kernel take the top 4 context from the available range +* kernel take the first 4 context from the available range */ - context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1; + context = (ea >> 60) -
[PATCH V2 4/7] powerpc/mm/hash: Support 68 bit VA
Inorder to support large effective address range (512TB), we want to increase the virtual address bits to 68. But we do have platforms like p4 and p5 that can only do 65 bit VA. We support those platforms by limiting context bits on them to 16. The protovsid -> vsid conversion is verified to work with both 65 and 68 bit va values. I also documented the restrictions in a table format as part of code comments. Signed-off-by: Aneesh Kumar K.V --- arch/powerpc/include/asm/book3s/64/mmu-hash.h | 123 -- arch/powerpc/include/asm/mmu.h| 19 ++-- arch/powerpc/kvm/book3s_64_mmu_host.c | 8 +- arch/powerpc/mm/mmu_context_book3s64.c| 8 +- arch/powerpc/mm/slb_low.S | 54 +-- 5 files changed, 150 insertions(+), 62 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h index 0a86cb2022d2..c7df4d60744e 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h @@ -42,6 +42,7 @@ /* Bits in the SLB VSID word */ #define SLB_VSID_SHIFT 12 +#define SLB_VSID_SHIFT_256M12 #define SLB_VSID_SHIFT_1T 24 #define SLB_VSID_SSIZE_SHIFT 62 #define SLB_VSID_B ASM_CONST(0xc000) @@ -518,9 +519,19 @@ extern void slb_set_size(u16 size); * because of the modulo operation in vsid scramble. */ +/* + * Max Va bits we support as of now is 68 bits. We want 19 bit + * context ID. + * Restrictions: + * GPU has restrictions of not able to access beyond 128TB + * (47 bit effective address). We also cannot do more than 20bit PID. + * For p4 and p5 which can only do 65 bit VA, we restrict our CONTEXT_BITS + * to 16 bits (ie, we can only have 2^16 pids at the same time). + */ +#define VA_BITS68 #define CONTEXT_BITS 19 -#define ESID_BITS 18 -#define ESID_BITS_1T 6 +#define ESID_BITS (VA_BITS - (SID_SHIFT + CONTEXT_BITS)) +#define ESID_BITS_1T (VA_BITS - (SID_SHIFT_1T + CONTEXT_BITS)) #define ESID_BITS_MASK ((1 << ESID_BITS) - 1) #define ESID_BITS_1T_MASK ((1 << ESID_BITS_1T) - 1) @@ -529,62 +540,54 @@ extern void slb_set_size(u16 size); * The proto-VSID space has 2^(CONTEX_BITS + ESID_BITS) - 1 segments * available for user + kernel mapping. The bottom 4 contexts are used for * kernel mapping. Each segment contains 2^28 bytes. Each - * context maps 2^46 bytes (64TB). + * context maps 2^49 bytes (512TB). * * We also need to avoid the last segment of the last context, because that * would give a protovsid of 0x1f. That will result in a VSID 0 * because of the modulo operation in vsid scramble. */ #define MAX_USER_CONTEXT ((ASM_CONST(1) << CONTEXT_BITS) - 2) +/* + * For platforms that support on 65bit VA we limit the context bits + */ +#define MAX_USER_CONTEXT_65BIT_VA ((ASM_CONST(1) << (65 - (SID_SHIFT + ESID_BITS))) - 2) /* * This should be computed such that protovosid * vsid_mulitplier * doesn't overflow 64 bits. It should also be co-prime to vsid_modulus + * We also need to make sure that number of bits in divisor is less + * than twice the number of protovsid bits for our modulus optmization to work. + * The below table shows the current values used. 
+ * + * |---++++--| + * | | Prime Bits | VSID_BITS_65VA | Total Bits | 2* VSID_BITS | + * |---++++--| + * | 1T| 24 | 25 | 49 | 50 | + * |---++++--| + * | 256MB | 24 | 37 | 61 | 74 | + * |---++++--| + * + * |---++++--| + * | | Prime Bits | VSID_BITS_68VA | Total Bits | 2* VSID_BITS | + * |---++++--| + * | 1T| 24 | 28 | 52 | 56 | + * |---++++--| + * | 256MB | 24 | 40 | 64 | 80 | + * |---++++--| + * */ #define VSID_MULTIPLIER_256M ASM_CONST(12538073) /* 24-bit prime */ -#define VSID_BITS_256M (CONTEXT_BITS + ESID_BITS) +#define VSID_BITS_256M (VA_BITS - SID_SHIFT) #define VSID_MODULUS_256M ((1UL<= \ -* 2^36-1, then r3+1 has the 2^36 bit set. So, if r3+1 has \ -* the bit clear, r3 already has the answer we want, if it \ -* doesn't, the answer is the low 36 bits of r3+1. So in all \ -* cases the answer is the low 36 bits of (r3 + ((r3+1) >> 36))*/\ - addirx,rt,1;\ -
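As a reviewer aid (not part of the patch), the bit-width table documented above can be recomputed from SID_SHIFT = 28 (256MB segments), SID_SHIFT_1T = 40 (1TB segments) and the 24-bit prime multiplier:

/*
 * Recompute the VSID bit-width table from the patch: VSID_BITS is
 * VA_BITS - SID_SHIFT, "Total Bits" adds the 24-bit prime multiplier,
 * and 2*VSID_BITS is the column used for the modulus optimization check.
 */
#include <stdio.h>

int main(void)
{
	int prime_bits = 24;
	int va_bits[] = { 65, 68 };
	int sid_shift[] = { 28, 40 };		/* 256MB, 1TB segments */
	const char *seg[] = { "256MB", "1T" };

	for (int v = 0; v < 2; v++)
		for (int s = 0; s < 2; s++) {
			int vsid_bits = va_bits[v] - sid_shift[s];

			printf("VA=%d %-5s: VSID_BITS=%d total=%d 2*VSID_BITS=%d\n",
			       va_bits[v], seg[s], vsid_bits,
			       prime_bits + vsid_bits, 2 * vsid_bits);
		}
	return 0;
}

The output matches the table: 37/61/74 and 25/49/50 for the 65-bit VA rows, 40/64/80 and 28/52/56 for the 68-bit VA rows, so the totals stay within 64 bits as required.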
[PATCH V2 5/7] powerpc/mm: Move copy_mm_to_paca to paca.c
We will be updating this later to use struct mm_struct. Move this so that function finds the definition of struct mm_struct; Signed-off-by: Aneesh Kumar K.V --- arch/powerpc/include/asm/paca.h | 18 +- arch/powerpc/kernel/paca.c | 19 +++ arch/powerpc/mm/hash_utils_64.c | 4 ++-- arch/powerpc/mm/slb.c | 2 +- arch/powerpc/mm/slice.c | 2 +- 5 files changed, 24 insertions(+), 21 deletions(-) diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 6a6792bb39fb..f25d3c93a30f 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -207,23 +207,7 @@ struct paca_struct { #endif }; -#ifdef CONFIG_PPC_BOOK3S -static inline void copy_mm_to_paca(mm_context_t *context) -{ - get_paca()->mm_ctx_id = context->id; -#ifdef CONFIG_PPC_MM_SLICES - get_paca()->mm_ctx_low_slices_psize = context->low_slices_psize; - memcpy(&get_paca()->mm_ctx_high_slices_psize, - &context->high_slices_psize, SLICE_ARRAY_SIZE); -#else - get_paca()->mm_ctx_user_psize = context->user_psize; - get_paca()->mm_ctx_sllp = context->sllp; -#endif -} -#else -static inline void copy_mm_to_paca(mm_context_t *context){} -#endif - +extern void copy_mm_to_paca(struct mm_struct *mm); extern struct paca_struct *paca; extern void initialise_paca(struct paca_struct *new_paca, int cpu); extern void setup_paca(struct paca_struct *new_paca); diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c index fa20060ff7a5..b64daf124fee 100644 --- a/arch/powerpc/kernel/paca.c +++ b/arch/powerpc/kernel/paca.c @@ -244,3 +244,22 @@ void __init free_unused_pacas(void) free_lppacas(); } + +void copy_mm_to_paca(struct mm_struct *mm) +{ +#ifdef CONFIG_PPC_BOOK3S + mm_context_t *context = &mm->context; + + get_paca()->mm_ctx_id = context->id; +#ifdef CONFIG_PPC_MM_SLICES + get_paca()->mm_ctx_low_slices_psize = context->low_slices_psize; + memcpy(&get_paca()->mm_ctx_high_slices_psize, + &context->high_slices_psize, SLICE_ARRAY_SIZE); +#else /* CONFIG_PPC_MM_SLICES */ + get_paca()->mm_ctx_user_psize = context->user_psize; + get_paca()->mm_ctx_sllp = context->sllp; +#endif +#else /* CONFIG_PPC_BOOK3S */ + return; +#endif +} diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 978314b6b8d7..67937a6eb541 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -1084,7 +1084,7 @@ void demote_segment_4k(struct mm_struct *mm, unsigned long addr) copro_flush_all_slbs(mm); if ((get_paca_psize(addr) != MMU_PAGE_4K) && (current->mm == mm)) { - copy_mm_to_paca(&mm->context); + copy_mm_to_paca(mm); slb_flush_and_rebolt(); } } @@ -1156,7 +1156,7 @@ static void check_paca_psize(unsigned long ea, struct mm_struct *mm, { if (user_region) { if (psize != get_paca_psize(ea)) { - copy_mm_to_paca(&mm->context); + copy_mm_to_paca(mm); slb_flush_and_rebolt(); } } else if (get_paca()->vmalloc_sllp != diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c index 48fc28bab544..15157b14b0b6 100644 --- a/arch/powerpc/mm/slb.c +++ b/arch/powerpc/mm/slb.c @@ -227,7 +227,7 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm) asm volatile("slbie %0" : : "r" (slbie_data)); get_paca()->slb_cache_ptr = 0; - copy_mm_to_paca(&mm->context); + copy_mm_to_paca(mm); /* * preload some userspace segments into the SLB. 
diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c index 0ec750adbe7f..116868bb91f5 100644 --- a/arch/powerpc/mm/slice.c +++ b/arch/powerpc/mm/slice.c @@ -190,7 +190,7 @@ static void slice_flush_segments(void *parm) if (mm != current->active_mm) return; - copy_mm_to_paca(¤t->active_mm->context); + copy_mm_to_paca(current->active_mm); local_irq_save(flags); slb_flush_and_rebolt(); -- 2.7.4
[PATCH V2 7/7] powerpc/mm/hash: Increase VA range to 128TB
We update the hash linux page table layout such that we can support 512TB. But we limit the TASK_SIZE to 128TB. We can switch to 128TB by default without conditional because that is the max virtual address supported by other architectures. We will later add a mechanism to on-demand increase the application's effective address range to 512TB. Having the page table layout changed to accommodate 512TB makes testing large memory configuration easier with less code changes to kernel Signed-off-by: Aneesh Kumar K.V --- arch/powerpc/include/asm/book3s/64/hash-4k.h | 2 +- arch/powerpc/include/asm/book3s/64/hash-64k.h | 2 +- arch/powerpc/include/asm/page_64.h| 2 +- arch/powerpc/include/asm/processor.h | 22 ++ arch/powerpc/kernel/paca.c| 9 - arch/powerpc/mm/slice.c | 2 ++ 6 files changed, 31 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h index 0c4e470571ca..b4b5e6b671ca 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-4k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h @@ -8,7 +8,7 @@ #define H_PTE_INDEX_SIZE 9 #define H_PMD_INDEX_SIZE 7 #define H_PUD_INDEX_SIZE 9 -#define H_PGD_INDEX_SIZE 9 +#define H_PGD_INDEX_SIZE 12 #ifndef __ASSEMBLY__ #define H_PTE_TABLE_SIZE (sizeof(pte_t) << H_PTE_INDEX_SIZE) diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h index b39f0b86405e..682c4eb28fa4 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-64k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h @@ -4,7 +4,7 @@ #define H_PTE_INDEX_SIZE 8 #define H_PMD_INDEX_SIZE 5 #define H_PUD_INDEX_SIZE 5 -#define H_PGD_INDEX_SIZE 12 +#define H_PGD_INDEX_SIZE 15 /* * 64k aligned address free up few of the lower bits of RPN for us diff --git a/arch/powerpc/include/asm/page_64.h b/arch/powerpc/include/asm/page_64.h index 7f72659b7999..9b60e9455c6e 100644 --- a/arch/powerpc/include/asm/page_64.h +++ b/arch/powerpc/include/asm/page_64.h @@ -107,7 +107,7 @@ extern u64 ppc64_pft_size; */ struct slice_mask { u16 low_slices; - DECLARE_BITMAP(high_slices, 64); + DECLARE_BITMAP(high_slices, 512); }; struct mm_struct; diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index 1ba814436c73..1d4e34f9004d 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -102,11 +102,25 @@ void release_thread(struct task_struct *); #endif #ifdef CONFIG_PPC64 -/* 64-bit user address space is 46-bits (64TB user VM) */ -#define TASK_SIZE_USER64 (0x4000UL) +/* + * 64-bit user address space can have multiple limits + * For now supported values are: + */ +#define TASK_SIZE_64TB (0x4000UL) +#define TASK_SIZE_128TB (0x8000UL) +#define TASK_SIZE_512TB (0x0002UL) -/* - * 32-bit user address space is 4GB - 1 page +#ifdef CONFIG_PPC_BOOK3S_64 +/* + * MAx value currently used: + */ +#define TASK_SIZE_USER64 TASK_SIZE_128TB +#else +#define TASK_SIZE_USER64 TASK_SIZE_64TB +#endif + +/* + * 32-bit user address space is 4GB - 1 page * (this 1 page is needed so referencing of 0x generates EFAULT */ #define TASK_SIZE_USER32 (0x0001UL - (1*PAGE_SIZE)) diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c index b64daf124fee..c7ca70dc3ba5 100644 --- a/arch/powerpc/kernel/paca.c +++ b/arch/powerpc/kernel/paca.c @@ -253,8 +253,15 @@ void copy_mm_to_paca(struct mm_struct *mm) get_paca()->mm_ctx_id = context->id; #ifdef CONFIG_PPC_MM_SLICES get_paca()->mm_ctx_low_slices_psize = context->low_slices_psize; + /* +* We support upto 
128TB for now. Hence copy only 128/2 bytes. +* Later when we support tasks with different max effective +* address, we can optimize this based on mm->task_size. +*/ + BUILD_BUG_ON(TASK_SIZE_USER64 != TASK_SIZE_128TB); memcpy(&get_paca()->mm_ctx_high_slices_psize, - &context->high_slices_psize, SLICE_ARRAY_SIZE); + &context->high_slices_psize, TASK_SIZE_128TB >> 41); + #else /* CONFIG_PPC_MM_SLICES */ get_paca()->mm_ctx_user_psize = context->user_psize; get_paca()->mm_ctx_sllp = context->sllp; diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c index 116868bb91f5..b3f45e413a60 100644 --- a/arch/powerpc/mm/slice.c +++ b/arch/powerpc/mm/slice.c @@ -407,6 +407,8 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, struct mm_struct *mm = current->mm; unsigned long newaddr; + /* Make sure high_slices bitmap size is same as we expected */ + BUILD_BUG_ON(512 != SLICE_NUM_HIGH); /* * init different masks */ -- 2.7.4
[PATCH V2 6/7] powerpc/mm: Remove redundant TASK_SIZE_USER64 checks
The check against VSID range is implied when we check task size against hash and radix pgtable range[1], because we make sure page table range cannot exceed vsid range. [1] BUILD_BUG_ON(TASK_SIZE_USER64 > H_PGTABLE_RANGE); BUILD_BUG_ON(TASK_SIZE_USER64 > RADIX_PGTABLE_RANGE); The check for smaller task size is also removed here, because the follow up patch will support a tasksize smaller than pgtable range. Signed-off-by: Aneesh Kumar K.V --- arch/powerpc/mm/init_64.c| 4 arch/powerpc/mm/pgtable_64.c | 5 - 2 files changed, 9 deletions(-) diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index 93abf8a9813d..f3e856e6ee23 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -69,10 +69,6 @@ #if H_PGTABLE_RANGE > USER_VSID_RANGE #warning Limited user VSID range means pagetable space is wasted #endif - -#if (TASK_SIZE_USER64 < H_PGTABLE_RANGE) && (TASK_SIZE_USER64 < USER_VSID_RANGE) -#warning TASK_SIZE is smaller than it needs to be. -#endif #endif /* CONFIG_PPC_STD_MMU_64 */ phys_addr_t memstart_addr = ~0; diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 8bca7f58afc4..06e23e0b1b81 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -55,11 +55,6 @@ #include "mmu_decl.h" -#ifdef CONFIG_PPC_STD_MMU_64 -#if TASK_SIZE_USER64 > (1UL << (ESID_BITS + SID_SHIFT)) -#error TASK_SIZE_USER64 exceeds user VSID range -#endif -#endif #ifdef CONFIG_PPC_BOOK3S_64 /* -- 2.7.4
[PATCH] powerpc/xmon: add an option to turn off xmon
Once xmon is triggered, there is no interface to turn it off again,
even though the code to disable/enable xmon already exists. More
importantly, a system reset interrupt on PowerVM will fire an oops to
produce a dump, and at that point xmon should not be triggered.

So add a 'z' option that can follow the current 'x|X' exit commands:
xmon is turned off if 'z' follows.

Signed-off-by: Pan Xinhui
---
 arch/powerpc/xmon/xmon.c | 12 +---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 9c0e17c..2f4e7b1 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -76,6 +76,7 @@ static int xmon_gate;
 #endif /* CONFIG_SMP */
 
 static unsigned long in_xmon __read_mostly = 0;
+static int xmon_off = 0;
 static unsigned long adrs;
 static int size = 1;
@@ -255,8 +256,8 @@ Commands:\n\
   Sr #	read SPR #\n\
   Sw #v	write v to SPR #\n\
   t	print backtrace\n\
-  x	exit monitor and recover\n\
-  X	exit monitor and don't recover\n"
+  x[z]	exit monitor and recover, turn off xmon with 'z'\n\
+  X[z]	exit monitor and don't recover, turn off xmon with 'z'\n"
 #if defined(CONFIG_PPC64) && !defined(CONFIG_PPC_BOOK3E)
 "  u	dump segment table or SLB\n"
 #elif defined(CONFIG_PPC_STD_MMU_32)
@@ -952,6 +953,8 @@ cmds(struct pt_regs *excp)
 			break;
 		case 'x':
 		case 'X':
+			if (inchar() == 'z')
+				xmon_off = 1;
 			return cmd;
 		case EOF:
 			printf(" \n");
@@ -3248,8 +3251,11 @@ static void xmon_init(int enable)
 static void sysrq_handle_xmon(int key)
 {
 	/* ensure xmon is enabled */
+	xmon_off = 0;
 	xmon_init(1);
 	debugger(get_irq_regs());
+	if (xmon_off)
+		xmon_init(0);
 }
 
 static struct sysrq_key_op sysrq_xmon_op = {
@@ -3266,7 +3272,7 @@ static int __init setup_xmon_sysrq(void)
 __initcall(setup_xmon_sysrq);
 #endif /* CONFIG_MAGIC_SYSRQ */
 
-static int __initdata xmon_early, xmon_off;
+static int __initdata xmon_early;
 static int __init early_parse_xmon(char *p)
 {
-- 
2.4.11
Re: [PATCH 1/2] powerpc/powernv/opal-dump : Handles opal_dump_info properly
Hi Michael,

Can you please look at this patchset?

-Mukesh

On Tuesday 06 December 2016 12:07 PM, Mukesh Ojha wrote:
Hi Michael,

Can you please have a look at this patchset, as there are no functional
changes involved?

Thanks,
Mukesh

On Thursday 01 December 2016 02:38 PM, Mukesh Ojha wrote:
Move the return value check of 'opal_dump_info' to the proper place;
previously all the dump info was filled in unnecessarily even on
failure.

Signed-off-by: Mukesh Ojha
---
 arch/powerpc/platforms/powernv/opal-dump.c | 9 ++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/opal-dump.c b/arch/powerpc/platforms/powernv/opal-dump.c
index 4c82782..ae32212 100644
--- a/arch/powerpc/platforms/powernv/opal-dump.c
+++ b/arch/powerpc/platforms/powernv/opal-dump.c
@@ -225,13 +225,16 @@ static int64_t dump_read_info(uint32_t *dump_id, uint32_t *dump_size, uint32_t *
 	if (rc == OPAL_PARAMETER)
 		rc = opal_dump_info(&id, &size);
 
+	if (rc) {
+		pr_warn("%s: Failed to get dump info (%d)\n",
+			__func__, rc);
+		return rc;
+	}
+
 	*dump_id = be32_to_cpu(id);
 	*dump_size = be32_to_cpu(size);
 	*dump_type = be32_to_cpu(type);
 
-	if (rc)
-		pr_warn("%s: Failed to get dump info (%d)\n",
-			__func__, rc);
 	return rc;
 }