On Sun, Mar 24, 2019 at 03:29:04PM +0100, Thomas Gleixner wrote:
>Wei,
>
>On Tue, 12 Feb 2019, Wei Yang wrote:
>>
>> This patch changes the implementation from the first perception to the
>> second to reduce one different handling on end_pfn. After doing so, the
>> code is easier to read.
>
>It's maybe slightly easier to read, but it's still completely unreadable
>garbage.
>
>  Not your fault, it was garbage before.
>
>But refining garbage still results in garbage. Just the smell is slightly
>different.
>
>Why?
>
> 1) Doing all the calculations PFN based is just a pointless
>    indirection. Just look at all the rounding magic and back and forth
>    conversions.
>
>    All of that can be done purely address/size based which makes the code
>    truly readable.
>
> 2) The 5(3) sections are more or less copied code with a few changes of
>    constants, except for the first section (A) which has an extra quirk
>    for 32bit. Plus all the 64bit vs. 32bit #ifdeffery which is not making
>    it any better.
>
>    This copied mess can be avoided by using helper functions and proper
>    loops.
>
> 3) During the bootmem phase the code tries to preserve large mappings
>    _AFTER_ splitting them up and then it tries to merge the resulting
>    overlaps.
>
>    This is completely backwards because the expansion of the range can be
>    tried right away when the mapping of a large page is attempted. Surely
>    not with the current mess, but with a proper loop based approach it can
>    be done nicely.
>
>    Btw, there is a bug in that expansion code which could result in
>    undoing the enforced 4K mapping of the lower 2M/4M range on 32bit. It's
>    probably not an issue in practice because the low range is usually not
>    contiguous. But it works by chance not by design.
>
> 4) The debug print string retrieval function is just silly.
>
>The logic for rewriting this is pretty obvious:
>
>    while (addr < end) {
>        setup_mr(mr, addr, end);
>        for_each_map_size(1G, 2M, 4K) {
>            if (try_map(mr, size))
>                break;
>        }
>        addr = mr->end;
>    }
>
>    setup_mr() takes care of the 32bit 0-2/4M range by limiting the
>    range and setting the allowed size mask in mr to 4k only.
>
>    try_map() does:
>
>      1) Try to expand the range to preserve large pages during bootmem
>
>      2) If the range start is not aligned, limit the end to the large
>         size boundary, so the next lower map size will only cover the
>         unaligned portion.
>
>      3) If the range end is not aligned, fit as much large
>         size as possible.
>
>    No magic A-E required, because it just falls into place naturally and
>    the expansion is done at the right place and not after the fact.
>
>Find a mostly untested patch which implements this below. I just booted it
>in a 64bit guest and it did not explode.
>
>It removes 55 lines of code instead of adding 35 and reduces the binary
>size by 408 bytes on 64bit and 128 bytes on 32bit.
>
>Note, it's a combo of changes (including your patch 1/6) and needs to be
>split up. It would be nice if you have time to split it up into separate
>patches, add proper changelogs and test the heck out of it on both 32 and
>64 bit. If you don't have time, please let me know.
>

Thanks for your suggestions :-) Just a quick heads-up: I will try to
understand the code, split it up and test it on both architectures.

BTW, do you have any suggestions for the testing? Currently I only do a
boot test, which I think basically covers the cases, but maybe you have a
better idea.
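To go a bit beyond a plain boot test, my current plan is to lift the new
splitting logic into a small user-space harness, feed it corner cases
(unaligned start/end, ranges crossing the 1G boundary) and compare the
result with the split_mem_range() debug output of the rebuilt kernel
(assuming pr_debug() is enabled for init.c). A rough sketch of what I mean
is below; it only mocks the alignment handling of mr_try_map() and the
outer loop, leaves out mr_expand()/memblock and the 32bit low 2M quirk,
and everything apart from the names taken from your patch is made up:

  /* Rough user-space mock of the proposed splitting loop - not kernel code. */
  #include <stdbool.h>
  #include <stdio.h>

  #define SZ_4K   0x1000UL
  #define SZ_2M   0x200000UL
  #define SZ_1G   0x40000000UL

  struct map_range {
          unsigned long start, end;
          unsigned long size;             /* page size used for this range */
  };

  static unsigned long rdown(unsigned long x, unsigned long sz) { return x & ~(sz - 1); }
  static unsigned long rup(unsigned long x, unsigned long sz)   { return (x + sz - 1) & ~(sz - 1); }

  /* Mirrors the alignment part of mr_try_map(); mr_expand() is left out. */
  static bool try_map(struct map_range *mr, unsigned long size, unsigned long allowed)
  {
          unsigned long len;

          /* Check whether the map size is allowed. 4k always is. */
          if (size != SZ_4K && !(allowed & size))
                  return false;

          if (mr->start & (size - 1)) {
                  /* Limit the range to the next boundary of this size */
                  if (rup(mr->start, size) < mr->end)
                          mr->end = rup(mr->start, size);
                  return false;
          }

          if (mr->end & (size - 1)) {
                  /* Fit as many pages of this size as possible */
                  len = rdown(mr->end - mr->start, size);
                  if (!len)
                          return false;
                  mr->end = mr->start + len;
          }

          mr->size = size;
          return true;
  }

  int main(void)
  {
          static const unsigned long sizes[] = { SZ_1G, SZ_2M, SZ_4K };
          unsigned long addr = 0x1fff000UL, end = 0x80200000UL;   /* page aligned input */
          unsigned long allowed = SZ_1G | SZ_2M;                  /* pretend PSE + gbpages */
          struct map_range mr;
          unsigned int i;

          while (addr < end) {
                  mr.start = addr;
                  mr.end = end;
                  mr.size = 0;
                  /* Try map sizes top down, 4k always succeeds for aligned input */
                  for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
                          if (try_map(&mr, sizes[i], allowed))
                                  break;
                  }
                  printf(" [mem %#010lx-%#010lx] page size %#lx\n",
                         mr.start, mr.end - 1, mr.size);
                  addr = mr.end;
          }
          return 0;
  }

For the example range above I would expect a 4k head up to 0x2000000, 2M
mappings up to the 1G boundary, one 1G mapping and a 2M tail, and that is
what the sketch computes. Does that sound like a reasonable cross-check,
or would you suggest something else?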
>Thanks,
>
>        tglx
>
>8<---------------
> arch/x86/mm/init.c |  259 ++++++++++++++++++++---------------------------------
> 1 file changed, 102 insertions(+), 157 deletions(-)
>
>--- a/arch/x86/mm/init.c
>+++ b/arch/x86/mm/init.c
>@@ -157,17 +157,28 @@ void __init early_alloc_pgt_buf(void)
> 	pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT);
> }
> 
>-int after_bootmem;
>+int after_bootmem __ro_after_init;
> 
> early_param_on_off("gbpages", "nogbpages", direct_gbpages,
> 		   CONFIG_X86_DIRECT_GBPAGES);
> 
> struct map_range {
>-	unsigned long start;
>-	unsigned long end;
>-	unsigned page_size_mask;
>+	unsigned long	start;
>+	unsigned long	end;
>+	unsigned int	page_size_mask;
> };
> 
>-static int page_size_mask;
>+#ifdef CONFIG_X86_32
>+#define NR_RANGE_MR 3
>+#else
>+#define NR_RANGE_MR 5
>+#endif
>+
>+struct mapinfo {
>+	unsigned int	mask;
>+	unsigned int	size;
>+};
>+
>+static unsigned int page_size_mask __ro_after_init;
> 
> static void __init probe_page_size_mask(void)
> {
>@@ -177,7 +188,7 @@ static void __init probe_page_size_mask(
> 	 * large pages into small in interrupt context, etc.
> 	 */
> 	if (boot_cpu_has(X86_FEATURE_PSE) && !debug_pagealloc_enabled())
>-		page_size_mask |= 1 << PG_LEVEL_2M;
>+		page_size_mask |= 1U << PG_LEVEL_2M;
> 	else
> 		direct_gbpages = 0;
> 
>@@ -201,7 +212,7 @@ static void __init probe_page_size_mask(
> 	/* Enable 1 GB linear kernel mappings if available: */
> 	if (direct_gbpages && boot_cpu_has(X86_FEATURE_GBPAGES)) {
> 		printk(KERN_INFO "Using GB pages for direct mapping\n");
>-		page_size_mask |= 1 << PG_LEVEL_1G;
>+		page_size_mask |= 1U << PG_LEVEL_1G;
> 	} else {
> 		direct_gbpages = 0;
> 	}
>@@ -249,185 +260,119 @@ static void setup_pcid(void)
> 	}
> }
> 
>-#ifdef CONFIG_X86_32
>-#define NR_RANGE_MR 3
>-#else /* CONFIG_X86_64 */
>-#define NR_RANGE_MR 5
>+static void __meminit mr_print(struct map_range *mr, unsigned int maxidx)
>+{
>+#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_PAE)
>+	static const char *sz[2] = { "4k", "4M" };
>+#else
>+	static const char *sz[4] = { "4k", "2M", "1G", "" };
> #endif
>+	unsigned int idx, s;
> 
>-static int __meminit save_mr(struct map_range *mr, int nr_range,
>-			     unsigned long start_pfn, unsigned long end_pfn,
>-			     unsigned long page_size_mask)
>-{
>-	if (start_pfn < end_pfn) {
>-		if (nr_range >= NR_RANGE_MR)
>-			panic("run out of range for init_memory_mapping\n");
>-		mr[nr_range].start = start_pfn<<PAGE_SHIFT;
>-		mr[nr_range].end   = end_pfn<<PAGE_SHIFT;
>-		mr[nr_range].page_size_mask = page_size_mask;
>-		nr_range++;
>+	for (idx = 0; idx < maxidx; idx++, mr++) {
>+		s = (mr->page_size_mask >> PG_LEVEL_2M) & (ARRAY_SIZE(sz) -1);
>+		pr_debug(" [mem %#010lx-%#010lx] page size %s\n",
>+			 mr->start, mr->end - 1, sz[s]);
> 	}
>-
>-	return nr_range;
> }
> 
> /*
>- * adjust the page_size_mask for small range to go with
>- *	big page size instead small one if nearby are ram too.
>+ * Try to preserve large mappings during bootmem by expanding the current
>+ * range to large page mapping of @size and verifying that the result is
>+ * within a memory region.
>  */
>-static void __ref adjust_range_page_size_mask(struct map_range *mr,
>-					      int nr_range)
>+static void __meminit mr_expand(struct map_range *mr, unsigned int size)
> {
>-	int i;
>-
>-	for (i = 0; i < nr_range; i++) {
>-		if ((page_size_mask & (1<<PG_LEVEL_2M)) &&
>-		    !(mr[i].page_size_mask & (1<<PG_LEVEL_2M))) {
>-			unsigned long start = round_down(mr[i].start, PMD_SIZE);
>-			unsigned long end = round_up(mr[i].end, PMD_SIZE);
>+	unsigned long start = round_down(mr->start, size);
>+	unsigned long end = round_up(mr->end, size);
> 
>-#ifdef CONFIG_X86_32
>-			if ((end >> PAGE_SHIFT) > max_low_pfn)
>-				continue;
>-#endif
>+	if (IS_ENABLED(CONFIG_X86_32) && (end >> PAGE_SHIFT) > max_low_pfn)
>+		return;
> 
>-			if (memblock_is_region_memory(start, end - start))
>-				mr[i].page_size_mask |= 1<<PG_LEVEL_2M;
>-		}
>-		if ((page_size_mask & (1<<PG_LEVEL_1G)) &&
>-		    !(mr[i].page_size_mask & (1<<PG_LEVEL_1G))) {
>-			unsigned long start = round_down(mr[i].start, PUD_SIZE);
>-			unsigned long end = round_up(mr[i].end, PUD_SIZE);
>-
>-			if (memblock_is_region_memory(start, end - start))
>-				mr[i].page_size_mask |= 1<<PG_LEVEL_1G;
>-		}
>+	if (memblock_is_region_memory(start, end - start)) {
>+		mr->start = start;
>+		mr->end = end;
> 	}
> }
> 
>-static const char *page_size_string(struct map_range *mr)
>+static bool __meminit mr_try_map(struct map_range *mr, const struct mapinfo *mi)
> {
>-	static const char str_1g[] = "1G";
>-	static const char str_2m[] = "2M";
>-	static const char str_4m[] = "4M";
>-	static const char str_4k[] = "4k";
>+	unsigned long len;
> 
>-	if (mr->page_size_mask & (1<<PG_LEVEL_1G))
>-		return str_1g;
>-	/*
>-	 * 32-bit without PAE has a 4M large page size.
>-	 * PG_LEVEL_2M is misnamed, but we can at least
>-	 * print out the right size in the string.
>-	 */
>-	if (IS_ENABLED(CONFIG_X86_32) &&
>-	    !IS_ENABLED(CONFIG_X86_PAE) &&
>-	    mr->page_size_mask & (1<<PG_LEVEL_2M))
>-		return str_4m;
>+	/* Check whether the map size is supported. PAGE_SIZE always is. */
>+	if (mi->mask && !(mr->page_size_mask & mi->mask))
>+		return false;
> 
>-	if (mr->page_size_mask & (1<<PG_LEVEL_2M))
>-		return str_2m;
>+	if (!after_bootmem)
>+		mr_expand(mr, mi->size);
> 
>-	return str_4k;
>-}
>+	if (!IS_ALIGNED(mr->start, mi->size)) {
>+		/* Limit the range to the next boundary of this size. */
>+		mr->end = min_t(unsigned long, mr->end,
>+				round_up(mr->start, mi->size));
>+		return false;
>+	}
> 
>-static int __meminit split_mem_range(struct map_range *mr, int nr_range,
>-				     unsigned long start,
>-				     unsigned long end)
>-{
>-	unsigned long start_pfn, end_pfn, limit_pfn;
>-	unsigned long pfn;
>-	int i;
>+	if (!IS_ALIGNED(mr->end, mi->size)) {
>+		/* Try to fit as much as possible */
>+		len = round_down(mr->end - mr->start, mi->size);
>+		if (!len)
>+			return false;
>+		mr->end = mr->start + len;
>+	}
> 
>-	limit_pfn = PFN_DOWN(end);
>+	/* Store the effective page size mask */
>+	mr->page_size_mask = mi->mask;
>+	return true;
>+}
> 
>-	/* head if not big page alignment ? */
>-	pfn = start_pfn = PFN_DOWN(start);
>-#ifdef CONFIG_X86_32
>+static void __meminit mr_setup(struct map_range *mr, unsigned long start,
>+			       unsigned long end)
>+{
> 	/*
>-	 * Don't use a large page for the first 2/4MB of memory
>-	 * because there are often fixed size MTRRs in there
>-	 * and overlapping MTRRs into large pages can cause
>-	 * slowdowns.
>+	 * On 32bit the first 2/4MB are often covered by fixed size MTRRs.
>+	 * Overlapping MTRRs on large pages can cause slowdowns. Force 4k
>+	 * mappings.
> 	 */
>-	if (pfn == 0)
>-		end_pfn = PFN_DOWN(PMD_SIZE);
>-	else
>-		end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
>-#else /* CONFIG_X86_64 */
>-	end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
>-#endif
>-	if (end_pfn > limit_pfn)
>-		end_pfn = limit_pfn;
>-	if (start_pfn < end_pfn) {
>-		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
>-		pfn = end_pfn;
>-	}
>-
>-	/* big page (2M) range */
>-	start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
>-#ifdef CONFIG_X86_32
>-	end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
>-#else /* CONFIG_X86_64 */
>-	end_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE));
>-	if (end_pfn > round_down(limit_pfn, PFN_DOWN(PMD_SIZE)))
>-		end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
>-#endif
>-
>-	if (start_pfn < end_pfn) {
>-		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
>-				   page_size_mask & (1<<PG_LEVEL_2M));
>-		pfn = end_pfn;
>+	if (IS_ENABLED(CONFIG_X86_32) && start < PMD_SIZE) {
>+		mr->page_size_mask = 0;
>+		mr->end = min_t(unsigned long, end, PMD_SIZE);
>+	} else {
>+		/* Set the possible mapping sizes and allow full range. */
>+		mr->page_size_mask = page_size_mask;
>+		mr->end = end;
> 	}
>+	mr->start = start;
>+}
> 
>+static int __meminit split_mem_range(struct map_range *mr, unsigned long start,
>+				     unsigned long end)
>+{
>+	static const struct mapinfo mapinfos[] = {
> #ifdef CONFIG_X86_64
>-	/* big page (1G) range */
>-	start_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE));
>-	end_pfn = round_down(limit_pfn, PFN_DOWN(PUD_SIZE));
>-	if (start_pfn < end_pfn) {
>-		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
>-				   page_size_mask &
>-				   ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
>-		pfn = end_pfn;
>-	}
>-
>-	/* tail is not big page (1G) alignment */
>-	start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
>-	end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
>-	if (start_pfn < end_pfn) {
>-		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
>-				   page_size_mask & (1<<PG_LEVEL_2M));
>-		pfn = end_pfn;
>-	}
>+	{ .mask = 1U << PG_LEVEL_1G, .size = PUD_SIZE },
> #endif
>+	{ .mask = 1U << PG_LEVEL_2M, .size = PMD_SIZE },
>+	{ .mask = 0, .size = PAGE_SIZE },
>+	};
>+	const struct mapinfo *mi;
>+	struct map_range *curmr;
>+	unsigned long addr;
>+	int idx;
>+
>+	for (idx = 0, addr = start, curmr = mr; addr < end; idx++, curmr++) {
>+		BUG_ON(idx == NR_RANGE_MR);
> 
>-	/* tail is not big page (2M) alignment */
>-	start_pfn = pfn;
>-	end_pfn = limit_pfn;
>-	nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
>+		mr_setup(curmr, addr, end);
> 
>-	if (!after_bootmem)
>-		adjust_range_page_size_mask(mr, nr_range);
>+		/* Try map sizes top down. PAGE_SIZE will always succeed. */
>+		for (mi = mapinfos; !mr_try_map(curmr, mi); mi++);
> 
>-	/* try to merge same page size and continuous */
>-	for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
>-		unsigned long old_start;
>-		if (mr[i].end != mr[i+1].start ||
>-		    mr[i].page_size_mask != mr[i+1].page_size_mask)
>-			continue;
>-		/* move it */
>-		old_start = mr[i].start;
>-		memmove(&mr[i], &mr[i+1],
>-			(nr_range - 1 - i) * sizeof(struct map_range));
>-		mr[i--].start = old_start;
>-		nr_range--;
>+		/* Get the start address for the next range */
>+		addr = curmr->end;
> 	}
>-
>-	for (i = 0; i < nr_range; i++)
>-		pr_debug(" [mem %#010lx-%#010lx] page %s\n",
>-			 mr[i].start, mr[i].end - 1,
>-			 page_size_string(&mr[i]));
>-
>-	return nr_range;
>+	mr_print(mr, idx);
>+	return idx;
> }
> 
> struct range pfn_mapped[E820_MAX_ENTRIES];
>@@ -474,7 +419,7 @@ unsigned long __ref init_memory_mapping(
> 		       start, end - 1);
> 
> 	memset(mr, 0, sizeof(mr));
>-	nr_range = split_mem_range(mr, 0, start, end);
>+	nr_range = split_mem_range(mr, start, end);
> 
> 	for (i = 0; i < nr_range; i++)
> 		ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,

-- 
Wei Yang
Help you, Help me