On 03/07/2013 05:50 AM, Cliff Wickman wrote:
From: Cliff Wickman <c...@sgi.com>

Allocating a large number of 1GB hugetlbfs pages at boot takes a
very long time.

Large system sites would at times like to allocate a very large amount of
memory as 1GB pages.  They would put this on the kernel boot line:
    default_hugepagesz=1G hugepagesz=1G hugepages=4096
[Dynamic allocation of 1G pages is not an option, as zone pages only go
  up to MAX_ORDER, and MAX_ORDER cannot exceed the section size.]

Each page is zeroed as it is allocated, and all allocation is done by
cpu 0, as this path is early in boot:

How did you confirm that they are all done by cpu 0? Is cpu 0 the only cpu running at this point during boot?

       start_kernel
         kernel_init
           do_pre_smp_initcalls
             hugetlb_init
               hugetlb_init_hstates
                 hugetlb_hstate_alloc_pages

Zeroing remote (offnode) memory takes ~1GB/sec (and most memory is offnode
on large numa systems).
This estimate is approximate (it depends on core frequency & number of hops
to remote memory) but should be within a factor of 2 on most systems.
A benchmark attempting to reserve a TB for 1GB pages would thus require
~1000 seconds of boot time just for this allocation.  32TB would take
roughly 9 hours.

I propose passing a flag to the early allocator to indicate that no zeroing
of a page should be done.  The 'no zeroing' flag would have to be passed
down this code path:

   hugetlb_hstate_alloc_pages
     alloc_bootmem_huge_page
       __alloc_bootmem_node_nopanic NO_ZERO  (nobootmem.c)
         __alloc_memory_core_early  NO_ZERO
          if (!(flags & NO_ZERO))
             memset(ptr, 0, size);

Or this path if CONFIG_NO_BOOTMEM is not set:

   hugetlb_hstate_alloc_pages
     alloc_bootmem_huge_page
       __alloc_bootmem_node_nopanic  NO_ZERO  (bootmem.c)
         alloc_bootmem_core          NO_ZERO
          if (!(flags & NO_ZERO))
             memset(region, 0, size);
         __alloc_bootmem_nopanic     NO_ZERO
           ___alloc_bootmem_nopanic  NO_ZERO
             alloc_bootmem_core      NO_ZERO
              if (!(flags & NO_ZERO))
                 memset(region, 0, size);

Signed-off-by: Cliff Wickman <c...@sgi.com>

---
  arch/x86/kernel/setup_percpu.c |    4 ++--
  include/linux/bootmem.h        |   23 ++++++++++++++++-------
  mm/bootmem.c                   |   12 +++++++-----
  mm/hugetlb.c                   |    3 ++-
  mm/nobootmem.c                 |   41 +++++++++++++++++++++++------------------
  mm/page_cgroup.c               |    2 +-
  mm/sparse.c                    |    2 +-
  7 files changed, 52 insertions(+), 35 deletions(-)

Index: linux/include/linux/bootmem.h
===================================================================
--- linux.orig/include/linux/bootmem.h
+++ linux/include/linux/bootmem.h
@@ -8,6 +8,11 @@
  #include <asm/dma.h>
/*
+ * allocation flags
+ */
+#define NO_ZERO                0x00000001
+
+/*
   *  simple boot-time physical memory area allocator.
   */
@@ -79,7 +84,8 @@ extern void *__alloc_bootmem(unsigned lo
                             unsigned long goal);
  extern void *__alloc_bootmem_nopanic(unsigned long size,
                                     unsigned long align,
-                                    unsigned long goal);
+                                    unsigned long goal,
+                                    u32 flags);
  extern void *__alloc_bootmem_node(pg_data_t *pgdat,
                                  unsigned long size,
                                  unsigned long align,
@@ -91,12 +97,14 @@ void *__alloc_bootmem_node_high(pg_data_
  extern void *__alloc_bootmem_node_nopanic(pg_data_t *pgdat,
                                  unsigned long size,
                                  unsigned long align,
-                                 unsigned long goal);
+                                 unsigned long goal,
+                                 u32 flags);
  void *___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
                                  unsigned long size,
                                  unsigned long align,
                                  unsigned long goal,
-                                 unsigned long limit);
+                                 unsigned long limit,
+                                 u32 flags);
  extern void *__alloc_bootmem_low(unsigned long size,
                                 unsigned long align,
                                 unsigned long goal);
@@ -120,19 +128,20 @@ extern void *__alloc_bootmem_low_node(pg
  #define alloc_bootmem_align(x, align) \
        __alloc_bootmem(x, align, BOOTMEM_LOW_LIMIT)
  #define alloc_bootmem_nopanic(x) \
-       __alloc_bootmem_nopanic(x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT)
+       __alloc_bootmem_nopanic(x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT, 0)
  #define alloc_bootmem_pages(x) \
        __alloc_bootmem(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
  #define alloc_bootmem_pages_nopanic(x) \
-       __alloc_bootmem_nopanic(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
+       __alloc_bootmem_nopanic(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT, 0)
  #define alloc_bootmem_node(pgdat, x) \
        __alloc_bootmem_node(pgdat, x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT)
  #define alloc_bootmem_node_nopanic(pgdat, x) \
-       __alloc_bootmem_node_nopanic(pgdat, x, SMP_CACHE_BYTES, 
BOOTMEM_LOW_LIMIT)
+       __alloc_bootmem_node_nopanic(pgdat, x, SMP_CACHE_BYTES, \
+                                    BOOTMEM_LOW_LIMIT, 0)
  #define alloc_bootmem_pages_node(pgdat, x) \
        __alloc_bootmem_node(pgdat, x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
  #define alloc_bootmem_pages_node_nopanic(pgdat, x) \
-       __alloc_bootmem_node_nopanic(pgdat, x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
+       __alloc_bootmem_node_nopanic(pgdat, x, PAGE_SIZE, BOOTMEM_LOW_LIMIT, 0)
#define alloc_bootmem_low(x) \
        __alloc_bootmem_low(x, SMP_CACHE_BYTES, 0)
Index: linux/arch/x86/kernel/setup_percpu.c
===================================================================
--- linux.orig/arch/x86/kernel/setup_percpu.c
+++ linux/arch/x86/kernel/setup_percpu.c
@@ -104,14 +104,14 @@ static void * __init pcpu_alloc_bootmem(
        void *ptr;
if (!node_online(node) || !NODE_DATA(node)) {
-               ptr = __alloc_bootmem_nopanic(size, align, goal);
+               ptr = __alloc_bootmem_nopanic(size, align, goal, 0);
                pr_info("cpu %d has no node %d or node-local memory\n",
                        cpu, node);
                pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n",
                         cpu, size, __pa(ptr));
        } else {
                ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
-                                                  size, align, goal);
+                                                  size, align, goal, 0);
                pr_debug("per cpu data for cpu%d %lu bytes on node%d at 
%016lx\n",
                         cpu, size, node, __pa(ptr));
        }
Index: linux/mm/nobootmem.c
===================================================================
--- linux.orig/mm/nobootmem.c
+++ linux/mm/nobootmem.c
@@ -33,7 +33,7 @@ unsigned long min_low_pfn;
  unsigned long max_pfn;
static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
-                                       u64 goal, u64 limit)
+                                       u64 goal, u64 limit, u32 flags)
  {
        void *ptr;
        u64 addr;
@@ -46,7 +46,8 @@ static void * __init __alloc_memory_core
                return NULL;
ptr = phys_to_virt(addr);
-       memset(ptr, 0, size);
+       if (!(flags & NO_ZERO))
+               memset(ptr, 0, size);
        memblock_reserve(addr, size);
        /*
         * The min_count is set to 0 so that bootmem allocated blocks
@@ -208,7 +209,8 @@ void __init free_bootmem(unsigned long a
  static void * __init ___alloc_bootmem_nopanic(unsigned long size,
                                        unsigned long align,
                                        unsigned long goal,
-                                       unsigned long limit)
+                                       unsigned long limit,
+                                       u32 flags)
  {
        void *ptr;
@@ -217,7 +219,8 @@ static void * __init ___alloc_bootmem_no restart: - ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit);
+       ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal,
+                                       limit, 0);
if (ptr)
                return ptr;
@@ -244,17 +247,17 @@ restart:
   * Returns NULL on failure.
   */
  void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
-                                       unsigned long goal)
+                                       unsigned long goal, u32 flags)
  {
        unsigned long limit = -1UL;
- return ___alloc_bootmem_nopanic(size, align, goal, limit);
+       return ___alloc_bootmem_nopanic(size, align, goal, limit, flags);
  }
static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
-                                       unsigned long goal, unsigned long limit)
+                       unsigned long goal, unsigned long limit, u32 flags)
  {
-       void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit);
+       void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit, flags);
if (mem)
                return mem;
@@ -284,25 +287,26 @@ void * __init __alloc_bootmem(unsigned l
  {
        unsigned long limit = -1UL;
- return ___alloc_bootmem(size, align, goal, limit);
+       return ___alloc_bootmem(size, align, goal, limit, 0);
  }
void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
                                                   unsigned long size,
                                                   unsigned long align,
                                                   unsigned long goal,
-                                                  unsigned long limit)
+                                                  unsigned long limit,
+                                                  u32 flags)
  {
        void *ptr;
again:
        ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
-                                       goal, limit);
+                                       goal, limit, flags);
        if (ptr)
                return ptr;
ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align,
-                                       goal, limit);
+                                       goal, limit, flags);
        if (ptr)
                return ptr;
@@ -315,12 +319,13 @@ again:
  }
void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
-                                  unsigned long align, unsigned long goal)
+                       unsigned long align, unsigned long goal, u32 flags)
  {
        if (WARN_ON_ONCE(slab_is_available()))
                return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
- return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
+       return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal,
+                       0, flags);
  }
void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
@@ -329,7 +334,7 @@ void * __init ___alloc_bootmem_node(pg_d
  {
        void *ptr;
- ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, limit);
+       ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, limit, 0);
        if (ptr)
                return ptr;
@@ -354,7 +359,7 @@ void * __init ___alloc_bootmem_node(pg_d
   * The function panics if the request can not be satisfied.
   */
  void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
-                                  unsigned long align, unsigned long goal)
+                       unsigned long align, unsigned long goal)
  {
        if (WARN_ON_ONCE(slab_is_available()))
                return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
@@ -388,7 +393,7 @@ void * __init __alloc_bootmem_node_high(
  void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
                                  unsigned long goal)
  {
-       return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT);
+       return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT, 0);
  }
void * __init __alloc_bootmem_low_nopanic(unsigned long size,
@@ -396,7 +401,7 @@ void * __init __alloc_bootmem_low_nopani
                                          unsigned long goal)
  {
        return ___alloc_bootmem_nopanic(size, align, goal,
-                                       ARCH_LOW_ADDRESS_LIMIT);
+                                       ARCH_LOW_ADDRESS_LIMIT, 0);
  }
/**
Index: linux/mm/sparse.c
===================================================================
--- linux.orig/mm/sparse.c
+++ linux/mm/sparse.c
@@ -281,7 +281,7 @@ sparse_early_usemaps_alloc_pgdat_section
        nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
  again:
        p = ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size,
-                                         SMP_CACHE_BYTES, goal, limit);
+                                         SMP_CACHE_BYTES, goal, limit, 0);
        if (!p && limit) {
                limit = 0;
                goto again;
Index: linux/mm/hugetlb.c
===================================================================
--- linux.orig/mm/hugetlb.c
+++ linux/mm/hugetlb.c
@@ -1188,7 +1188,8 @@ int __weak alloc_bootmem_huge_page(struc
                addr = __alloc_bootmem_node_nopanic(
                                NODE_DATA(hstate_next_node_to_alloc(h,
                                                &node_states[N_MEMORY])),
-                               huge_page_size(h), huge_page_size(h), 0);
+                               huge_page_size(h), huge_page_size(h),
+                               0, NO_ZERO);
if (addr) {
                        /*
Index: linux/mm/bootmem.c
===================================================================
--- linux.orig/mm/bootmem.c
+++ linux/mm/bootmem.c
@@ -660,7 +660,7 @@ restart:
   * Returns NULL on failure.
   */
  void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
-                                       unsigned long goal)
+                                       unsigned long goal, u32 flags)
  {
        unsigned long limit = 0;
@@ -705,7 +705,8 @@ void * __init __alloc_bootmem(unsigned l void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
                                unsigned long size, unsigned long align,
-                               unsigned long goal, unsigned long limit)
+                               unsigned long goal, unsigned long limit,
+                               u32 flags)
  {
        void *ptr;
@@ -734,12 +735,13 @@ again:
  }
void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
-                                  unsigned long align, unsigned long goal)
+                       unsigned long align, unsigned long goal, u32 flags)
  {
        if (WARN_ON_ONCE(slab_is_available()))
                return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
- return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
+       return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal,
+                                            0, flags);
  }
void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
@@ -748,7 +750,7 @@ void * __init ___alloc_bootmem_node(pg_d
  {
        void *ptr;
- ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
+       ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0, 0);
        if (ptr)
                return ptr;
Index: linux/mm/page_cgroup.c
===================================================================
--- linux.orig/mm/page_cgroup.c
+++ linux/mm/page_cgroup.c
@@ -55,7 +55,7 @@ static int __init alloc_node_page_cgroup
        table_size = sizeof(struct page_cgroup) * nr_pages;
base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
-                       table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
+                       table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), 0);
        if (!base)
                return -ENOMEM;
        NODE_DATA(nid)->node_page_cgroup = base;

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majord...@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:d...@kvack.org"> em...@kvack.org </a>

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to