Author: jhb
Date: Tue Jul 27 20:33:50 2010
New Revision: 210550
URL: http://svn.freebsd.org/changeset/base/210550

Log:
  Very rough first cut at NUMA support for the physical page allocator.  For
  now it uses a very dumb first-touch allocation policy.  This will change in
  the future.
  - Each architecture indicates the maximum number of supported memory domains
    via a new VM_NDOMAIN parameter in <machine/vmparam.h>.
  - Each cpu now has a PCPU_GET(domain) member to indicate the memory domain
    a CPU belongs to.  Domain values are dense and numbered from 0.
  - When a platform supports multiple domains, the default freelist
    (VM_FREELIST_DEFAULT) is split up into N freelists, one for each domain.
    The MD code is required to populate an array of mem_affinity structures.
    Each entry in the array defines a range of memory (start and end) and a
    domain for the range.  Multiple entries may be present for a single
    domain.  The list is terminated by an entry where all fields are zero.
    This array of structures is used to split up phys_avail[] regions that
    fall in VM_FREELIST_DEFAULT into per-domain freelists.
  - Each memory domain has a separate lookup-array of freelists that is
    used when fulfilling a physical memory allocation.  Right now the
    per-domain freelists are listed in a round-robin order for each domain.
    In the future a table such as the ACPI SLIT table may be used to order
    the per-domain lookup lists based on the penalty for each memory domain
    relative to a specific domain.  The lookup lists may be examined via a
    new vm.phys_lookup_lists sysctl.
  - The first-touch policy is implemented by using PCPU_GET(domain) to
    pick a lookup list when allocating memory.
  
  Reviewed by:  alc

Modified:
  head/sys/amd64/include/vmparam.h
  head/sys/arm/include/vmparam.h
  head/sys/i386/include/vmparam.h
  head/sys/ia64/include/vmparam.h
  head/sys/mips/include/vmparam.h
  head/sys/powerpc/include/vmparam.h
  head/sys/sparc64/include/vmparam.h
  head/sys/sun4v/include/vmparam.h
  head/sys/sys/pcpu.h
  head/sys/vm/vm_phys.c
  head/sys/vm/vm_phys.h

Modified: head/sys/amd64/include/vmparam.h
==============================================================================
--- head/sys/amd64/include/vmparam.h    Tue Jul 27 19:31:10 2010        (r210549)
+++ head/sys/amd64/include/vmparam.h    Tue Jul 27 20:33:50 2010        (r210550)
@@ -132,6 +132,13 @@
 #define        VM_NFREEORDER           13
 
 /*
+ * Only one memory domain.
+ */
+#ifndef VM_NDOMAIN
+#define        VM_NDOMAIN              1
+#endif
+
+/*
  * Enable superpage reservations: 1 level.
  */
 #ifndef        VM_NRESERVLEVEL

Modified: head/sys/arm/include/vmparam.h
==============================================================================
--- head/sys/arm/include/vmparam.h      Tue Jul 27 19:31:10 2010        (r210549)
+++ head/sys/arm/include/vmparam.h      Tue Jul 27 20:33:50 2010        (r210550)
@@ -86,6 +86,13 @@
 #define        VM_NFREEORDER           9
 
 /*
+ * Only one memory domain.
+ */
+#ifndef VM_NDOMAIN
+#define        VM_NDOMAIN              1
+#endif
+
+/*
  * Disable superpage reservations.
  */
 #ifndef        VM_NRESERVLEVEL

Modified: head/sys/i386/include/vmparam.h
==============================================================================
--- head/sys/i386/include/vmparam.h     Tue Jul 27 19:31:10 2010        (r210549)
+++ head/sys/i386/include/vmparam.h     Tue Jul 27 20:33:50 2010        (r210550)
@@ -119,6 +119,13 @@
 #endif
 
 /*
+ * Only one memory domain.
+ */
+#ifndef VM_NDOMAIN
+#define        VM_NDOMAIN              1
+#endif
+
+/*
  * Enable superpage reservations: 1 level.
  */
 #ifndef        VM_NRESERVLEVEL

Modified: head/sys/ia64/include/vmparam.h
==============================================================================
--- head/sys/ia64/include/vmparam.h     Tue Jul 27 19:31:10 2010        (r210549)
+++ head/sys/ia64/include/vmparam.h     Tue Jul 27 20:33:50 2010        (r210550)
@@ -120,6 +120,13 @@
 #define        VM_NFREEORDER           16
 
 /*
+ * Only one memory domain.
+ */
+#ifndef VM_NDOMAIN
+#define        VM_NDOMAIN              1
+#endif
+
+/*
  * Disable superpage reservations.
  */
 #ifndef        VM_NRESERVLEVEL

Modified: head/sys/mips/include/vmparam.h
==============================================================================
--- head/sys/mips/include/vmparam.h     Tue Jul 27 19:31:10 2010        (r210549)
+++ head/sys/mips/include/vmparam.h     Tue Jul 27 20:33:50 2010        (r210550)
@@ -118,6 +118,13 @@
 #endif
 
 /*
+ * Only one memory domain.
+ */
+#ifndef VM_NDOMAIN
+#define        VM_NDOMAIN              1
+#endif
+
+/*
  * Disable superpage reservations. (not sure if this is right
  * I copied it from ARM)
  */

Modified: head/sys/powerpc/include/vmparam.h
==============================================================================
--- head/sys/powerpc/include/vmparam.h  Tue Jul 27 19:31:10 2010        (r210549)
+++ head/sys/powerpc/include/vmparam.h  Tue Jul 27 20:33:50 2010        (r210550)
@@ -167,6 +167,13 @@ struct pmap_physseg {
 #define        VM_NFREEORDER           11
 
 /*
+ * Only one memory domain.
+ */
+#ifndef VM_NDOMAIN
+#define        VM_NDOMAIN              1
+#endif
+
+/*
  * Disable superpage reservations.
  */
 #ifndef        VM_NRESERVLEVEL

Modified: head/sys/sparc64/include/vmparam.h
==============================================================================
--- head/sys/sparc64/include/vmparam.h  Tue Jul 27 19:31:10 2010        (r210549)
+++ head/sys/sparc64/include/vmparam.h  Tue Jul 27 20:33:50 2010        (r210550)
@@ -121,6 +121,13 @@
 #define        VM_NFREEORDER           12
 
 /*
+ * Only one memory domain.
+ */
+#ifndef VM_NDOMAIN
+#define        VM_NDOMAIN              1
+#endif
+
+/*
  * Disable superpage reservations.
  */
 #ifndef        VM_NRESERVLEVEL

Modified: head/sys/sun4v/include/vmparam.h
==============================================================================
--- head/sys/sun4v/include/vmparam.h    Tue Jul 27 19:31:10 2010        (r210549)
+++ head/sys/sun4v/include/vmparam.h    Tue Jul 27 20:33:50 2010        (r210550)
@@ -121,6 +121,13 @@
 #define        VM_NFREEORDER           12
 
 /*
+ * Only one memory domain.
+ */
+#ifndef VM_NDOMAIN
+#define        VM_NDOMAIN              1
+#endif
+
+/*
  * Disable superpage reservations.
  */
 #ifndef        VM_NRESERVLEVEL

Modified: head/sys/sys/pcpu.h
==============================================================================
--- head/sys/sys/pcpu.h Tue Jul 27 19:31:10 2010        (r210549)
+++ head/sys/sys/pcpu.h Tue Jul 27 20:33:50 2010        (r210550)
@@ -179,6 +179,7 @@ struct pcpu {
        struct device   *pc_device;
        void            *pc_netisr;             /* netisr SWI cookie */
        int             pc_dnweight;            /* vm_page_dontneed() */
+       int             pc_domain;              /* Memory domain. */
 
        /*
         * Stuff for read mostly lock

Modified: head/sys/vm/vm_phys.c
==============================================================================
--- head/sys/vm/vm_phys.c       Tue Jul 27 19:31:10 2010        (r210549)
+++ head/sys/vm/vm_phys.c       Tue Jul 27 20:33:50 2010        (r210550)
@@ -56,6 +56,13 @@ __FBSDID("$FreeBSD$");
 #include <vm/vm_phys.h>
 #include <vm/vm_reserv.h>
 
+/*
+ * VM_FREELIST_DEFAULT is split into VM_NDOMAIN lists, one for each
+ * domain.  These extra lists are stored at the end of the regular
+ * free lists starting with VM_NFREELIST.
+ */
+#define VM_RAW_NFREELIST       (VM_NFREELIST + VM_NDOMAIN - 1)
+
 struct vm_freelist {
        struct pglist pl;
        int lcnt;
@@ -65,15 +72,20 @@ struct vm_phys_seg {
        vm_paddr_t      start;
        vm_paddr_t      end;
        vm_page_t       first_page;
+       int             domain;
        struct vm_freelist (*free_queues)[VM_NFREEPOOL][VM_NFREEORDER];
 };
 
+struct mem_affinity *mem_affinity;
+
 static struct vm_phys_seg vm_phys_segs[VM_PHYSSEG_MAX];
 
 static int vm_phys_nsegs;
 
 static struct vm_freelist
-    vm_phys_free_queues[VM_NFREELIST][VM_NFREEPOOL][VM_NFREEORDER];
+    vm_phys_free_queues[VM_RAW_NFREELIST][VM_NFREEPOOL][VM_NFREEORDER];
+static struct vm_freelist
+(*vm_phys_lookup_lists[VM_NDOMAIN][VM_RAW_NFREELIST])[VM_NFREEPOOL][VM_NFREEORDER];
 
 static int vm_nfreelists = VM_FREELIST_DEFAULT + 1;
 
@@ -89,6 +101,14 @@ static int sysctl_vm_phys_segs(SYSCTL_HA
 SYSCTL_OID(_vm, OID_AUTO, phys_segs, CTLTYPE_STRING | CTLFLAG_RD,
     NULL, 0, sysctl_vm_phys_segs, "A", "Phys Seg Info");
 
+#if VM_NDOMAIN > 1
+static int sysctl_vm_phys_lookup_lists(SYSCTL_HANDLER_ARGS);
+SYSCTL_OID(_vm, OID_AUTO, phys_lookup_lists, CTLTYPE_STRING | CTLFLAG_RD,
+    NULL, 0, sysctl_vm_phys_lookup_lists, "A", "Phys Lookup Lists");
+#endif
+
+static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind,
+    int domain);
 static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind);
 static int vm_phys_paddr_to_segind(vm_paddr_t pa);
 static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl,
@@ -157,6 +177,7 @@ sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS)
                    (uintmax_t)seg->start);
                sbuf_printf(&sbuf, "end:       %#jx\n",
                    (uintmax_t)seg->end);
+               sbuf_printf(&sbuf, "domain:    %d\n", seg->domain);
                sbuf_printf(&sbuf, "free list: %p\n", seg->free_queues);
        }
        sbuf_finish(&sbuf);
@@ -166,11 +187,40 @@ sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS)
        return (error);
 }
 
+#if VM_NDOMAIN > 1
+/*
+ * Outputs the set of free list lookup lists.
+ */
+static int
+sysctl_vm_phys_lookup_lists(SYSCTL_HANDLER_ARGS)
+{
+       struct sbuf sbuf;
+       char *cbuf;
+       const int cbufsize = (vm_nfreelists + 1) * VM_NDOMAIN * 81;
+       int domain, error, flind, ndomains;
+
+       ndomains = vm_nfreelists - VM_NFREELIST + 1;
+       cbuf = malloc(cbufsize, M_TEMP, M_WAITOK | M_ZERO);
+       sbuf_new(&sbuf, cbuf, cbufsize, SBUF_FIXEDLEN);
+       for (domain = 0; domain < ndomains; domain++) {
+               sbuf_printf(&sbuf, "\nDOMAIN %d:\n\n", domain);
+               for (flind = 0; flind < vm_nfreelists; flind++)
+                       sbuf_printf(&sbuf, "  [%d]:\t%p\n", flind,
+                           vm_phys_lookup_lists[domain][flind]);
+       }
+       sbuf_finish(&sbuf);
+       error = SYSCTL_OUT(req, sbuf_data(&sbuf), sbuf_len(&sbuf));
+       sbuf_delete(&sbuf);
+       free(cbuf, M_TEMP);
+       return (error);
+}
+#endif
+       
 /*
  * Create a physical memory segment.
  */
 static void
-vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind)
+_vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind, int domain)
 {
        struct vm_phys_seg *seg;
 #ifdef VM_PHYSSEG_SPARSE
@@ -188,14 +238,51 @@ vm_phys_create_seg(vm_paddr_t start, vm_
        seg = &vm_phys_segs[vm_phys_nsegs++];
        seg->start = start;
        seg->end = end;
+       seg->domain = domain;
 #ifdef VM_PHYSSEG_SPARSE
        seg->first_page = &vm_page_array[pages];
 #else
        seg->first_page = PHYS_TO_VM_PAGE(start);
 #endif
+#if VM_NDOMAIN > 1
+       if (flind == VM_FREELIST_DEFAULT && domain != 0) {
+               flind = VM_NFREELIST + (domain - 1);
+               if (flind >= vm_nfreelists)
+                       vm_nfreelists = flind + 1;
+       }
+#endif
        seg->free_queues = &vm_phys_free_queues[flind];
 }
 
+static void
+vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind)
+{
+       int i;
+
+       if (mem_affinity == NULL) {
+               _vm_phys_create_seg(start, end, flind, 0);
+               return;
+       }
+
+       for (i = 0;; i++) {
+               if (mem_affinity[i].end == 0)
+                       panic("Reached end of affinity info");
+               if (mem_affinity[i].end <= start)
+                       continue;
+               if (mem_affinity[i].start > start)
+                       panic("No affinity info for start %jx",
+                           (uintmax_t)start);
+               if (mem_affinity[i].end >= end) {
+                       _vm_phys_create_seg(start, end, flind,
+                           mem_affinity[i].domain);
+                       break;
+               }
+               _vm_phys_create_seg(start, mem_affinity[i].end, flind,
+                   mem_affinity[i].domain);
+               start = mem_affinity[i].end;
+       }
+}
+
 /*
  * Initialize the physical memory allocator.
  */
@@ -204,6 +291,9 @@ vm_phys_init(void)
 {
        struct vm_freelist *fl;
        int flind, i, oind, pind;
+#if VM_NDOMAIN > 1
+       int ndomains, j;
+#endif
 
        for (i = 0; phys_avail[i + 1] != 0; i += 2) {
 #ifdef VM_FREELIST_ISADMA
@@ -246,6 +336,37 @@ vm_phys_init(void)
                                TAILQ_INIT(&fl[oind].pl);
                }
        }
+#if VM_NDOMAIN > 1
+       /*
+        * Build a free list lookup list for each domain.  All of the
+        * memory domain lists are inserted at the VM_FREELIST_DEFAULT
+        * index in a round-robin order starting with the current
+        * domain.
+        */
+       ndomains = vm_nfreelists - VM_NFREELIST + 1;
+       for (flind = 0; flind < VM_FREELIST_DEFAULT; flind++)
+               for (i = 0; i < ndomains; i++)
+                       vm_phys_lookup_lists[i][flind] =
+                           &vm_phys_free_queues[flind];
+       for (i = 0; i < ndomains; i++)
+               for (j = 0; j < ndomains; j++) {
+                       flind = (i + j) % ndomains;
+                       if (flind == 0)
+                               flind = VM_FREELIST_DEFAULT;
+                       else
+                               flind += VM_NFREELIST - 1;
+                       vm_phys_lookup_lists[i][VM_FREELIST_DEFAULT + j] =
+                           &vm_phys_free_queues[flind];
+               }
+       for (flind = VM_FREELIST_DEFAULT + 1; flind < VM_NFREELIST;
+            flind++)
+               for (i = 0; i < ndomains; i++)
+                       vm_phys_lookup_lists[i][flind + ndomains - 1] =
+                           &vm_phys_free_queues[flind];
+#else
+       for (flind = 0; flind < vm_nfreelists; flind++)
+               vm_phys_lookup_lists[0][flind] = &vm_phys_free_queues[flind];
+#endif
 }
 
 /*
@@ -321,7 +442,7 @@ vm_phys_alloc_freelist_pages(int flind, 
 {      
        struct vm_freelist *fl;
        struct vm_freelist *alt;
-       int oind, pind;
+       int domain, oind, pind;
        vm_page_t m;
 
        KASSERT(flind < VM_NFREELIST,
@@ -330,8 +451,14 @@ vm_phys_alloc_freelist_pages(int flind, 
            ("vm_phys_alloc_freelist_pages: pool %d is out of range", pool));
        KASSERT(order < VM_NFREEORDER,
            ("vm_phys_alloc_freelist_pages: order %d is out of range", order));
+
+#if VM_NDOMAIN > 1
+       domain = PCPU_GET(domain);
+#else
+       domain = 0;
+#endif
        mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
-       fl = vm_phys_free_queues[flind][pool];
+       fl = (*vm_phys_lookup_lists[domain][flind])[pool];
        for (oind = order; oind < VM_NFREEORDER; oind++) {
                m = TAILQ_FIRST(&fl[oind].pl);
                if (m != NULL) {
@@ -351,7 +478,7 @@ vm_phys_alloc_freelist_pages(int flind, 
         */
        for (oind = VM_NFREEORDER - 1; oind >= order; oind--) {
                for (pind = 0; pind < VM_NFREEPOOL; pind++) {
-                       alt = vm_phys_free_queues[flind][pind];
+                       alt = (*vm_phys_lookup_lists[domain][flind])[pind];
                        m = TAILQ_FIRST(&alt[oind].pl);
                        if (m != NULL) {
                                TAILQ_REMOVE(&alt[oind].pl, m, pageq);
@@ -613,8 +740,13 @@ vm_phys_alloc_contig(unsigned long npage
        struct vnode *vp;
        vm_paddr_t pa, pa_last, size;
        vm_page_t deferred_vdrop_list, m, m_ret;
-       int flind, i, oind, order, pind;
+       int domain, flind, i, oind, order, pind;
 
+#if VM_NDOMAIN > 1
+       domain = PCPU_GET(domain);
+#else
+       domain = 0;
+#endif
        size = npages << PAGE_SHIFT;
        KASSERT(size != 0,
            ("vm_phys_alloc_contig: size must not be 0"));
@@ -632,7 +764,8 @@ retry:
        for (flind = 0; flind < vm_nfreelists; flind++) {
                for (oind = min(order, VM_NFREEORDER - 1); oind < VM_NFREEORDER; oind++) {
                        for (pind = 0; pind < VM_NFREEPOOL; pind++) {
-                               fl = vm_phys_free_queues[flind][pind];
+                               fl = (*vm_phys_lookup_lists[domain][flind])
+                                   [pind];
                                TAILQ_FOREACH(m_ret, &fl[oind].pl, pageq) {
                                        /*
                                         * A free list may contain physical pages

Modified: head/sys/vm/vm_phys.h
==============================================================================
--- head/sys/vm/vm_phys.h       Tue Jul 27 19:31:10 2010        (r210549)
+++ head/sys/vm/vm_phys.h       Tue Jul 27 20:33:50 2010        (r210550)
@@ -40,6 +40,15 @@
 
 #ifdef _KERNEL
 
+/* Domains must be dense (non-sparse) and zero-based. */
+struct mem_affinity {
+       vm_paddr_t start;
+       vm_paddr_t end;
+       int domain;
+};
+
+extern struct mem_affinity *mem_affinity;
+
 void vm_phys_add_page(vm_paddr_t pa);
 vm_page_t vm_phys_alloc_contig(unsigned long npages,
     vm_paddr_t low, vm_paddr_t high,
_______________________________________________
svn-src-head@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"

Reply via email to