* i386/intel/pmap.c: switch to dynamic allocation of all the levels of the
  user-space page table tree, using a separate kmem cache for each level.
  This makes it possible to extend the usable user address space on x86_64
  beyond a single L3 page table. The kernel address map is left untouched
  for now, as it needs a different initialization.
* i386/intel/pmap.h: remove the hardcoded number of user page directory
  pointers and add a macro to reconstruct the linear (virtual) address from
  the page table indices.
---
 i386/intel/pmap.c | 544 ++++++++++++++++++++++------------------------
 i386/intel/pmap.h |  21 +-
 2 files changed, 277 insertions(+), 288 deletions(-)
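As a side note for reviewers, here is a minimal user-space sketch of what the new pagenum2lin() macro computes: it inverts the lin2*num() index extraction, so a linear address can be rebuilt from per-level table indices. L4SHIFT and the 0x1ff index masks match the pmap.h hunk below; the PDPSHIFT/PDESHIFT/PTESHIFT values (30/21/12) and the lin2*num()-style helpers are simplified local stand-ins, not the kernel definitions.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define L4SHIFT   39      /* matches the pmap.h hunk */
#define PDPSHIFT  30      /* assumed */
#define PDESHIFT  21      /* assumed */
#define PTESHIFT  12      /* assumed */
#define IDXMASK   0x1ff   /* same value as L4MASK/PDPMASK in pmap.h */

/* Simplified stand-ins for the kernel's lin2*num() index macros. */
#define lin2l4num(a)  (((a) >> L4SHIFT)  & IDXMASK)
#define lin2pdpnum(a) (((a) >> PDPSHIFT) & IDXMASK)
#define lin2pdenum(a) (((a) >> PDESHIFT) & IDXMASK)
#define lin2ptenum(a) (((a) >> PTESHIFT) & IDXMASK)

/* Same shape as the x86_64 pagenum2lin() added to i386/intel/pmap.h. */
#define pagenum2lin(l4, l3, l2, l1)        \
	(((uint64_t)(l4) << L4SHIFT)  +    \
	 ((uint64_t)(l3) << PDPSHIFT) +    \
	 ((uint64_t)(l2) << PDESHIFT) +    \
	 ((uint64_t)(l1) << PTESHIFT))

int main(void)
{
	uint64_t va = 0x00007f1234561000ULL;	/* arbitrary page-aligned user address */
	uint64_t back = pagenum2lin(lin2l4num(va), lin2pdpnum(va),
				    lin2pdenum(va), lin2ptenum(va));

	assert(back == va);			/* the indices round-trip to the address */
	printf("%#llx -> l4=%llu l3=%llu l2=%llu\n",
	       (unsigned long long) va,
	       (unsigned long long) lin2l4num(va),
	       (unsigned long long) lin2pdpnum(va),
	       (unsigned long long) lin2pdenum(va));
	return 0;
}

In the patch, pmap_collect() uses exactly this reconstruction, pagenum2lin(l4i, l3i, l2i, 0), to recover the base linear address of the range mapped by a page-table page before calling pmap_remove_range().
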
diff --git a/i386/intel/pmap.c b/i386/intel/pmap.c index e867ed59..3a30271e 100644 --- a/i386/intel/pmap.c +++ b/i386/intel/pmap.c @@ -398,6 +398,7 @@ struct pmap kernel_pmap_store; pmap_t kernel_pmap; struct kmem_cache pmap_cache; /* cache of pmap structures */ +struct kmem_cache pt_cache; /* cache of page tables */ struct kmem_cache pd_cache; /* cache of page directories */ #if PAE struct kmem_cache pdpt_cache; /* cache of page directory pointer tables */ @@ -429,6 +430,14 @@ pt_entry_t *kernel_page_dir; */ static pmap_mapwindow_t mapwindows[PMAP_NMAPWINDOWS * NCPUS]; +#ifdef __x86_64__ +static inline pt_entry_t * +pmap_l4base(const pmap_t pmap, vm_offset_t lin_addr) +{ + return &pmap->l4base[lin2l4num(lin_addr)]; +} +#endif + #ifdef PAE static inline pt_entry_t * pmap_ptp(const pmap_t pmap, vm_offset_t lin_addr) @@ -443,7 +452,7 @@ pmap_ptp(const pmap_t pmap, vm_offset_t lin_addr) #else /* __x86_64__ */ pdp_table = pmap->pdpbase; #endif /* __x86_64__ */ - return pdp_table; + return &pdp_table[lin2pdpnum(lin_addr)]; } #endif @@ -456,7 +465,9 @@ pmap_pde(const pmap_t pmap, vm_offset_t addr) #if PAE pt_entry_t *pdp_table; pdp_table = pmap_ptp(pmap, addr); - pt_entry_t pde = pdp_table[lin2pdpnum(addr)]; + if (pdp_table == 0) + return(PT_ENTRY_NULL); + pt_entry_t pde = *pdp_table; if ((pde & INTEL_PTE_VALID) == 0) return PT_ENTRY_NULL; page_dir = (pt_entry_t *) ptetokv(pde); @@ -1092,15 +1103,18 @@ void pmap_init(void) */ s = (vm_size_t) sizeof(struct pmap); kmem_cache_init(&pmap_cache, "pmap", s, 0, NULL, 0); - kmem_cache_init(&pd_cache, "pd", + kmem_cache_init(&pt_cache, "pmap_L1", + INTEL_PGBYTES, INTEL_PGBYTES, NULL, + KMEM_CACHE_PHYSMEM); + kmem_cache_init(&pd_cache, "pmap_L2", INTEL_PGBYTES, INTEL_PGBYTES, NULL, KMEM_CACHE_PHYSMEM); #if PAE - kmem_cache_init(&pdpt_cache, "pdpt", + kmem_cache_init(&pdpt_cache, "pmap_L3", INTEL_PGBYTES, INTEL_PGBYTES, NULL, KMEM_CACHE_PHYSMEM); #ifdef __x86_64__ - kmem_cache_init(&l4_cache, "L4", + kmem_cache_init(&l4_cache, "pmap_L4", INTEL_PGBYTES, INTEL_PGBYTES, NULL, KMEM_CACHE_PHYSMEM); #endif /* __x86_64__ */ @@ -1244,6 +1258,11 @@ pmap_page_table_page_dealloc(vm_offset_t pa) vm_object_lock(pmap_object); m = vm_page_lookup(pmap_object, pa); vm_page_lock_queues(); +#ifdef MACH_PV_PAGETABLES + if (!hyp_mmuext_op_mfn (MMUEXT_UNPIN_TABLE, pa_to_mfn(pa))) + panic("couldn't unpin page %llx(%lx)\n", pa, (vm_offset_t) kv_to_ma(pa)); + pmap_set_page_readwrite((void*) phystokv(pa)); +#endif /* MACH_PV_PAGETABLES */ vm_page_free(m); inuse_ptepages_count--; vm_page_unlock_queues(); @@ -1265,7 +1284,7 @@ pmap_page_table_page_dealloc(vm_offset_t pa) pmap_t pmap_create(vm_size_t size) { #ifdef __x86_64__ - // needs to be reworked if we want to dynamically allocate PDPs + // needs to be reworked if we want to dynamically allocate PDPs for kernel const int PDPNUM = PDPNUM_KERNEL; #endif pt_entry_t *page_dir[PDPNUM]; @@ -1360,30 +1379,6 @@ pmap_t pmap_create(vm_size_t size) memset(p->l4base, 0, INTEL_PGBYTES); WRITE_PTE(&p->l4base[lin2l4num(VM_MIN_KERNEL_ADDRESS)], pa_to_pte(kvtophys((vm_offset_t) pdp_kernel)) | INTEL_PTE_VALID | INTEL_PTE_WRITE); -#if lin2l4num(VM_MIN_KERNEL_ADDRESS) != lin2l4num(VM_MAX_USER_ADDRESS) - // kernel vm and user vm are not in the same l4 entry, so add the user one - // TODO alloc only PDPTE for the user range VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS - // and keep the same for kernel range, in l4 table we have different entries - pt_entry_t *pdp_user = (pt_entry_t *) kmem_cache_alloc(&pdpt_cache); - if (pdp_user == NULL) { - panic("pmap 
create"); - } - memset(pdp_user, 0, INTEL_PGBYTES); - WRITE_PTE(&p->l4base[lin2l4num(VM_MIN_USER_ADDRESS)], - pa_to_pte(kvtophys((vm_offset_t) pdp_user)) | INTEL_PTE_VALID | INTEL_PTE_WRITE | INTEL_PTE_USER); -#endif /* lin2l4num(VM_MIN_KERNEL_ADDRESS) != lin2l4num(VM_MAX_USER_ADDRESS) */ - for (int i = 0; i < PDPNUM_USER; i++) { - pt_entry_t *user_page_dir = (pt_entry_t *) kmem_cache_alloc(&pd_cache); - memset(user_page_dir, 0, INTEL_PGBYTES); - WRITE_PTE(&pdp_user[i + lin2pdpnum(VM_MIN_USER_ADDRESS)], // pdp_user - pa_to_pte(kvtophys((vm_offset_t)user_page_dir)) - | INTEL_PTE_VALID -#if (defined(__x86_64__) && !defined(MACH_HYP)) || defined(MACH_PV_PAGETABLES) - | INTEL_PTE_WRITE | INTEL_PTE_USER -#endif - ); - } - #ifdef MACH_PV_PAGETABLES // FIXME: use kmem_cache_alloc instead if (kmem_alloc_wired(kernel_map, @@ -1443,15 +1438,7 @@ pmap_t pmap_create(vm_size_t size) void pmap_destroy(pmap_t p) { -#if PAE - int i; -#endif - boolean_t free_all; - pt_entry_t *page_dir; - pt_entry_t *pdep; - phys_addr_t pa; int c, s; - vm_page_t m; if (p == PMAP_NULL) return; @@ -1466,87 +1453,54 @@ void pmap_destroy(pmap_t p) return; /* still in use */ } + /* + * Free the page table tree. + */ #if PAE - for (i = 0; i < lin2pdpnum(VM_MAX_USER_ADDRESS); i++) { #ifdef __x86_64__ -#ifdef USER32 - /* In this case we know we have one PDP for user space */ - pt_entry_t *pdp = (pt_entry_t *) ptetokv(p->l4base[lin2l4num(VM_MIN_USER_ADDRESS)]); -#else -#warning "TODO do 64-bit userspace need more that 512G?" - pt_entry_t *pdp = (pt_entry_t *) ptetokv(p->l4base[lin2l4num(VM_MIN_USER_ADDRESS)]); -#endif /* USER32 */ - page_dir = (pt_entry_t *) ptetokv(pdp[i]); + for (int l4i = 0; l4i < lin2l4num(VM_MAX_USER_ADDRESS); l4i++) { + pt_entry_t pdp = (pt_entry_t) p->l4base[l4i]; + if (!(pdp & INTEL_PTE_VALID)) + continue; + pt_entry_t *pdpbase = (pt_entry_t*) ptetokv(pdp); + for (int l3i = 0; l3i < 512; l3i++) { #else /* __x86_64__ */ - page_dir = (pt_entry_t *) ptetokv(p->pdpbase[i]); + pt_entry_t *pdpbase = p->pdpbase; + for (int l3i = 0; l3i < lin2pdpnum(VM_MAX_USER_ADDRESS); l3i++) { #endif /* __x86_64__ */ - free_all = i < lin2pdpnum(LINEAR_MIN_KERNEL_ADDRESS); + pt_entry_t pde = (pt_entry_t) pdpbase[l3i]; + if (!(pde & INTEL_PTE_VALID)) + continue; + pt_entry_t *pdebase = (pt_entry_t*) ptetokv(pde); + for (int l2i = 0; l2i < 512; l2i++) { #else /* PAE */ - free_all = FALSE; - page_dir = p->dirbase; + pt_entry_t *pdebase = p->dirbase; + for (int l2i = 0; l2i < lin2pdenum(VM_MAX_USER_ADDRESS); l2i++) { #endif /* PAE */ - -#ifdef __x86_64__ -#warning FIXME 64bit need to free l3 -#endif - /* - * Free the memory maps, then the - * pmap structure. 
- */ - for (pdep = page_dir; - (free_all - || pdep < &page_dir[lin2pdenum(LINEAR_MIN_KERNEL_ADDRESS)]) - && pdep < &page_dir[NPTES]; - pdep += ptes_per_vm_page) { - if (*pdep & INTEL_PTE_VALID) { - pa = pte_to_pa(*pdep); - assert(pa == (vm_offset_t) pa); - vm_object_lock(pmap_object); - m = vm_page_lookup(pmap_object, pa); - if (m == VM_PAGE_NULL) - panic("pmap_destroy: pte page not in object"); - vm_page_lock_queues(); -#ifdef MACH_PV_PAGETABLES - if (!hyp_mmuext_op_mfn (MMUEXT_UNPIN_TABLE, pa_to_mfn(pa))) - panic("pmap_destroy: couldn't unpin page %llx(%lx)\n", pa, (vm_offset_t) kv_to_ma(pa)); - pmap_set_page_readwrite((void*) phystokv(pa)); -#endif /* MACH_PV_PAGETABLES */ - vm_page_free(m); - inuse_ptepages_count--; - vm_page_unlock_queues(); - vm_object_unlock(pmap_object); - } - } -#ifdef MACH_PV_PAGETABLES - pmap_set_page_readwrite((void*) page_dir); -#endif /* MACH_PV_PAGETABLES */ - kmem_cache_free(&pd_cache, (vm_offset_t) page_dir); + pt_entry_t pte = (pt_entry_t) pdebase[l2i]; + if (!(pte & INTEL_PTE_VALID)) + continue; + kmem_cache_free(&pt_cache, (vm_offset_t)ptetokv(pte)); + } #if PAE - } - -#ifdef MACH_PV_PAGETABLES + kmem_cache_free(&pd_cache, (vm_offset_t)pdebase); + } #ifdef __x86_64__ - pmap_set_page_readwrite(p->l4base); - pmap_set_page_readwrite(p->user_l4base); - pmap_set_page_readwrite(p->user_pdpbase); + kmem_cache_free(&pdpt_cache, (vm_offset_t)pdpbase); + } #endif /* __x86_64__ */ - pmap_set_page_readwrite(p->pdpbase); -#endif /* MACH_PV_PAGETABLES */ +#endif /* PAE */ + /* Finally, free the page table tree root and the pmap itself */ +#if PAE #ifdef __x86_64__ - kmem_cache_free(&pdpt_cache, (vm_offset_t) pmap_ptp(p, VM_MIN_USER_ADDRESS)); -#if lin2l4num(VM_MIN_KERNEL_ADDRESS) != lin2l4num(VM_MAX_USER_ADDRESS) - // TODO kernel vm and user vm are not in the same l4 entry -#endif kmem_cache_free(&l4_cache, (vm_offset_t) p->l4base); -#ifdef MACH_PV_PAGETABLES - kmem_free(kernel_map, (vm_offset_t)p->user_l4base, INTEL_PGBYTES); - kmem_free(kernel_map, (vm_offset_t)p->user_pdpbase, INTEL_PGBYTES); -#endif /* MACH_PV_PAGETABLES */ #else /* __x86_64__ */ - kmem_cache_free(&pdpt_cache, (vm_offset_t) p->pdpbase); + kmem_cache_free(&pdpt_cache, (vm_offset_t) p->pdpbase); #endif /* __x86_64__ */ -#endif /* PAE */ +#else /* PAE */ + kmem_cache_free(&pd_cache, (vm_offset_t) p->dirbase); +#endif /* PAE */ kmem_cache_free(&pmap_cache, (vm_offset_t) p); } @@ -1756,7 +1710,7 @@ void pmap_remove( l = (s + PDE_MAPPED_SIZE) & ~(PDE_MAPPED_SIZE-1); if (l > e) l = e; - if (*pde & INTEL_PTE_VALID) { + if (pde && (*pde & INTEL_PTE_VALID)) { spte = (pt_entry_t *)ptetokv(*pde); spte = &spte[ptenum(s)]; epte = &spte[intel_btop(l-s)]; @@ -2036,86 +1990,24 @@ void pmap_protect( SPLX(spl); } +typedef pt_entry_t* (*pmap_level_getter_t)(const pmap_t pmap, vm_offset_t addr); /* - * Insert the given physical page (p) at - * the specified virtual address (v) in the - * target physical map with the protection requested. - * - * If specified, the page will be wired down, meaning - * that the related pte can not be reclaimed. - * - * NB: This is the only routine which MAY NOT lazy-evaluate - * or lose information. That is, this routine must actually - * insert this page into the given map NOW. 
- */ -void pmap_enter( - pmap_t pmap, - vm_offset_t v, - phys_addr_t pa, - vm_prot_t prot, - boolean_t wired) +* Expand one single level of the page table tree +*/ +static inline pt_entry_t* pmap_expand_level(pmap_t pmap, vm_offset_t v, int spl, + pmap_level_getter_t pmap_level, + pmap_level_getter_t pmap_level_upper, + int n_per_vm_page, + struct kmem_cache *cache) { - boolean_t is_physmem; pt_entry_t *pte; - pv_entry_t pv_h; - unsigned long i, pai; - pv_entry_t pv_e; - pt_entry_t template; - int spl; - phys_addr_t old_pa; - - assert(pa != vm_page_fictitious_addr); - if (pmap_debug) printf("pmap(%zx, %llx)\n", v, (unsigned long long) pa); - if (pmap == PMAP_NULL) - return; - -#if !MACH_KDB - if (pmap == kernel_pmap && (v < kernel_virtual_start || v >= kernel_virtual_end)) - panic("pmap_enter(%zx, %llx) falls in physical memory area!\n", v, (unsigned long long) pa); -#endif -#if !(__i486__ || __i586__ || __i686__) - if (pmap == kernel_pmap && (prot & VM_PROT_WRITE) == 0 - && !wired /* hack for io_wire */ ) { - /* - * Because the 386 ignores write protection in kernel mode, - * we cannot enter a read-only kernel mapping, and must - * remove an existing mapping if changing it. - */ - PMAP_READ_LOCK(pmap, spl); - - pte = pmap_pte(pmap, v); - if (pte != PT_ENTRY_NULL && *pte != 0) { - /* - * Invalidate the translation buffer, - * then remove the mapping. - */ - pmap_remove_range(pmap, v, pte, - pte + ptes_per_vm_page); - PMAP_UPDATE_TLBS(pmap, v, v + PAGE_SIZE); - } - PMAP_READ_UNLOCK(pmap, spl); - return; - } -#endif - - /* - * Must allocate a new pvlist entry while we're unlocked; - * Allocating may cause pageout (which will lock the pmap system). - * If we determine we need a pvlist entry, we will unlock - * and allocate one. Then we will retry, throughing away - * the allocated entry later (if we no longer need it). - */ - pv_e = PV_ENTRY_NULL; -Retry: - PMAP_READ_LOCK(pmap, spl); /* * Expand pmap to include this pte. Assume that * pmap is always expanded to include enough hardware * pages to map one VM page. */ - - while ((pte = pmap_pte(pmap, v)) == PT_ENTRY_NULL) { + while ((pte = pmap_level(pmap, v)) == PT_ENTRY_NULL) { /* * Need to allocate a new page-table page. */ @@ -2136,7 +2028,9 @@ Retry: */ PMAP_READ_UNLOCK(pmap, spl); - ptp = phystokv(pmap_page_table_page_alloc()); + while (!(ptp = kmem_cache_alloc(cache))) + VM_PAGE_WAIT((void (*)()) 0); + memset((void *)ptp, 0, PAGE_SIZE); /* * Re-lock the pmap and check that another thread has @@ -2146,12 +2040,12 @@ Retry: */ PMAP_READ_LOCK(pmap, spl); - if (pmap_pte(pmap, v) != PT_ENTRY_NULL) { + if (pmap_level(pmap, v) != PT_ENTRY_NULL) { /* * Oops... */ PMAP_READ_UNLOCK(pmap, spl); - pmap_page_table_page_dealloc(kvtophys(ptp)); + kmem_cache_free(cache, ptp); PMAP_READ_LOCK(pmap, spl); continue; } @@ -2159,8 +2053,8 @@ Retry: /* * Enter the new page table page in the page directory. */ - i = ptes_per_vm_page; - pdp = pmap_pde(pmap, v); + i = n_per_vm_page; + pdp = pmap_level_upper(pmap, v); do { #ifdef MACH_PV_PAGETABLES pmap_set_page_readonly((void *) ptp); @@ -2185,6 +2079,100 @@ Retry: */ continue; } + return pte; +} + +/* + * Expand, if required, the PMAP to include the virtual address V. + * PMAP needs to be locked, and it will be still locked on return. It + * can temporarily unlock the PMAP, during allocation or deallocation + * of physical pages. 
+ */ +static inline pt_entry_t* pmap_expand(pmap_t pmap, vm_offset_t v, int spl) +{ +#ifdef PAE +#ifdef __x86_64__ + pmap_expand_level(pmap, v, spl, pmap_ptp, pmap_l4base, ptes_per_vm_page, &pdpt_cache); +#endif /* __x86_64__ */ + pmap_expand_level(pmap, v, spl, pmap_pde, pmap_ptp, ptes_per_vm_page, &pd_cache); +#endif /* PAE */ + return pmap_expand_level(pmap, v, spl, pmap_pte, pmap_pde, ptes_per_vm_page, &pt_cache); +} + +/* + * Insert the given physical page (p) at + * the specified virtual address (v) in the + * target physical map with the protection requested. + * + * If specified, the page will be wired down, meaning + * that the related pte can not be reclaimed. + * + * NB: This is the only routine which MAY NOT lazy-evaluate + * or lose information. That is, this routine must actually + * insert this page into the given map NOW. + */ +void pmap_enter( + pmap_t pmap, + vm_offset_t v, + phys_addr_t pa, + vm_prot_t prot, + boolean_t wired) +{ + boolean_t is_physmem; + pt_entry_t *pte; + pv_entry_t pv_h; + unsigned long i, pai; + pv_entry_t pv_e; + pt_entry_t template; + int spl; + phys_addr_t old_pa; + + assert(pa != vm_page_fictitious_addr); + if (pmap_debug) printf("pmap(%zx, %llx)\n", v, (unsigned long long) pa); + if (pmap == PMAP_NULL) + return; + +#if !MACH_KDB + if (pmap == kernel_pmap && (v < kernel_virtual_start || v >= kernel_virtual_end)) + panic("pmap_enter(%llx, %llx) falls in physical memory area!\n", v, (unsigned long long) pa); +#endif +#if !(__i486__ || __i586__ || __i686__) + if (pmap == kernel_pmap && (prot & VM_PROT_WRITE) == 0 + && !wired /* hack for io_wire */ ) { + /* + * Because the 386 ignores write protection in kernel mode, + * we cannot enter a read-only kernel mapping, and must + * remove an existing mapping if changing it. + */ + PMAP_READ_LOCK(pmap, spl); + + pte = pmap_pte(pmap, v); + if (pte != PT_ENTRY_NULL && *pte != 0) { + /* + * Invalidate the translation buffer, + * then remove the mapping. + */ + pmap_remove_range(pmap, v, pte, + pte + ptes_per_vm_page); + PMAP_UPDATE_TLBS(pmap, v, v + PAGE_SIZE); + } + PMAP_READ_UNLOCK(pmap, spl); + return; + } +#endif + + /* + * Must allocate a new pvlist entry while we're unlocked; + * Allocating may cause pageout (which will lock the pmap system). + * If we determine we need a pvlist entry, we will unlock + * and allocate one. Then we will retry, throughing away + * the allocated entry later (if we no longer need it). + */ + pv_e = PV_ENTRY_NULL; +Retry: + PMAP_READ_LOCK(pmap, spl); + + pte = pmap_expand(pmap, v, spl); if (vm_page_ready()) is_physmem = (vm_page_lookup_pa(pa) != NULL); @@ -2462,10 +2450,7 @@ void pmap_copy( */ void pmap_collect(pmap_t p) { - int i; - boolean_t free_all; - pt_entry_t *page_dir; - pt_entry_t *pdp, *ptp; + pt_entry_t *ptp; pt_entry_t *eptp; phys_addr_t pa; int spl, wired; @@ -2476,119 +2461,104 @@ void pmap_collect(pmap_t p) if (p == kernel_pmap) return; + /* + * Free the page table tree. + */ #if PAE - for (i = 0; i < lin2pdpnum(VM_MAX_USER_ADDRESS); i++) { #ifdef __x86_64__ -#ifdef USER32 - /* In this case we know we have one PDP for user space */ - pdp = (pt_entry_t *) ptetokv(p->l4base[lin2l4num(VM_MIN_USER_ADDRESS)]); -#else -#warning "TODO do 64-bit userspace need more that 512G?" 
- pdp = (pt_entry_t *) ptetokv(p->l4base[lin2l4num(VM_MIN_USER_ADDRESS)]); -#endif /* USER32 */ - page_dir = (pt_entry_t *) ptetokv(pdp[i]); + for (int l4i = 0; l4i < lin2l4num(VM_MAX_USER_ADDRESS); l4i++) { + pt_entry_t pdp = (pt_entry_t) p->l4base[l4i]; + if (!(pdp & INTEL_PTE_VALID)) + continue; + pt_entry_t *pdpbase = (pt_entry_t*) ptetokv(pdp); + for (int l3i = 0; l3i < 512; l3i++) { #else /* __x86_64__ */ - page_dir = (pt_entry_t *) ptetokv(p->pdpbase[i]); + pt_entry_t *pdpbase = p->pdpbase; + for (int l3i = 0; l3i < lin2pdpnum(VM_MAX_USER_ADDRESS); l3i++) { #endif /* __x86_64__ */ - free_all = i < lin2pdpnum(LINEAR_MIN_KERNEL_ADDRESS); -#else - i = 0; - free_all = FALSE; - page_dir = p->dirbase; -#endif - - /* - * Garbage collect map. - */ - PMAP_READ_LOCK(p, spl); - for (pdp = page_dir; - (free_all - || pdp < &page_dir[lin2pdenum(LINEAR_MIN_KERNEL_ADDRESS)]) - && pdp < &page_dir[NPTES]; - pdp += ptes_per_vm_page) { - if (*pdp & INTEL_PTE_VALID) { - - pa = pte_to_pa(*pdp); - ptp = (pt_entry_t *)phystokv(pa); - eptp = ptp + NPTES*ptes_per_vm_page; - - /* - * If the pte page has any wired mappings, we cannot - * free it. - */ - wired = 0; - { - pt_entry_t *ptep; - for (ptep = ptp; ptep < eptp; ptep++) { - if (*ptep & INTEL_PTE_WIRED) { - wired = 1; - break; - } - } - } - if (!wired) { - /* - * Remove the virtual addresses mapped by this pte page. - */ - { /*XXX big hack*/ - vm_offset_t va = pdenum2lin(pdp - page_dir - + i * NPTES); - if (p == kernel_pmap) - va = lintokv(va); - pmap_remove_range(p, - va, - ptp, - eptp); - } - - /* - * Invalidate the page directory pointer. - */ - { - int i = ptes_per_vm_page; - pt_entry_t *pdep = pdp; - do { + pt_entry_t pde = (pt_entry_t ) pdpbase[l3i]; + if (!(pde & INTEL_PTE_VALID)) + continue; + pt_entry_t *pdebase = (pt_entry_t*) ptetokv(pde); + for (int l2i = 0; l2i < 512; l2i++) { +#else /* PAE */ + pt_entry_t *pdebase = p->dirbase; + for (int l2i = 0; l2i < lin2pdenum(VM_MAX_USER_ADDRESS); l2i++) { +#endif /* PAE */ + pt_entry_t pte = (pt_entry_t) pdebase[l2i]; + if (!(pte & INTEL_PTE_VALID)) + continue; + + pa = pte_to_pa(pte); + ptp = (pt_entry_t *)phystokv(pa); + eptp = ptp + NPTES*ptes_per_vm_page; + + /* + * If the pte page has any wired mappings, we cannot + * free it. + */ + wired = 0; + { + pt_entry_t *ptep; + for (ptep = ptp; ptep < eptp; ptep++) { + if (*ptep & INTEL_PTE_WIRED) { + wired = 1; + break; + } + } + } + if (!wired) { + /* + * Remove the virtual addresses mapped by this pte page. + */ + { /*XXX big hack*/ + vm_offset_t va = pagenum2lin(l4i, l3i, l2i, 0); + if (p == kernel_pmap) + va = lintokv(va); + pmap_remove_range(p, va, ptp, eptp); + } + + /* + * Invalidate the page directory pointer. 
+ */ + { + int i = ptes_per_vm_page; + pt_entry_t *pdep = &pdebase[l2i]; + do { #ifdef MACH_PV_PAGETABLES - unsigned long pte = *pdep; - void *ptable = (void*) ptetokv(pte); - if (!(hyp_mmu_update_pte(pa_to_ma(kvtophys((vm_offset_t)pdep++)), 0))) - panic("%s:%d could not clear pde %p\n",__FILE__,__LINE__,pdep-1); - if (!hyp_mmuext_op_mfn (MMUEXT_UNPIN_TABLE, kv_to_mfn(ptable))) - panic("couldn't unpin page %p(%lx)\n", ptable, (vm_offset_t) pa_to_ma(kvtophys((vm_offset_t)ptable))); - pmap_set_page_readwrite(ptable); + unsigned long pte = *pdep; + void *ptable = (void*) ptetokv(pte); + if (!(hyp_mmu_update_pte(pa_to_ma(kvtophys((vm_offset_t)pdep++)), 0))) + panic("%s:%d could not clear pde %p\n",__FILE__,__LINE__,pdep-1); + if (!hyp_mmuext_op_mfn (MMUEXT_UNPIN_TABLE, kv_to_mfn(ptable))) + panic("couldn't unpin page %p(%lx)\n", ptable, (vm_offset_t) pa_to_ma(kvtophys((vm_offset_t)ptable))); + pmap_set_page_readwrite(ptable); #else /* MACH_PV_PAGETABLES */ - *pdep++ = 0; + *pdep++ = 0; #endif /* MACH_PV_PAGETABLES */ - } while (--i > 0); - } + } while (--i > 0); + } - PMAP_READ_UNLOCK(p, spl); + PMAP_READ_UNLOCK(p, spl); - /* - * And free the pte page itself. - */ - { - vm_page_t m; - - vm_object_lock(pmap_object); - assert(pa == (vm_offset_t) pa); - m = vm_page_lookup(pmap_object, pa); - if (m == VM_PAGE_NULL) - panic("pmap_collect: pte page not in object"); - vm_page_lock_queues(); - vm_page_free(m); - inuse_ptepages_count--; - vm_page_unlock_queues(); - vm_object_unlock(pmap_object); - } + /* + * And free the pte page itself. + */ + kmem_cache_free(&pt_cache, (vm_offset_t)ptetokv(pte)); - PMAP_READ_LOCK(p, spl); - } - } - } + PMAP_READ_LOCK(p, spl); + + } + } #if PAE + // TODO check l2? + } +#ifdef __x86_64__ + // TODO check l3? } -#endif +#endif /* __x86_64__ */ +#endif /* PAE */ + PMAP_UPDATE_TLBS(p, VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS); PMAP_READ_UNLOCK(p, spl); diff --git a/i386/intel/pmap.h b/i386/intel/pmap.h index 4c1b9bd5..5fc7fb25 100644 --- a/i386/intel/pmap.h +++ b/i386/intel/pmap.h @@ -75,7 +75,6 @@ typedef phys_addr_t pt_entry_t; #define L4SHIFT 39 /* L4 shift */ #define L4MASK 0x1ff /* mask for L4 index */ #define PDPNUM_KERNEL (((VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS) >> PDPSHIFT) + 1) -#define PDPNUM_USER (((VM_MAX_USER_ADDRESS - VM_MIN_USER_ADDRESS) >> PDPSHIFT) + 1) #define PDPMASK 0x1ff /* mask for page directory pointer index */ #else /* __x86_64__ */ #define PDPNUM 4 /* number of page directory pointers */ @@ -130,6 +129,26 @@ typedef phys_addr_t pt_entry_t; */ #define pdenum2lin(a) ((vm_offset_t)(a) << PDESHIFT) +#if PAE +#ifdef __x86_64__ +#define pagenum2lin(l4num, l3num, l2num, l1num) \ + (((vm_offset_t)(l4num) << L4SHIFT) + \ + ((vm_offset_t)(l3num) << PDPSHIFT) + \ + ((vm_offset_t)(l2num) << PDESHIFT) + \ + ((vm_offset_t)(l1num) << PTESHIFT)) +#else /* __x86_64__ */ +#define pagenum2lin(l4num, l3num, l2num, l1num) \ + (((vm_offset_t)(l3num) << PDPSHIFT) + \ + ((vm_offset_t)(l2num) << PDESHIFT) + \ + ((vm_offset_t)(l1num) << PTESHIFT)) +#endif +#else /* PAE */ +#define pagenum2lin(l4num, l3num, l2num, l1num) \ + (((vm_offset_t)(l2num) << PDESHIFT) + \ + ((vm_offset_t)(l1num) << PTESHIFT)) +#endif + + /* * Convert linear offset to page table index */ -- 2.30.2