(Note: Last week I asked about this on the freebsd-current list. It turned out to be slightly harder than I thought, as the 512GB kernel virtual area is based on what fits into a single L4 page table entry.)
I was asked to expand the kernel limits for amd64 systems. While I do not have a system with enough RAM to test this for real, the changes below seem to boot and run OK. I went just a little bit wild in create_pagetables(). :-) The lines with the casts got long (and hard to read) so I shortened them (but I still needed the map I drew of the page tables...). If using ptoa() like this is OK, probably there should be a few more of those, e.g., in the changes to pmap_pinit(). Anyway, I wonder if some form of this patch (perhaps even without the #ifdefs) might be accepted back. I'm not sure about the KPML4BASE name, but it clearly needs to be different from KPML4I. (At first I was considering moving KERNBASE too but the branch offsets seem to be the real limiting factor here.) Possibly dumb question: around the comment "this replaces some of the KPTphys entries above", would it be possible to reclaim a few pages by calculating in advance where the 2MB page mappings obviate the need for the underlying KPTphys pages, and just offset things? Another note: one could get rid of the "power of 2" requirement for NDMPML4E. It arises from the translation between direct mapped virtual and physical addresses (being |= and &=~), but the same result can be achieved by adding and subtracting an offset, which would allow the base and limit to be arbitrary, rather than a power of two. (Still, it did not seem worth doing here.) Chris diff --git a/amd64/amd64/pmap.c b/amd64/amd64/pmap.c index 272158d..acf5af2 100644 --- a/amd64/amd64/pmap.c +++ b/amd64/amd64/pmap.c @@ -534,6 +534,10 @@ static void create_pagetables(vm_paddr_t *firstaddr) { int i, j, ndm1g, nkpdpe; + pt_entry_t *pt_p; + pd_entry_t *pd_p; + pdp_entry_t *pdp_p; + pml4_entry_t *p4_p; /* Allocate page table pages for the direct map */ ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT; @@ -556,6 +560,10 @@ create_pagetables(vm_paddr_t *firstaddr) * bootstrap. 
We defer this until after all memory-size dependent * allocations are done (e.g. direct map), so that we don't have to * build in too much slop in our estimate. + * + * Note that when NKPML4E > 1, we have an empty page underneath + * all but the KPML4I'th one, so we need NKPML4E-1 extra (zeroed) + * pages. (pmap_enter requires a PD page to exist for each KPML4E.) */ nkpt_init(*firstaddr); nkpdpe = NKPDPE(nkpt); @@ -564,32 +572,26 @@ create_pagetables(vm_paddr_t *firstaddr) KPDphys = allocpages(firstaddr, nkpdpe); /* Fill in the underlying page table pages */ - /* Read-only from zero to physfree */ + /* Nominally read-only (but really R/W) from zero to physfree */ /* XXX not fully used, underneath 2M pages */ - for (i = 0; (i << PAGE_SHIFT) < *firstaddr; i++) { - ((pt_entry_t *)KPTphys)[i] = i << PAGE_SHIFT; - ((pt_entry_t *)KPTphys)[i] |= PG_RW | PG_V | PG_G; - } + pt_p = (pt_entry_t *)KPTphys; + for (i = 0; ptoa(i) < *firstaddr; i++) + pt_p[i] = ptoa(i) | PG_RW | PG_V | PG_G; /* Now map the page tables at their location within PTmap */ - for (i = 0; i < nkpt; i++) { - ((pd_entry_t *)KPDphys)[i] = KPTphys + (i << PAGE_SHIFT); - ((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V; - } + pd_p = (pd_entry_t *)KPDphys; + for (i = 0; i < nkpt; i++) + pd_p[i] = (KPTphys + ptoa(i)) | PG_RW | PG_V; /* Map from zero to end of allocations under 2M pages */ /* This replaces some of the KPTphys entries above */ - for (i = 0; (i << PDRSHIFT) < *firstaddr; i++) { - ((pd_entry_t *)KPDphys)[i] = i << PDRSHIFT; - ((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V | PG_PS | PG_G; - } + for (i = 0; (i << PDRSHIFT) < *firstaddr; i++) + pd_p[i] = (i << PDRSHIFT) | PG_RW | PG_V | PG_PS | PG_G; - /* And connect up the PD to the PDP */ - for (i = 0; i < nkpdpe; i++) { - ((pdp_entry_t *)KPDPphys)[i + KPDPI] = KPDphys + - (i << PAGE_SHIFT); - ((pdp_entry_t *)KPDPphys)[i + KPDPI] |= PG_RW | PG_V | PG_U; - } + /* And connect up the PD to the PDP (leaving room for L4 pages) */ + pdp_p = (pdp_entry_t 
*)(KPDPphys + ptoa(KPML4I - KPML4BASE)); + for (i = 0; i < nkpdpe; i++) + pdp_p[i + KPDPI] = (KPDphys + ptoa(i)) | PG_RW | PG_V | PG_U; /* * Now, set up the direct map region using 2MB and/or 1GB pages. If @@ -599,37 +601,41 @@ create_pagetables(vm_paddr_t *firstaddr) * memory, pmap_change_attr() will demote any 2MB or 1GB page mappings * that are partially used. */ + pd_p = (pd_entry_t *)DMPDphys; for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) { - ((pd_entry_t *)DMPDphys)[j] = (vm_paddr_t)i << PDRSHIFT; + pd_p[j] = (vm_paddr_t)i << PDRSHIFT; /* Preset PG_M and PG_A because demotion expects it. */ - ((pd_entry_t *)DMPDphys)[j] |= PG_RW | PG_V | PG_PS | PG_G | + pd_p[j] |= PG_RW | PG_V | PG_PS | PG_G | PG_M | PG_A; } + pdp_p = (pdp_entry_t *)DMPDPphys; for (i = 0; i < ndm1g; i++) { - ((pdp_entry_t *)DMPDPphys)[i] = (vm_paddr_t)i << PDPSHIFT; + pdp_p[i] = (vm_paddr_t)i << PDPSHIFT; /* Preset PG_M and PG_A because demotion expects it. */ - ((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_PS | PG_G | + pdp_p[i] |= PG_RW | PG_V | PG_PS | PG_G | PG_M | PG_A; } for (j = 0; i < ndmpdp; i++, j++) { - ((pdp_entry_t *)DMPDPphys)[i] = DMPDphys + (j << PAGE_SHIFT); - ((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_U; + pdp_p[i] = DMPDphys + ptoa(j); + pdp_p[i] |= PG_RW | PG_V | PG_U; } /* And recursively map PML4 to itself in order to get PTmap */ - ((pdp_entry_t *)KPML4phys)[PML4PML4I] = KPML4phys; - ((pdp_entry_t *)KPML4phys)[PML4PML4I] |= PG_RW | PG_V | PG_U; + p4_p = (pml4_entry_t *)KPML4phys; + p4_p[PML4PML4I] = KPML4phys; + p4_p[PML4PML4I] |= PG_RW | PG_V | PG_U; /* Connect the Direct Map slot(s) up to the PML4. 
*/ for (i = 0; i < NDMPML4E; i++) { - ((pdp_entry_t *)KPML4phys)[DMPML4I + i] = DMPDPphys + - (i << PAGE_SHIFT); - ((pdp_entry_t *)KPML4phys)[DMPML4I + i] |= PG_RW | PG_V | PG_U; + p4_p[DMPML4I + i] = DMPDPphys + ptoa(i); + p4_p[DMPML4I + i] |= PG_RW | PG_V | PG_U; } - /* Connect the KVA slot up to the PML4 */ - ((pdp_entry_t *)KPML4phys)[KPML4I] = KPDPphys; - ((pdp_entry_t *)KPML4phys)[KPML4I] |= PG_RW | PG_V | PG_U; + /* Connect the KVA slots up to the PML4 */ + for (i = 0; i < NKPML4E; i++) { + p4_p[KPML4BASE + i] = KPDPphys + ptoa(i); + p4_p[KPML4BASE + i] |= PG_RW | PG_V | PG_U; + } } /* @@ -1688,7 +1694,10 @@ pmap_pinit(pmap_t pmap) pagezero(pmap->pm_pml4); /* Wire in kernel global address entries. */ - pmap->pm_pml4[KPML4I] = KPDPphys | PG_RW | PG_V | PG_U; + for (i = 0; i < NKPML4E; i++) { + pmap->pm_pml4[KPML4BASE + i] = (KPDPphys + (i << PAGE_SHIFT)) | + PG_RW | PG_V | PG_U; + } for (i = 0; i < NDMPML4E; i++) { pmap->pm_pml4[DMPML4I + i] = (DMPDPphys + (i << PAGE_SHIFT)) | PG_RW | PG_V | PG_U; @@ -1944,7 +1953,8 @@ pmap_release(pmap_t pmap) m = PHYS_TO_VM_PAGE(pmap->pm_pml4[PML4PML4I] & PG_FRAME); - pmap->pm_pml4[KPML4I] = 0; /* KVA */ + for (i = 0; i < NKPML4E; i++) /* KVA */ + pmap->pm_pml4[KPML4BASE + i] = 0; for (i = 0; i < NDMPML4E; i++) /* Direct Map */ pmap->pm_pml4[DMPML4I + i] = 0; pmap->pm_pml4[PML4PML4I] = 0; /* Recursive Mapping */ diff --git a/amd64/include/pmap.h b/amd64/include/pmap.h index 6d76ec3..58d1c9d 100644 --- a/amd64/include/pmap.h +++ b/amd64/include/pmap.h @@ -113,7 +113,17 @@ ((unsigned long)(l2) << PDRSHIFT) | \ ((unsigned long)(l1) << PAGE_SHIFT)) -#define NKPML4E 1 /* number of kernel PML4 slots */ +/* + * Number of kernel PML4 slots. Can be anywhere from 1 to 64 or so, + * but setting it larger than NDMPML4E makes no sense. + * + * Each slot provides .5 TB of kernel virtual space. 
+ */ +#ifdef AMD64_HUGE +#define NKPML4E 16 +#else +#define NKPML4E 1 +#endif #define NUPML4E (NPML4EPG/2) /* number of userland PML4 pages */ #define NUPDPE (NUPML4E*NPDPEPG)/* number of userland PDP pages */ @@ -121,20 +131,39 @@ /* * NDMPML4E is the number of PML4 entries that are used to implement the - * direct map. It must be a power of two. + * direct map. It must be a power of two, and should generally exceed + * NKPML4E. The maximum possible value is 64; using 128 will make the + * direct map intrude into the recursive page table map. */ +#ifdef AMD64_HUGE +#define NDMPML4E 32 +#else #define NDMPML4E 2 +#endif /* - * The *PDI values control the layout of virtual memory. The starting address + * These values control the layout of virtual memory. The starting address * of the direct map, which is controlled by DMPML4I, must be a multiple of * its size. (See the PHYS_TO_DMAP() and DMAP_TO_PHYS() macros.) + * + * Note: KPML4I is the index of the (single) level 4 page that maps + * the KVA that holds KERNBASE, while KPML4BASE is the index of the + * first level 4 page that maps VM_MIN_KERNEL_ADDRESS. If NKPML4E + * is 1, these are the same, otherwise KPML4BASE < KPML4I and extra + * level 4 PDEs are needed to map from VM_MIN_KERNEL_ADDRESS up to + * KERNBASE. Similarly, if KPML4I < NKPML4E, extra level 4 PDEs are + * needed to map from somewhere-above-KERNBASE to VM_MAX_KERNEL_ADDRESS. + * + * (KPML4I combines with KPDPI to choose where KERNBASE starts. + * Or, in other words, KPML4I provides bits 39..47 of KERNBASE, + * and KPDPI provides bits 30..38.) 
*/ #define PML4PML4I (NPML4EPG/2) /* Index of recursive pml4 mapping */ -#define KPML4I (NPML4EPG-1) /* Top 512GB for KVM */ -#define DMPML4I rounddown(KPML4I - NDMPML4E, NDMPML4E) /* Below KVM */ +#define KPML4BASE (NPML4EPG-NKPML4E) /* KVM at highest addresses */ +#define DMPML4I rounddown(KPML4BASE-NDMPML4E, NDMPML4E) /* Below KVM */ +#define KPML4I (NPML4EPG-1) #define KPDPI (NPDPEPG-2) /* kernbase at -2GB */ /* diff --git a/amd64/include/vmparam.h b/amd64/include/vmparam.h index 33f62bd..47a8ef8 100644 --- a/amd64/include/vmparam.h +++ b/amd64/include/vmparam.h @@ -145,18 +145,26 @@ * 0x0000000000000000 - 0x00007fffffffffff user map * 0x0000800000000000 - 0xffff7fffffffffff does not exist (hole) * 0xffff800000000000 - 0xffff804020100fff recursive page table (512GB slot) +#ifdef AMD64_HUGE + * 0xffff804020101000 - 0xffffdfffffffffff unused + * 0xffffe00000000000 - 0xffffefffffffffff 16TB direct map + * 0xfffff00000000000 - 0xfffff7ffffffffff unused + * 0xfffff80000000000 - 0xffffffffffffffff 8TB kernel map +#else * 0xffff804020101000 - 0xfffffdffffffffff unused * 0xfffffe0000000000 - 0xfffffeffffffffff 1TB direct map * 0xffffff0000000000 - 0xffffff7fffffffff unused * 0xffffff8000000000 - 0xffffffffffffffff 512GB kernel map +#endif * * Within the kernel map: * * 0xffffffff80000000 KERNBASE */ -#define VM_MAX_KERNEL_ADDRESS KVADDR(KPML4I, NPDPEPG-1, NPDEPG-1, NPTEPG-1) -#define VM_MIN_KERNEL_ADDRESS KVADDR(KPML4I, NPDPEPG-512, 0, 0) +#define VM_MIN_KERNEL_ADDRESS KVADDR(KPML4BASE, 0, 0, 0) +#define VM_MAX_KERNEL_ADDRESS KVADDR(KPML4BASE + NKPML4E - 1, \ + NPDPEPG-1, NPDEPG-1, NPTEPG-1) #define DMAP_MIN_ADDRESS KVADDR(DMPML4I, 0, 0, 0) #define DMAP_MAX_ADDRESS KVADDR(DMPML4I + NDMPML4E, 0, 0, 0) diff --git a/conf/options.amd64 b/conf/options.amd64 index 90348b7..f3ce505 100644 --- a/conf/options.amd64 +++ b/conf/options.amd64 @@ -1,6 +1,7 @@ # $FreeBSD$ # Options specific to AMD64 platform kernels +AMD64_HUGE opt_global.h AUTO_EOI_1 opt_auto_eoi.h AUTO_EOI_2 
opt_auto_eoi.h COUNT_XINVLTLB_HITS opt_smp.h _______________________________________________ freebsd-hackers@freebsd.org mailing list http://lists.freebsd.org/mailman/listinfo/freebsd-hackers To unsubscribe, send any mail to "freebsd-hackers-unsubscr...@freebsd.org"