The branch main has been updated by alc:

URL: https://cgit.FreeBSD.org/src/commit/?id=3e00c11a4f43bf1c7b88d25638e2bfee399e7674

commit 3e00c11a4f43bf1c7b88d25638e2bfee399e7674
Author:     Alan Cox <a...@freebsd.org>
AuthorDate: 2024-07-12 07:44:56 +0000
Commit:     Alan Cox <a...@freebsd.org>
CommitDate: 2024-07-13 17:43:42 +0000

    arm64: Support the L3 ATTR_CONTIGUOUS page size in pagesizes[]
    
    Update pagesizes[] to include the L3 ATTR_CONTIGUOUS (L3C) page size,
    which is 64KB when the base page size is 4KB and 2MB when the base page
    size is 16KB.
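
    For reference, the resulting psind numbering on arm64 (a summary for the
    reader, not part of the change itself; the first size is for a 4KB base
    page, the second for a 16KB base page):

        psind 0: PAGE_SIZE -- 4KB  / 16KB
        psind 1: L3C_SIZE  -- 64KB / 2MB   (ATTR_CONTIGUOUS at L3)
        psind 2: L2_SIZE   -- 2MB  / 32MB
        psind 3: L1_SIZE   -- 1GB  (4KB base page only)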
    
    Add support for L3C pages to shm_create_largepage().
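
    As an illustrative userland sketch (not part of this commit; the shm
    object path and error handling are made up for the example), a 64KB-backed
    largepage object could now be requested with psind 1, where psind indexes
    the sizes returned by getpagesizes(3):

        #include <sys/mman.h>
        #include <err.h>
        #include <fcntl.h>
        #include <string.h>
        #include <unistd.h>

        int
        main(void)
        {
                size_t ps[8];
                void *p;
                int fd, n, psind;

                n = getpagesizes(ps, 8);
                psind = 1;      /* 64KB L3C size on 4KB-granule arm64 */
                if (n <= psind)
                        errx(1, "no superpage sizes reported");
                fd = shm_create_largepage("/l3c_demo", O_CREAT | O_RDWR,
                    psind, SHM_LARGEPAGE_ALLOC_DEFAULT, 0600);
                if (fd < 0)
                        err(1, "shm_create_largepage");
                /* Largepage objects are sized in multiples of pagesizes[psind]. */
                if (ftruncate(fd, ps[psind]) != 0)
                        err(1, "ftruncate");
                p = mmap(NULL, ps[psind], PROT_READ | PROT_WRITE, MAP_SHARED,
                    fd, 0);
                if (p == MAP_FAILED)
                        err(1, "mmap");
                memset(p, 0, ps[psind]);        /* touch the 64KB mapping */
                shm_unlink("/l3c_demo");
                return (0);
        }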
    
    Add support for creating L3C page mappings to pmap_enter(psind=1).
    
    Add support for reporting L3C page mappings to mincore(2) and
    procstat(8).
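
    A hypothetical helper showing how the new reporting can be consumed:
    mincore(2) encodes the mapping's page size index in the MINCORE_SUPER
    bits, which the sys/mman.h change below exposes via MINCORE_PSIND_SHIFT
    (on arm64, psind 1 is an L3C mapping, 2 is L2, 3 is L1):

        #include <sys/mman.h>
        #include <err.h>
        #include <stdio.h>
        #include <stdlib.h>
        #include <unistd.h>

        /*
         * Print the superpage size index, if any, backing each resident
         * page of [addr, addr + len).
         */
        static void
        report_psind(const void *addr, size_t len)
        {
                char *vec;
                size_t i, npages;
                int psind;

                npages = (len + getpagesize() - 1) / getpagesize();
                if ((vec = malloc(npages)) == NULL)
                        err(1, "malloc");
                if (mincore(addr, len, vec) != 0)
                        err(1, "mincore");
                for (i = 0; i < npages; i++) {
                        psind = (vec[i] & MINCORE_SUPER) >> MINCORE_PSIND_SHIFT;
                        if (psind != 0)
                                printf("page %zu: psind %d\n", i, psind);
                }
                free(vec);
        }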
    
    Update vm_fault_soft_fast() and vm_fault_populate() to handle multiple
    superpage sizes.
    
    Declare arm64 as supporting two superpage reservation sizes, and
    simulate two superpage reservation sizes, updating the vm_page's psind
    field to reflect the correct page size from pagesizes[].  (The next
    patch in this series will replace this simulation.  This patch is
    already big enough.)
    
    Co-authored-by: Eliot Solomon <e...@rice.edu>
    Reviewed by:    kib
    Differential Revision:  https://reviews.freebsd.org/D45766
---
 share/man/man7/arch.7       |   2 +-
 sys/arm64/arm64/pmap.c      | 162 +++++++++++++++++++++++++++++++++-----------
 sys/arm64/include/param.h   |   2 +-
 sys/arm64/include/vmparam.h |  25 ++++---
 sys/kern/imgact_elf.c       |   8 ++-
 sys/kern/kern_mib.c         |   8 ++-
 sys/kern/kern_proc.c        |  12 +++-
 sys/kern/uipc_shm.c         |  15 +++-
 sys/sys/mman.h              |   4 +-
 sys/vm/vm_domainset.c       |   3 +
 sys/vm/vm_fault.c           |  32 ++++++---
 sys/vm/vm_glue.c            |   5 +-
 sys/vm/vm_kern.c            |   5 +-
 sys/vm/vm_map.c             |  46 ++++++++++---
 sys/vm/vm_page.c            |   6 +-
 sys/vm/vm_page.h            |   2 +-
 sys/vm/vm_reserv.c          | 104 ++++++++++++++++++++++++----
 17 files changed, 344 insertions(+), 97 deletions(-)

diff --git a/share/man/man7/arch.7 b/share/man/man7/arch.7
index f3d2e1036706..88228b807e6a 100644
--- a/share/man/man7/arch.7
+++ b/share/man/man7/arch.7
@@ -218,7 +218,7 @@ is 8 bytes on all supported architectures except i386.
 .Ss Page Size
 .Bl -column -offset indent "Architecture" "Page Sizes"
 .It Sy Architecture Ta Sy Page Sizes
-.It aarch64     Ta 4K, 2M, 1G
+.It aarch64     Ta 4K, 64K, 2M, 1G
 .It amd64       Ta 4K, 2M, 1G
 .It armv7       Ta 4K, 1M
 .It i386        Ta 4K, 2M (PAE), 4M
diff --git a/sys/arm64/arm64/pmap.c b/sys/arm64/arm64/pmap.c
index c3357900e1be..2540b5eaf4b9 100644
--- a/sys/arm64/arm64/pmap.c
+++ b/sys/arm64/arm64/pmap.c
@@ -1631,11 +1631,14 @@ pmap_init(void)
        if (superpages_enabled) {
                KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
                    ("pmap_init: can't assign to pagesizes[1]"));
-               pagesizes[1] = L2_SIZE;
+               pagesizes[1] = L3C_SIZE;
+               KASSERT(MAXPAGESIZES > 2 && pagesizes[2] == 0,
+                   ("pmap_init: can't assign to pagesizes[2]"));
+               pagesizes[2] = L2_SIZE;
                if (L1_BLOCKS_SUPPORTED) {
-                       KASSERT(MAXPAGESIZES > 2 && pagesizes[2] == 0,
-                           ("pmap_init: can't assign to pagesizes[2]"));
-                       pagesizes[2] = L1_SIZE;
+                       KASSERT(MAXPAGESIZES > 3 && pagesizes[3] == 0,
+                           ("pmap_init: can't assign to pagesizes[3]"));
+                       pagesizes[3] = L1_SIZE;
                }
        }
 
@@ -4959,7 +4962,7 @@ static int
 pmap_enter_largepage(pmap_t pmap, vm_offset_t va, pt_entry_t pte, int flags,
     int psind)
 {
-       pd_entry_t *l0p, *l1p, *l2p, newpte, origpte;
+       pd_entry_t *l0p, *l1p, *l2p, *l3p, newpte, origpte, *tl3p;
        vm_page_t mp;
 
        PMAP_LOCK_ASSERT(pmap, MA_OWNED);
@@ -4973,9 +4976,11 @@ restart:
        newpte = pte;
        if (!pmap_bti_same(pmap, va, va + pagesizes[psind], &newpte))
                return (KERN_PROTECTION_FAILURE);
-       if (psind == 2) {
+       if (psind == 3) {
                PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
 
+               KASSERT(pagesizes[psind] == L1_SIZE,
+                   ("pagesizes[%d] != L1_SIZE", psind));
                l0p = pmap_l0(pmap, va);
                if ((pmap_load(l0p) & ATTR_DESCR_VALID) == 0) {
                        mp = _pmap_alloc_l3(pmap, pmap_l0_pindex(va), NULL);
@@ -5005,7 +5010,9 @@ restart:
                    ("va %#lx changing 1G phys page l1 %#lx newpte %#lx",
                    va, origpte, newpte));
                pmap_store(l1p, newpte);
-       } else /* (psind == 1) */ {
+       } else if (psind == 2) {
+               KASSERT(pagesizes[psind] == L2_SIZE,
+                   ("pagesizes[%d] != L2_SIZE", psind));
                l2p = pmap_l2(pmap, va);
                if (l2p == NULL) {
                        mp = _pmap_alloc_l3(pmap, pmap_l1_pindex(va), NULL);
@@ -5034,6 +5041,40 @@ restart:
                    ("va %#lx changing 2M phys page l2 %#lx newpte %#lx",
                    va, origpte, newpte));
                pmap_store(l2p, newpte);
+       } else /* (psind == 1) */ {
+               KASSERT(pagesizes[psind] == L3C_SIZE,
+                   ("pagesizes[%d] != L3C_SIZE", psind));
+               l2p = pmap_l2(pmap, va);
+               if (l2p == NULL || (pmap_load(l2p) & ATTR_DESCR_VALID) == 0) {
+                       mp = _pmap_alloc_l3(pmap, pmap_l2_pindex(va), NULL);
+                       if (mp == NULL) {
+                               if ((flags & PMAP_ENTER_NOSLEEP) != 0)
+                                       return (KERN_RESOURCE_SHORTAGE);
+                               PMAP_UNLOCK(pmap);
+                               vm_wait(NULL);
+                               PMAP_LOCK(pmap);
+                               goto restart;
+                       }
+                       mp->ref_count += L3C_ENTRIES - 1;
+                       l3p = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mp));
+                       l3p = &l3p[pmap_l3_index(va)];
+               } else {
+                       l3p = pmap_l2_to_l3(l2p, va);
+                       if ((pmap_load(l3p) & ATTR_DESCR_VALID) == 0) {
+                               mp = PTE_TO_VM_PAGE(pmap_load(l2p));
+                               mp->ref_count += L3C_ENTRIES;
+                       }
+               }
+               for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
+                       origpte = pmap_load(tl3p);
+                       KASSERT((origpte & ATTR_DESCR_VALID) == 0 ||
+                           ((origpte & ATTR_CONTIGUOUS) != 0 &&
+                           PTE_TO_PHYS(origpte) == PTE_TO_PHYS(newpte)),
+                           ("va %#lx changing 64K phys page l3 %#lx newpte %#lx",
+                           va, origpte, newpte));
+                       pmap_store(tl3p, newpte);
+                       newpte += L3_SIZE;
+               }
        }
        dsb(ishst);
 
@@ -5072,7 +5113,7 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
        vm_paddr_t opa, pa;
        vm_page_t mpte, om;
        bool nosleep;
-       int lvl, rv;
+       int full_lvl, lvl, rv;
 
        KASSERT(ADDR_IS_CANONICAL(va),
            ("%s: Address not in canonical form: %lx", __func__, va));
@@ -5128,24 +5169,47 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
        if ((flags & PMAP_ENTER_LARGEPAGE) != 0) {
                KASSERT((m->oflags & VPO_UNMANAGED) != 0,
                    ("managed largepage va %#lx flags %#x", va, flags));
-               new_l3 &= ~L3_PAGE;
-               if (psind == 2) {
+               if (psind == 3) {
                        PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
+                       new_l3 &= ~L3_PAGE;
                        new_l3 |= L1_BLOCK;
-               } else /* (psind == 1) */
+               } else if (psind == 2) {
+                       new_l3 &= ~L3_PAGE;
                        new_l3 |= L2_BLOCK;
+               } else /* (psind == 1) */
+                       new_l3 |= ATTR_CONTIGUOUS;
                rv = pmap_enter_largepage(pmap, va, new_l3, flags, psind);
                goto out;
        }
-       if (psind == 1) {
+       if (psind == 2) {
                /* Assert the required virtual and physical alignment. */
                KASSERT((va & L2_OFFSET) == 0, ("pmap_enter: va unaligned"));
-               KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind"));
+               KASSERT(m->psind > 1, ("pmap_enter: m->psind < psind"));
                rv = pmap_enter_l2(pmap, va, (new_l3 & ~L3_PAGE) | L2_BLOCK,
                    flags, m, &lock);
                goto out;
        }
        mpte = NULL;
+       if (psind == 1) {
+               KASSERT((va & L3C_OFFSET) == 0, ("pmap_enter: va unaligned"));
+               KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind"));
+               rv = pmap_enter_l3c(pmap, va, new_l3 | ATTR_CONTIGUOUS, flags,
+                   m, &mpte, &lock);
+#if VM_NRESERVLEVEL > 0
+               /*
+                * Attempt L2 promotion, if both the PTP and a level 1
+                * reservation are fully populated.
+                */
+               if (rv == KERN_SUCCESS &&
+                   (mpte == NULL || mpte->ref_count == NL3PG) &&
+                   (m->flags & PG_FICTITIOUS) == 0 &&
+                   vm_reserv_level_iffullpop(m) == 1) {
+                       pde = pmap_l2(pmap, va);
+                       (void)pmap_promote_l2(pmap, pde, va, mpte, &lock);
+               }
+#endif
+               goto out;
+       }
 
        /*
         * In the case that a page table page is not
@@ -5365,15 +5429,14 @@ validate:
         * are aligned with each other and an underlying reservation has the
         * neighboring L3 pages allocated.  The first condition is simply an
         * optimization that recognizes some eventual promotion failures early
-        * at a lower run-time cost.  Then, if both the page table page and
-        * the reservation are fully populated, attempt L2 promotion.
+        * at a lower run-time cost.  Then, if both a level 1 reservation and
+        * the PTP are fully populated, attempt L2 promotion.
         */
        if ((va & L3C_OFFSET) == (pa & L3C_OFFSET) &&
            (m->flags & PG_FICTITIOUS) == 0 &&
-           vm_reserv_is_populated(m, L3C_ENTRIES) &&
+           (full_lvl = vm_reserv_level_iffullpop(m)) >= 0 &&
            pmap_promote_l3c(pmap, l3, va) &&
-           (mpte == NULL || mpte->ref_count == NL3PG) &&
-           vm_reserv_level_iffullpop(m) == 0)
+           full_lvl == 1 && (mpte == NULL || mpte->ref_count == NL3PG))
                (void)pmap_promote_l2(pmap, pde, va, mpte, &lock);
 #endif
 
@@ -5667,6 +5730,8 @@ pmap_enter_l3c(pmap_t pmap, vm_offset_t va, pt_entry_t l3e, u_int flags,
            ("pmap_enter_l3c: va is not aligned"));
        KASSERT(!VA_IS_CLEANMAP(va) || (l3e & ATTR_SW_MANAGED) == 0,
            ("pmap_enter_l3c: managed mapping within the clean submap"));
+       KASSERT((l3e & ATTR_CONTIGUOUS) != 0,
+           ("pmap_enter_l3c: l3e is missing ATTR_CONTIGUOUS"));
 
        /*
         * If the L3 PTP is not resident, we attempt to create it here.
@@ -5873,14 +5938,12 @@ pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
        while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
                va = start + ptoa(diff);
                if ((va & L2_OFFSET) == 0 && va + L2_SIZE <= end &&
-                   m->psind == 1 && pmap_ps_enabled(pmap) &&
+                   m->psind == 2 && pmap_ps_enabled(pmap) &&
                    ((rv = pmap_enter_l2_rx(pmap, va, m, prot, &lock)) ==
                    KERN_SUCCESS || rv == KERN_NO_SPACE))
                        m = &m[L2_SIZE / PAGE_SIZE - 1];
                else if ((va & L3C_OFFSET) == 0 && va + L3C_SIZE <= end &&
-                   (VM_PAGE_TO_PHYS(m) & L3C_OFFSET) == 0 &&
-                   vm_reserv_is_populated(m, L3C_ENTRIES) &&
-                   pmap_ps_enabled(pmap) &&
+                   m->psind >= 1 && pmap_ps_enabled(pmap) &&
                    ((rv = pmap_enter_l3c_rx(pmap, va, m, &mpte, prot,
                    &lock)) == KERN_SUCCESS || rv == KERN_NO_SPACE))
                        m = &m[L3C_ENTRIES - 1];
@@ -5932,7 +5995,7 @@ pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
 {
        pt_entry_t *l1, *l2, *l3, l3_val;
        vm_paddr_t pa;
-       int lvl;
+       int full_lvl, lvl;
 
        KASSERT(!VA_IS_CLEANMAP(va) ||
            (m->oflags & VPO_UNMANAGED) != 0,
@@ -6063,18 +6126,17 @@ pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
         * are aligned with each other and an underlying reservation has the
         * neighboring L3 pages allocated.  The first condition is simply an
         * optimization that recognizes some eventual promotion failures early
-        * at a lower run-time cost.  Then, attempt L2 promotion, if both the
-        * PTP and the reservation are fully populated.
+        * at a lower run-time cost.  Then, attempt L2 promotion, if both a
+        * level 1 reservation and the PTP are fully populated.
         */
        if ((prot & VM_PROT_NO_PROMOTE) == 0 &&
            (va & L3C_OFFSET) == (pa & L3C_OFFSET) &&
            (m->flags & PG_FICTITIOUS) == 0 &&
-           vm_reserv_is_populated(m, L3C_ENTRIES) &&
+           (full_lvl = vm_reserv_level_iffullpop(m)) >= 0 &&
            pmap_promote_l3c(pmap, l3, va) &&
-           (mpte == NULL || mpte->ref_count == NL3PG) &&
-           vm_reserv_level_iffullpop(m) == 0) {
+           full_lvl == 1 && (mpte == NULL || mpte->ref_count == NL3PG)) {
                if (l2 == NULL)
-                       l2 = pmap_pde(pmap, va, &lvl);
+                       l2 = pmap_l2(pmap, va);
 
                /*
                 * If promotion succeeds, then the next call to this function
@@ -8566,7 +8628,7 @@ pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap)
 {
        pt_entry_t *pte, tpte;
        vm_paddr_t mask, pa;
-       int lvl, val;
+       int lvl, psind, val;
        bool managed;
 
        PMAP_ASSERT_STAGE1(pmap);
@@ -8578,21 +8640,22 @@ pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap)
                switch (lvl) {
                case 3:
                        mask = L3_OFFSET;
+                       psind = (tpte & ATTR_CONTIGUOUS) != 0 ? 1 : 0;
                        break;
                case 2:
                        mask = L2_OFFSET;
+                       psind = 2;
                        break;
                case 1:
                        mask = L1_OFFSET;
+                       psind = 3;
                        break;
                default:
                        panic("pmap_mincore: invalid level %d", lvl);
                }
 
                managed = (tpte & ATTR_SW_MANAGED) != 0;
-               val = MINCORE_INCORE;
-               if (lvl != 3)
-                       val |= MINCORE_PSIND(3 - lvl);
+               val = MINCORE_INCORE | MINCORE_PSIND(psind);
                if ((managed && pmap_pte_dirty(pmap, tpte)) || (!managed &&
                    (tpte & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP(ATTR_S1_AP_RW)))
                        val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
@@ -9128,18 +9191,37 @@ pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
 {
        vm_offset_t superpage_offset;
 
-       if (size < L2_SIZE)
+       if (size < L3C_SIZE)
                return;
        if (object != NULL && (object->flags & OBJ_COLORED) != 0)
                offset += ptoa(object->pg_color);
+
+       /*
+        * Considering the object's physical alignment, is the mapping large
+        * enough to encompass an L2 (2MB/32MB) superpage ...
+        */
        superpage_offset = offset & L2_OFFSET;
-       if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) < L2_SIZE ||
-           (*addr & L2_OFFSET) == superpage_offset)
+       if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) >= L2_SIZE) {
+               /*
+                * If the virtual and physical alignments differ, then
+                * increase the virtual address so that the alignments match.
+                */
+               if ((*addr & L2_OFFSET) < superpage_offset)
+                       *addr = (*addr & ~L2_OFFSET) + superpage_offset;
+               else if ((*addr & L2_OFFSET) > superpage_offset)
+                       *addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) +
+                           superpage_offset;
                return;
-       if ((*addr & L2_OFFSET) < superpage_offset)
-               *addr = (*addr & ~L2_OFFSET) + superpage_offset;
-       else
-               *addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + superpage_offset;
+       }
+       /* ... or an L3C (64KB/2MB) superpage? */
+       superpage_offset = offset & L3C_OFFSET;
+       if (size - ((L3C_SIZE - superpage_offset) & L3C_OFFSET) >= L3C_SIZE) {
+               if ((*addr & L3C_OFFSET) < superpage_offset)
+                       *addr = (*addr & ~L3C_OFFSET) + superpage_offset;
+               else if ((*addr & L3C_OFFSET) > superpage_offset)
+                       *addr = ((*addr + L3C_OFFSET) & ~L3C_OFFSET) +
+                           superpage_offset;
+       }
 }
 
 /**
diff --git a/sys/arm64/include/param.h b/sys/arm64/include/param.h
index ca3fae11c515..6eb58a69dba1 100644
--- a/sys/arm64/include/param.h
+++ b/sys/arm64/include/param.h
@@ -97,7 +97,7 @@
 #define        PAGE_SIZE       (1 << PAGE_SHIFT)
 #define        PAGE_MASK       (PAGE_SIZE - 1)
 
-#define        MAXPAGESIZES    3               /* maximum number of supported page sizes */
+#define        MAXPAGESIZES    4               /* maximum number of supported page sizes */
 
 #ifndef KSTACK_PAGES
 #if defined(KASAN) || defined(KMSAN)
diff --git a/sys/arm64/include/vmparam.h b/sys/arm64/include/vmparam.h
index 28b8380fc7c1..349849845e73 100644
--- a/sys/arm64/include/vmparam.h
+++ b/sys/arm64/include/vmparam.h
@@ -114,25 +114,34 @@
 #endif
 
 /*
- * Enable superpage reservations: 1 level.
+ * Enable superpage reservations: 2 levels.
  */
 #ifndef        VM_NRESERVLEVEL
-#define        VM_NRESERVLEVEL         1
+#define        VM_NRESERVLEVEL         2
 #endif
 
 /*
- * Level 0 reservations consist of 512 pages when PAGE_SIZE is 4KB, and
- * 2048 pages when PAGE_SIZE is 16KB.
+ * Level 0 reservations consist of 16 pages when PAGE_SIZE is 4KB, and 128
+ * pages when PAGE_SIZE is 16KB.  Level 1 reservations consist of 32 64KB
+ * pages when PAGE_SIZE is 4KB, and 16 2M pages when PAGE_SIZE is 16KB.
  */
-#ifndef        VM_LEVEL_0_ORDER
 #if PAGE_SIZE == PAGE_SIZE_4K
-#define        VM_LEVEL_0_ORDER        9
+#ifndef        VM_LEVEL_0_ORDER
+#define        VM_LEVEL_0_ORDER        4
+#endif
+#ifndef        VM_LEVEL_1_ORDER
+#define        VM_LEVEL_1_ORDER        5
+#endif
 #elif PAGE_SIZE == PAGE_SIZE_16K
-#define        VM_LEVEL_0_ORDER        11
+#ifndef        VM_LEVEL_0_ORDER
+#define        VM_LEVEL_0_ORDER        7
+#endif
+#ifndef        VM_LEVEL_1_ORDER
+#define        VM_LEVEL_1_ORDER        4
+#endif
 #else
 #error Unsupported page size
 #endif
-#endif
 
 /**
  * Address space layout.
diff --git a/sys/kern/imgact_elf.c b/sys/kern/imgact_elf.c
index 4cee366ed747..a623a63e9c2e 100644
--- a/sys/kern/imgact_elf.c
+++ b/sys/kern/imgact_elf.c
@@ -1360,8 +1360,12 @@ __CONCAT(exec_, __elfN(imgact))(struct image_params *imgp)
        if ((map->flags & MAP_ASLR) != 0) {
                maxv1 = maxv / 2 + addr / 2;
                error = __CONCAT(rnd_, __elfN(base))(map, addr, maxv1,
-                   (MAXPAGESIZES > 1 && pagesizes[1] != 0) ?
-                   pagesizes[1] : pagesizes[0], &anon_loc);
+#if VM_NRESERVLEVEL > 0
+                   pagesizes[VM_NRESERVLEVEL] != 0 ?
+                   /* Align anon_loc to the largest superpage size. */
+                   pagesizes[VM_NRESERVLEVEL] :
+#endif
+                   pagesizes[0], &anon_loc);
                if (error != 0)
                        goto ret;
                map->anon_loc = anon_loc;
diff --git a/sys/kern/kern_mib.c b/sys/kern/kern_mib.c
index e4f557a3461d..5724ed3f6932 100644
--- a/sys/kern/kern_mib.c
+++ b/sys/kern/kern_mib.c
@@ -58,6 +58,8 @@
 #include <sys/systm.h>
 #include <sys/unistd.h>
 
+#include <vm/vm_param.h>
+
 SYSCTL_ROOT_NODE(0, sysctl, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Sysctl internal magic");
 SYSCTL_ROOT_NODE(CTL_KERN, kern, CTLFLAG_RW | CTLFLAG_CAPRD | CTLFLAG_MPSAFE, 0,
@@ -242,7 +244,11 @@ SYSCTL_PROC(_hw, HW_USERMEM, usermem,
 SYSCTL_LONG(_hw, OID_AUTO, availpages, CTLFLAG_RD, &physmem, 0,
     "Amount of physical memory (in pages)");
 
-u_long pagesizes[MAXPAGESIZES] = { PAGE_SIZE };
+#if VM_NRESERVLEVEL > 0
+_Static_assert(MAXPAGESIZES > VM_NRESERVLEVEL, "MAXPAGESIZES is too small");
+#endif
+
+u_long __read_mostly pagesizes[MAXPAGESIZES] = { PAGE_SIZE };
 
 static int
 sysctl_hw_pagesizes(SYSCTL_HANDLER_ARGS)
diff --git a/sys/kern/kern_proc.c b/sys/kern/kern_proc.c
index 2ecc82026b3f..52b361832218 100644
--- a/sys/kern/kern_proc.c
+++ b/sys/kern/kern_proc.c
@@ -2542,6 +2542,7 @@ kern_proc_vmmap_resident(vm_map_t map, vm_map_entry_t entry,
        vm_offset_t addr;
        vm_paddr_t pa;
        vm_pindex_t pi, pi_adv, pindex;
+       int incore;
 
        *super = false;
        *resident_count = 0;
@@ -2577,10 +2578,15 @@ kern_proc_vmmap_resident(vm_map_t map, vm_map_entry_t entry,
                }
                m_adv = NULL;
                if (m->psind != 0 && addr + pagesizes[1] <= entry->end &&
-                   (addr & (pagesizes[1] - 1)) == 0 &&
-                   (pmap_mincore(map->pmap, addr, &pa) & MINCORE_SUPER) != 0) {
+                   (addr & (pagesizes[1] - 1)) == 0 && (incore =
+                   pmap_mincore(map->pmap, addr, &pa) & MINCORE_SUPER) != 0) {
                        *super = true;
-                       pi_adv = atop(pagesizes[1]);
+                       /*
+                        * The virtual page might be smaller than the physical
+                        * page, so we use the page size reported by the pmap
+                        * rather than m->psind.
+                        */
+                       pi_adv = atop(pagesizes[incore >> MINCORE_PSIND_SHIFT]);
                } else {
                        /*
                         * We do not test the found page on validity.
diff --git a/sys/kern/uipc_shm.c b/sys/kern/uipc_shm.c
index c83455d1e70b..7672ded459df 100644
--- a/sys/kern/uipc_shm.c
+++ b/sys/kern/uipc_shm.c
@@ -1589,9 +1589,20 @@ shm_mmap_large(struct shmfd *shmfd, vm_map_t map, vm_offset_t *addr,
        if (align == 0) {
                align = pagesizes[shmfd->shm_lp_psind];
        } else if (align == MAP_ALIGNED_SUPER) {
-               if (shmfd->shm_lp_psind != 1)
+               /*
+                * MAP_ALIGNED_SUPER is only supported on superpage sizes,
+                * i.e., [1, VM_NRESERVLEVEL].  shmfd->shm_lp_psind < 1 is
+                * handled above.
+                */
+               if (
+#if VM_NRESERVLEVEL > 0
+                   shmfd->shm_lp_psind > VM_NRESERVLEVEL
+#else
+                   shmfd->shm_lp_psind > 1
+#endif
+                   )
                        return (EINVAL);
-               align = pagesizes[1];
+               align = pagesizes[shmfd->shm_lp_psind];
        } else {
                align >>= MAP_ALIGNMENT_SHIFT;
                align = 1ULL << align;
diff --git a/sys/sys/mman.h b/sys/sys/mman.h
index cc840fb96d8e..8feba252e2fe 100644
--- a/sys/sys/mman.h
+++ b/sys/sys/mman.h
@@ -175,7 +175,9 @@
 #define        MINCORE_REFERENCED_OTHER 0x8 /* Page has been referenced */
 #define        MINCORE_MODIFIED_OTHER  0x10 /* Page has been modified */
 #define        MINCORE_SUPER           0x60 /* Page is a "super" page */
-#define        MINCORE_PSIND(i)        (((i) << 5) & MINCORE_SUPER) /* Page size */
+#define        MINCORE_PSIND_SHIFT     5
+#define        MINCORE_PSIND(i)        (((i) << MINCORE_PSIND_SHIFT) & MINCORE_SUPER)
+                                    /* Page size */
 
 /*
  * Anonymous object constant for shm_open().
diff --git a/sys/vm/vm_domainset.c b/sys/vm/vm_domainset.c
index b881466bffe5..f6ac3c984cbf 100644
--- a/sys/vm/vm_domainset.c
+++ b/sys/vm/vm_domainset.c
@@ -77,6 +77,9 @@ vm_domainset_iter_init(struct vm_domainset_iter *di, struct domainset *ds,
                         * reservation boundary.
                         */
                        pindex += obj->pg_color;
+#if VM_NRESERVLEVEL > 1
+                       pindex >>= VM_LEVEL_1_ORDER;
+#endif
                        pindex >>= VM_LEVEL_0_ORDER;
                } else
 #endif
diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c
index 3da411643f26..df686f3e46dc 100644
--- a/sys/vm/vm_fault.c
+++ b/sys/vm/vm_fault.c
@@ -380,12 +380,10 @@ vm_fault_soft_fast(struct faultstate *fs)
        psind = 0;
 #if VM_NRESERVLEVEL > 0
        if ((m->flags & PG_FICTITIOUS) == 0 &&
-           (m_super = vm_reserv_to_superpage(m)) != NULL &&
-           rounddown2(vaddr, pagesizes[m_super->psind]) >= fs->entry->start &&
-           roundup2(vaddr + 1, pagesizes[m_super->psind]) <= fs->entry->end &&
-           (vaddr & (pagesizes[m_super->psind] - 1)) == (VM_PAGE_TO_PHYS(m) &
-           (pagesizes[m_super->psind] - 1)) &&
-           pmap_ps_enabled(fs->map->pmap)) {
+           (m_super = vm_reserv_to_superpage(m)) != NULL) {
+               psind = m_super->psind;
+               KASSERT(psind > 0,
+                   ("psind %d of m_super %p < 1", psind, m_super));
                flags = PS_ALL_VALID;
                if ((fs->prot & VM_PROT_WRITE) != 0) {
                        /*
@@ -398,9 +396,23 @@ vm_fault_soft_fast(struct faultstate *fs)
                        if ((fs->first_object->flags & OBJ_UNMANAGED) == 0)
                                flags |= PS_ALL_DIRTY;
                }
-               if (vm_page_ps_test(m_super, flags, m)) {
+               while (rounddown2(vaddr, pagesizes[psind]) < fs->entry->start ||
+                   roundup2(vaddr + 1, pagesizes[psind]) > fs->entry->end ||
+                   (vaddr & (pagesizes[psind] - 1)) !=
+                   (VM_PAGE_TO_PHYS(m) & (pagesizes[psind] - 1)) ||
+                   !vm_page_ps_test(m_super, psind, flags, m) ||
+                   !pmap_ps_enabled(fs->map->pmap)) {
+                       psind--;
+                       if (psind == 0)
+                               break;
+                       m_super += rounddown2(m - m_super,
+                           atop(pagesizes[psind]));
+                       KASSERT(m_super->psind >= psind,
+                           ("psind %d of m_super %p < %d", m_super->psind,
+                           m_super, psind));
+               }
+               if (psind > 0) {
                        m_map = m_super;
-                       psind = m_super->psind;
                        vaddr = rounddown2(vaddr, pagesizes[psind]);
                        /* Preset the modified bit for dirty superpages. */
                        if ((flags & PS_ALL_DIRTY) != 0)
@@ -615,10 +627,10 @@ vm_fault_populate(struct faultstate *fs)
                vaddr = fs->entry->start + IDX_TO_OFF(pidx) - fs->entry->offset;
 
                psind = m->psind;
-               if (psind > 0 && ((vaddr & (pagesizes[psind] - 1)) != 0 ||
+               while (psind > 0 && ((vaddr & (pagesizes[psind] - 1)) != 0 ||
                    pidx + OFF_TO_IDX(pagesizes[psind]) - 1 > pager_last ||
                    !pmap_ps_enabled(fs->map->pmap)))
-                       psind = 0;
+                       psind--;
 
                npages = atop(pagesizes[psind]);
                for (i = 0; i < npages; i++) {
diff --git a/sys/vm/vm_glue.c b/sys/vm/vm_glue.c
index 641800413e68..4f8121fa1064 100644
--- a/sys/vm/vm_glue.c
+++ b/sys/vm/vm_glue.c
@@ -106,7 +106,10 @@
 
 #include <machine/cpu.h>
 
-#if VM_NRESERVLEVEL > 0
+#if VM_NRESERVLEVEL > 1
+#define KVA_KSTACK_QUANTUM_SHIFT (VM_LEVEL_1_ORDER + VM_LEVEL_0_ORDER + \
+    PAGE_SHIFT)
+#elif VM_NRESERVLEVEL > 0
 #define KVA_KSTACK_QUANTUM_SHIFT (VM_LEVEL_0_ORDER + PAGE_SHIFT)
 #else
 #define KVA_KSTACK_QUANTUM_SHIFT (8 + PAGE_SHIFT)
diff --git a/sys/vm/vm_kern.c b/sys/vm/vm_kern.c
index 1ef3154845b3..a04044463fe2 100644
--- a/sys/vm/vm_kern.c
+++ b/sys/vm/vm_kern.c
@@ -120,7 +120,10 @@ SYSCTL_ULONG(_vm, OID_AUTO, max_kernel_address, CTLFLAG_RD,
 #endif
     "Max kernel address");
 
-#if VM_NRESERVLEVEL > 0
+#if VM_NRESERVLEVEL > 1
+#define        KVA_QUANTUM_SHIFT       (VM_LEVEL_1_ORDER + VM_LEVEL_0_ORDER + \
+    PAGE_SHIFT)
+#elif VM_NRESERVLEVEL > 0
 #define        KVA_QUANTUM_SHIFT       (VM_LEVEL_0_ORDER + PAGE_SHIFT)
 #else
 /* On non-superpage architectures we want large import sizes. */
diff --git a/sys/vm/vm_map.c b/sys/vm/vm_map.c
index fa71bb8a01d6..b9c27e14d1d0 100644
--- a/sys/vm/vm_map.c
+++ b/sys/vm/vm_map.c
@@ -1993,8 +1993,15 @@ out:
        return (result);
 }
 
+#if VM_NRESERVLEVEL == 1
 static const int aslr_pages_rnd_64[2] = {0x1000, 0x10};
 static const int aslr_pages_rnd_32[2] = {0x100, 0x4};
+#elif VM_NRESERVLEVEL == 2
+static const int aslr_pages_rnd_64[3] = {0x1000, 0x1000, 0x10};
+static const int aslr_pages_rnd_32[3] = {0x100, 0x100, 0x4};
+#else
+#error "Unsupported VM_NRESERVLEVEL"
+#endif
 
 static int cluster_anon = 1;
 SYSCTL_INT(_vm, OID_AUTO, cluster_anon, CTLFLAG_RW,
@@ -2190,9 +2197,23 @@ again:
                         * Find space for allocation, including
                         * gap needed for later randomization.
                         */
-                       pidx = MAXPAGESIZES > 1 && pagesizes[1] != 0 &&
-                           (find_space == VMFS_SUPER_SPACE || find_space ==
-                           VMFS_OPTIMAL_SPACE) ? 1 : 0;
+                       pidx = 0;
+#if VM_NRESERVLEVEL > 0
+                       if ((find_space == VMFS_SUPER_SPACE ||
+                           find_space == VMFS_OPTIMAL_SPACE) &&
+                           pagesizes[VM_NRESERVLEVEL] != 0) {
+                               /*
+                                * Do not pointlessly increase the space that
+                                * is requested from vm_map_findspace().
+                                * pmap_align_superpage() will only change a
+                                * mapping's alignment if that mapping is at
+                                * least a superpage in size.
+                                */
+                               pidx = VM_NRESERVLEVEL;
+                               while (pidx > 0 && length < pagesizes[pidx])
+                                       pidx--;
+                       }
+#endif
                        gap = vm_map_max(map) > MAP_32BIT_MAX_ADDR &&
                            (max_addr == 0 || max_addr > MAP_32BIT_MAX_ADDR) ?
                            aslr_pages_rnd_64[pidx] : aslr_pages_rnd_32[pidx];
@@ -2656,6 +2677,7 @@ vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot,
        vm_offset_t start;
        vm_page_t p, p_start;
        vm_pindex_t mask, psize, threshold, tmpidx;
+       int psind;
 
        if ((prot & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0 || object == NULL)
                return;
@@ -2710,13 +2732,17 @@ vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot,
                                p_start = p;
                        }
                        /* Jump ahead if a superpage mapping is possible. */
-                       if (p->psind > 0 && ((addr + ptoa(tmpidx)) &
-                           (pagesizes[p->psind] - 1)) == 0) {
-                               mask = atop(pagesizes[p->psind]) - 1;
-                               if (tmpidx + mask < psize &&
-                                   vm_page_ps_test(p, PS_ALL_VALID, NULL)) {
-                                       p += mask;
-                                       threshold += mask;
+                       for (psind = p->psind; psind > 0; psind--) {
+                               if (((addr + ptoa(tmpidx)) &
+                                   (pagesizes[psind] - 1)) == 0) {
+                                       mask = atop(pagesizes[psind]) - 1;
+                                       if (tmpidx + mask < psize &&
+                                           vm_page_ps_test(p, psind,
+                                           PS_ALL_VALID, NULL)) {
+                                               p += mask;
+                                               threshold += mask;
+                                               break;
+                                       }
                                }
                        }
                } else if (p_start != NULL) {
diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c
index e343170eff6a..c9ac79330696 100644
--- a/sys/vm/vm_page.c
+++ b/sys/vm/vm_page.c
@@ -5562,7 +5562,7 @@ vm_page_is_valid(vm_page_t m, int base, int size)
  * (super)page and false otherwise.
  */
 bool
-vm_page_ps_test(vm_page_t m, int flags, vm_page_t skip_m)
+vm_page_ps_test(vm_page_t m, int psind, int flags, vm_page_t skip_m)
 {
        vm_object_t object;
        int i, npages;
@@ -5571,7 +5571,9 @@ vm_page_ps_test(vm_page_t m, int flags, vm_page_t skip_m)
        if (skip_m != NULL && skip_m->object != object)
                return (false);
        VM_OBJECT_ASSERT_LOCKED(object);
-       npages = atop(pagesizes[m->psind]);
+       KASSERT(psind <= m->psind,
+           ("psind %d > psind %d of m %p", psind, m->psind, m));
+       npages = atop(pagesizes[psind]);
 
        /*
         * The physically contiguous pages that make up a superpage, i.e., a
diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h
index 5422f8df89a0..f419ba8e3d34 100644
--- a/sys/vm/vm_page.h
+++ b/sys/vm/vm_page.h
@@ -657,7 +657,7 @@ void vm_page_pqbatch_submit(vm_page_t m, uint8_t queue);
 bool vm_page_pqstate_commit(vm_page_t m, vm_page_astate_t *old,
     vm_page_astate_t new);
 vm_page_t vm_page_prev(vm_page_t m);
-bool vm_page_ps_test(vm_page_t m, int flags, vm_page_t skip_m);
+bool vm_page_ps_test(vm_page_t m, int psind, int flags, vm_page_t skip_m);
 void vm_page_putfake(vm_page_t m);
 void vm_page_readahead_finish(vm_page_t m);
 int vm_page_reclaim_contig(int req, u_long npages, vm_paddr_t low,
diff --git a/sys/vm/vm_reserv.c b/sys/vm/vm_reserv.c
index a4bbccdfb428..aa20eca129fb 100644
--- a/sys/vm/vm_reserv.c
+++ b/sys/vm/vm_reserv.c
@@ -77,6 +77,29 @@
 
 #if VM_NRESERVLEVEL > 0
 
+/*
+ * Temporarily simulate two-level reservations.  Effectively, VM_LEVEL_0_* is
+ * level 1, and VM_SUBLEVEL_0_* is level 0.
+ */
+#if VM_NRESERVLEVEL == 2
+#undef VM_NRESERVLEVEL
+#define        VM_NRESERVLEVEL         1
+#if VM_LEVEL_0_ORDER == 4
+#undef VM_LEVEL_0_ORDER
+#define        VM_LEVEL_0_ORDER        (4 + VM_LEVEL_1_ORDER)
+#define        VM_SUBLEVEL_0_NPAGES    (1 << 4)
+#elif VM_LEVEL_0_ORDER == 7
+#undef VM_LEVEL_0_ORDER
+#define        VM_LEVEL_0_ORDER        (7 + VM_LEVEL_1_ORDER)
+#define        VM_SUBLEVEL_0_NPAGES    (1 << 7)
+#else
+#error "Unsupported level 0 reservation size"
+#endif
+#define        VM_LEVEL_0_PSIND        2
+#else
+#define        VM_LEVEL_0_PSIND        1
+#endif
+
 #ifndef VM_LEVEL_0_ORDER_MAX
 #define        VM_LEVEL_0_ORDER_MAX    VM_LEVEL_0_ORDER
 #endif
@@ -381,6 +404,27 @@ vm_reserv_insert(vm_reserv_t rv, vm_object_t object, vm_pindex_t pindex)
        vm_reserv_object_unlock(object);
 }
 
+#ifdef VM_SUBLEVEL_0_NPAGES
+static inline bool
+vm_reserv_is_sublevel_full(vm_reserv_t rv, int index)
+{
+       _Static_assert(VM_SUBLEVEL_0_NPAGES == 16 ||
+           VM_SUBLEVEL_0_NPAGES == 128,
+           "vm_reserv_is_sublevel_full: unsupported VM_SUBLEVEL_0_NPAGES");
+       /* An equivalent bit_ntest() compiles to more instructions. */
+       switch (VM_SUBLEVEL_0_NPAGES) {
+       case 16:
+               return (((uint16_t *)rv->popmap)[index / 16] == UINT16_MAX);
+       case 128:
+               index = rounddown2(index, 128) / 64;
+               return (((uint64_t *)rv->popmap)[index] == UINT64_MAX &&
+                   ((uint64_t *)rv->popmap)[index + 1] == UINT64_MAX);
+       default:
+               __unreachable();
+       }
+}
+#endif
+
 /*
  * Reduces the given reservation's population count.  If the population count
  * becomes zero, the reservation is destroyed.  Additionally, moves the
@@ -406,11 +450,15 @@ vm_reserv_depopulate(vm_reserv_t rv, int index)
            ("vm_reserv_depopulate: reserv %p's domain is corrupted %d",
            rv, rv->domain));
        if (rv->popcnt == VM_LEVEL_0_NPAGES) {
-               KASSERT(rv->pages->psind == 1,
+               KASSERT(rv->pages->psind == VM_LEVEL_0_PSIND,
                    ("vm_reserv_depopulate: reserv %p is already demoted",
                    rv));
-               rv->pages->psind = 0;
+               rv->pages->psind = VM_LEVEL_0_PSIND - 1;
        }
+#ifdef VM_SUBLEVEL_0_NPAGES
+       if (vm_reserv_is_sublevel_full(rv, index))
+               rv->pages[rounddown2(index, VM_SUBLEVEL_0_NPAGES)].psind = 0;
+#endif
        bit_clear(rv->popmap, index);
        rv->popcnt--;
        if ((unsigned)(ticks - rv->lasttick) >= PARTPOPSLOP ||
@@ -522,12 +570,17 @@ vm_reserv_populate(vm_reserv_t rv, int index)
            index));
        KASSERT(rv->popcnt < VM_LEVEL_0_NPAGES,
            ("vm_reserv_populate: reserv %p is already full", rv));
-       KASSERT(rv->pages->psind == 0,
+       KASSERT(rv->pages->psind >= 0 &&
+           rv->pages->psind < VM_LEVEL_0_PSIND,
            ("vm_reserv_populate: reserv %p is already promoted", rv));
        KASSERT(rv->domain < vm_ndomains,
            ("vm_reserv_populate: reserv %p's domain is corrupted %d",
            rv, rv->domain));
        bit_set(rv->popmap, index);
+#ifdef VM_SUBLEVEL_0_NPAGES
+       if (vm_reserv_is_sublevel_full(rv, index))
+               rv->pages[rounddown2(index, VM_SUBLEVEL_0_NPAGES)].psind = 1;
+#endif
        rv->popcnt++;
        if ((unsigned)(ticks - rv->lasttick) < PARTPOPSLOP &&
            rv->inpartpopq && rv->popcnt != VM_LEVEL_0_NPAGES)
@@ -542,10 +595,10 @@ vm_reserv_populate(vm_reserv_t rv, int index)
                rv->inpartpopq = TRUE;
                TAILQ_INSERT_TAIL(&vm_rvd[rv->domain].partpop, rv, partpopq);
        } else {
-               KASSERT(rv->pages->psind == 0,
+               KASSERT(rv->pages->psind == VM_LEVEL_0_PSIND - 1,
                    ("vm_reserv_populate: reserv %p is already promoted",
                    rv));
-               rv->pages->psind = 1;
+               rv->pages->psind = VM_LEVEL_0_PSIND;
        }
        vm_reserv_domain_unlock(rv->domain);
 }
@@ -889,13 +942,18 @@ out:
 static void
 vm_reserv_break(vm_reserv_t rv)
 {
+       vm_page_t m;
        int hi, lo, pos;
 
        vm_reserv_assert_locked(rv);
        CTR5(KTR_VM, "%s: rv %p object %p popcnt %d inpartpop %d",
            __FUNCTION__, rv, rv->object, rv->popcnt, rv->inpartpopq);
        vm_reserv_remove(rv);
-       rv->pages->psind = 0;
+       m = rv->pages;
+#ifdef VM_SUBLEVEL_0_NPAGES
+       for (; m < rv->pages + VM_LEVEL_0_NPAGES; m += VM_SUBLEVEL_0_NPAGES)
+#endif
+               m->psind = 0;
        hi = lo = -1;
        pos = 0;
        for (;;) {
@@ -1089,7 +1147,11 @@ vm_reserv_level(vm_page_t m)
        vm_reserv_t rv;
 
        rv = vm_reserv_from_page(m);
+#ifdef VM_SUBLEVEL_0_NPAGES
+       return (rv->object != NULL ? 1 : -1);
+#else
        return (rv->object != NULL ? 0 : -1);
+#endif
 }
 
 /*
@@ -1102,7 +1164,15 @@ vm_reserv_level_iffullpop(vm_page_t m)
        vm_reserv_t rv;
 
        rv = vm_reserv_from_page(m);
-       return (rv->popcnt == VM_LEVEL_0_NPAGES ? 0 : -1);
+       if (rv->popcnt == VM_LEVEL_0_NPAGES) {
+#ifdef VM_SUBLEVEL_0_NPAGES
+               return (1);
+       } else if (rv->pages != NULL &&
+           vm_reserv_is_sublevel_full(rv, m - rv->pages)) {
+#endif
+               return (0);
+       }
+       return (-1);
 }
 
 /*
@@ -1357,6 +1427,10 @@ vm_reserv_size(int level)
 
        switch (level) {
        case 0:
+#ifdef VM_SUBLEVEL_0_NPAGES
+               return (VM_SUBLEVEL_0_NPAGES * PAGE_SIZE);
+       case 1:
+#endif
                return (VM_LEVEL_0_SIZE);
        case -1:
                return (PAGE_SIZE);
@@ -1432,12 +1506,16 @@ vm_reserv_to_superpage(vm_page_t m)
 
        VM_OBJECT_ASSERT_LOCKED(m->object);
        rv = vm_reserv_from_page(m);
-       if (rv->object == m->object && rv->popcnt == VM_LEVEL_0_NPAGES)
-               m = rv->pages;
-       else
-               m = NULL;
-
-       return (m);
+       if (rv->object == m->object) {
+               if (rv->popcnt == VM_LEVEL_0_NPAGES)
+                       return (rv->pages);
+#ifdef VM_SUBLEVEL_0_NPAGES
+               if (vm_reserv_is_sublevel_full(rv, m - rv->pages))
+                       return (rv->pages + rounddown2(m - rv->pages,
+                           VM_SUBLEVEL_0_NPAGES));
+#endif
+       }
+       return (NULL);
 }
 
 #endif /* VM_NRESERVLEVEL > 0 */
