The existing pagetable walker has complicated return semantics which squeeze multiple pieces of information into a single integer. This would be fine if the information didn't overlap, but it does.
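As a compact illustration of that overlap, here is a standalone sketch.  The flag values are the ones quoted in the page.h/x86_64/page.h hunks below; the 3-level _PAGE_INVALID_BITS mask is approximated with _PAGE_AVAIL_HIGH rather than derived via the real get_pte_flags() computation, so treat it as illustrative only.

/*
 * Sketch of the aliasing between the old walker's "invalid bits" and the
 * synthetic paged/shared markers (values from the hunks below).
 */
#include <stdint.h>
#include <stdio.h>

#define _PAGE_PAGED        0x2000U          /* synthetic: frame is paged out */
#define _PAGE_SHARED       0x4000U          /* synthetic: frame is shared */
#define _PAGE_INVALID_BIT  (1U << 24)
#define _PAGE_AVAIL_HIGH   (0x7ffU << 12)   /* upper software-available bits */
#define INVALID_BITS_3LVL  (_PAGE_INVALID_BIT | _PAGE_AVAIL_HIGH)

int main(void)
{
    /* Old-style rc after the walker spots guest-controlled software bits. */
    uint32_t rc = INVALID_BITS_3LVL;

    /* Caller-side decoding, in the style of the old hap_p2m_ga_to_gfn(). */
    if ( rc & _PAGE_PAGED )
        printf("rc misread: frame looks paged\n");
    if ( rc & _PAGE_SHARED )
        printf("rc misread: frame looks shared\n");

    return 0;
}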
Specifically, _PAGE_INVALID_BITS for 3-level guests alias _PAGE_PAGED and
_PAGE_SHARED.  A guest which constructs a PTE with bits 52 or 53 set (the
start of the upper software-available range) will create a virtual address
which, when walked by Xen, tricks Xen into believing the frame is paged or
shared.  This behaviour was introduced by XSA-173 (c/s 8b17648).

It is also complicated to turn rc back into a normal pagefault error code.
Instead, change the calling semantics to return a boolean indicating success,
and have the function accumulate a real pagefault error code as it goes
(including synthetic error codes, which do not alias hardware ones).  This
requires an equivalent adjustment to map_domain_gfn().

Issues fixed:
 * 2-level PSE36 superpages now return the correct translation.
 * 2-level L2 superpages without CR0.PSE now return the correct translation.
 * SMEP now inhibits a user instruction fetch even if NX isn't active.
 * Supervisor writes without CR0.WP now set the leaf dirty bit.
 * L4e._PAGE_GLOBAL is strictly reserved on AMD.
 * 3-level l3 entries have all reserved bits checked.
 * 3-level entries can no longer alias Xen's idea of paged or shared.

Signed-off-by: Andrew Cooper <andrew.coop...@citrix.com>
---
CC: Jan Beulich <jbeul...@suse.com>
CC: Tim Deegan <t...@xen.org>
CC: George Dunlap <george.dun...@eu.citrix.com>
---
 xen/arch/x86/mm/guest_walk.c      | 448 +++++++++++++++++++-------------------
 xen/arch/x86/mm/hap/guest_walk.c  |  21 +-
 xen/arch/x86/mm/hap/nested_ept.c  |   2 +-
 xen/arch/x86/mm/p2m.c             |  17 +-
 xen/arch/x86/mm/shadow/multi.c    |  27 +--
 xen/include/asm-x86/guest_pt.h    |  63 ++++--
 xen/include/asm-x86/p2m.h         |   2 +-
 xen/include/asm-x86/page.h        |   3 -
 xen/include/asm-x86/processor.h   |   2 +
 xen/include/asm-x86/x86_64/page.h |   6 -
 10 files changed, 303 insertions(+), 288 deletions(-)

diff --git a/xen/arch/x86/mm/guest_walk.c b/xen/arch/x86/mm/guest_walk.c
index 4f8d0e3..01bcb1d 100644
--- a/xen/arch/x86/mm/guest_walk.c
+++ b/xen/arch/x86/mm/guest_walk.c
@@ -32,44 +32,6 @@ asm(".file \"" __OBJECT_FILE__ "\"");
 #include <asm/page.h>
 #include <asm/guest_pt.h>
-extern const uint32_t gw_page_flags[];
-#if GUEST_PAGING_LEVELS == CONFIG_PAGING_LEVELS
-const uint32_t gw_page_flags[] = {
-    /* I/F - Usr Wr */
-    /* 0 0 0 0 */ _PAGE_PRESENT,
-    /* 0 0 0 1 */ _PAGE_PRESENT|_PAGE_RW,
-    /* 0 0 1 0 */ _PAGE_PRESENT|_PAGE_USER,
-    /* 0 0 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER,
-    /* 0 1 0 0 */ _PAGE_PRESENT,
-    /* 0 1 0 1 */ _PAGE_PRESENT|_PAGE_RW,
-    /* 0 1 1 0 */ _PAGE_PRESENT|_PAGE_USER,
-    /* 0 1 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER,
-    /* 1 0 0 0 */ _PAGE_PRESENT|_PAGE_NX_BIT,
-    /* 1 0 0 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_NX_BIT,
-    /* 1 0 1 0 */ _PAGE_PRESENT|_PAGE_USER|_PAGE_NX_BIT,
-    /* 1 0 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT,
-    /* 1 1 0 0 */ _PAGE_PRESENT|_PAGE_NX_BIT,
-    /* 1 1 0 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_NX_BIT,
-    /* 1 1 1 0 */ _PAGE_PRESENT|_PAGE_USER|_PAGE_NX_BIT,
-    /* 1 1 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT,
-};
-#endif
-
-/* Flags that are needed in a pagetable entry, with the sense of NX inverted */
-static uint32_t mandatory_flags(struct vcpu *v, uint32_t pfec)
-{
-    /* Don't demand not-NX if the CPU wouldn't enforce it. */
-    if ( !guest_supports_nx(v) )
-        pfec &= ~PFEC_insn_fetch;
-
-    /* Don't demand R/W if the CPU wouldn't enforce it.
*/ - if ( is_hvm_vcpu(v) && unlikely(!hvm_wp_enabled(v)) - && !(pfec & PFEC_user_mode) ) - pfec &= ~PFEC_write_access; - - return gw_page_flags[(pfec & 0x1f) >> 1] | _PAGE_INVALID_BITS; -} - /* Modify a guest pagetable entry to set the Accessed and Dirty bits. * Returns non-zero if it actually writes to guest memory. */ static uint32_t set_ad_bits(void *guest_p, void *walk_p, int set_dirty) @@ -90,62 +52,33 @@ static uint32_t set_ad_bits(void *guest_p, void *walk_p, int set_dirty) return 0; } -#if GUEST_PAGING_LEVELS >= 4 -static bool_t pkey_fault(struct vcpu *vcpu, uint32_t pfec, - uint32_t pte_flags, uint32_t pte_pkey) -{ - uint32_t pkru; - - /* When page isn't present, PKEY isn't checked. */ - if ( !(pfec & PFEC_page_present) || is_pv_vcpu(vcpu) ) - return 0; - - /* - * PKU: additional mechanism by which the paging controls - * access to user-mode addresses based on the value in the - * PKRU register. A fault is considered as a PKU violation if all - * of the following conditions are true: - * 1.CR4_PKE=1. - * 2.EFER_LMA=1. - * 3.Page is present with no reserved bit violations. - * 4.The access is not an instruction fetch. - * 5.The access is to a user page. - * 6.PKRU.AD=1 or - * the access is a data write and PKRU.WD=1 and - * either CR0.WP=1 or it is a user access. - */ - if ( !hvm_pku_enabled(vcpu) || - !hvm_long_mode_enabled(vcpu) || - (pfec & PFEC_reserved_bit) || - (pfec & PFEC_insn_fetch) || - !(pte_flags & _PAGE_USER) ) - return 0; - - pkru = read_pkru(); - if ( unlikely(pkru) ) - { - bool_t pkru_ad = read_pkru_ad(pkru, pte_pkey); - bool_t pkru_wd = read_pkru_wd(pkru, pte_pkey); - - /* Condition 6 */ - if ( pkru_ad || - (pkru_wd && (pfec & PFEC_write_access) && - (hvm_wp_enabled(vcpu) || (pfec & PFEC_user_mode))) ) - return 1; - } - - return 0; -} -#endif - -/* Walk the guest pagetables, after the manner of a hardware walker. */ -/* Because the walk is essentially random, it can cause a deadlock - * warning in the p2m locking code. Highly unlikely this is an actual - * deadlock, because who would walk page table in the opposite order? */ -uint32_t +/* + * Walk the guest pagetables, after the manner of a hardware walker. + * + * This is a condensing of the 'Paging' chapters from Intel and AMD software + * manuals. Please refer closely to them. + * + * A pagetable walk consists of two parts: + * 1) to find whether a translation exists, and + * 2) if a translation does exist, to check whether the translation's access + * rights permit the access. + * + * A translation is found by following the pagetable structure (starting at + * %cr3) to a leaf entry (an L1 PTE, or a higher level entry with PSE set) + * which identifies the physical destination of the access. + * + * A translation from one level to the next exists if the PTE is both present + * and has no reserved bits set. If the pagewalk counters a situation where a + * translation does not exist, the walk stops at that point. + * + * The access rights (NX, User, RW bits) are collected as the walk progresses. + * If a translation exists, the accumulated access rights are compared to the + * requested walk, to see whether the access is permitted. 
+ */ +bool guest_walk_tables(struct vcpu *v, struct p2m_domain *p2m, unsigned long va, walk_t *gw, - uint32_t pfec, mfn_t top_mfn, void *top_map) + uint32_t walk, mfn_t top_mfn, void *top_map) { struct domain *d = v->domain; p2m_type_t p2mt; @@ -155,64 +88,44 @@ guest_walk_tables(struct vcpu *v, struct p2m_domain *p2m, guest_l3e_t *l3p = NULL; guest_l4e_t *l4p; #endif - unsigned int pkey; - uint32_t gflags, mflags, iflags, rc = 0; - bool_t smep = 0, smap = 0; + uint32_t gflags, rc; bool_t pse1G = 0, pse2M = 0; p2m_query_t qt = P2M_ALLOC | P2M_UNSHARE; - /* Only implicit supervisor data accesses exist. */ - ASSERT(!(pfec & PFEC_implicit) || - !(pfec & (PFEC_insn_fetch|PFEC_user_mode))); +#define AR_ACCUM_AND (_PAGE_USER | _PAGE_RW) +#define AR_ACCUM_OR (_PAGE_NX_BIT) + /* Start with all AND bits set, all OR bits clear. */ + uint32_t ar, ar_and = ~0u, ar_or = 0; - perfc_incr(guest_walk); - memset(gw, 0, sizeof(*gw)); - gw->va = va; - - /* Mandatory bits that must be set in every entry. We invert NX and - * the invalid bits, to calculate as if there were an "X" bit that - * allowed access. We will accumulate, in rc, the set of flags that - * are missing/unwanted. */ - mflags = mandatory_flags(v, pfec); - iflags = (_PAGE_NX_BIT | _PAGE_INVALID_BITS); + bool walk_ok = false; - if ( is_hvm_domain(d) && !(pfec & PFEC_user_mode) ) - { - const struct cpu_user_regs *regs = guest_cpu_user_regs(); + /* + * TODO - We should ASSERT() that only the following bits are set as + * inputs to a guest walk, but a whole load of code currently passes in + * other PFEC_ constants. + */ + walk &= (PFEC_implicit | PFEC_insn_fetch | PFEC_user_mode | PFEC_write_access); - /* SMEP: kernel-mode instruction fetches from user-mode mappings - * should fault. Unlike NX or invalid bits, we're looking for _all_ - * entries in the walk to have _PAGE_USER set, so we need to do the - * whole walk as if it were a user-mode one and then invert the answer. */ - smep = hvm_smep_enabled(v) && (pfec & PFEC_insn_fetch); + /* Only implicit supervisor data accesses exist. */ + ASSERT(!(walk & PFEC_implicit) || + !(walk & (PFEC_insn_fetch|PFEC_user_mode))); - switch ( v->arch.smap_check_policy ) - { - case SMAP_CHECK_HONOR_CPL_AC: - /* - * SMAP: kernel-mode data accesses from user-mode mappings - * should fault. - * A fault is considered as a SMAP violation if the following - * conditions come true: - * - X86_CR4_SMAP is set in CR4 - * - A user page is accessed - * - CPL = 3 or X86_EFLAGS_AC is clear - * - Page fault in kernel mode - */ - smap = hvm_smap_enabled(v) && - ((hvm_get_cpl(v) == 3) || !(regs->_eflags & X86_EFLAGS_AC)); - break; - case SMAP_CHECK_ENABLED: - smap = hvm_smap_enabled(v); - break; - default: - ASSERT(v->arch.smap_check_policy == SMAP_CHECK_DISABLED); - break; - } - } + /* + * PFEC_insn_fetch is only used as an input to pagetable walking if NX or + * SMEP are enabled. Otherwise, instruction fetches are indistinguishable + * from data reads. + * + * This property can be demonstrated on real hardware by having NX and + * SMEP inactive, but SMAP active, and observing that EFLAGS.AC determines + * whether a pagefault occures for supervisor execution on user mappings. + */ + if ( !(guest_supports_nx(v) || guest_smep_enabled(v)) ) + walk &= ~PFEC_insn_fetch; - if ( smep || smap ) - mflags |= _PAGE_USER; + perfc_incr(guest_walk); + memset(gw, 0, sizeof(*gw)); + gw->va = va; + gw->pfec = walk & (PFEC_insn_fetch | PFEC_user_mode | PFEC_write_access); #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... 
*/ #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */ @@ -221,17 +134,20 @@ guest_walk_tables(struct vcpu *v, struct p2m_domain *p2m, gw->l4mfn = top_mfn; l4p = (guest_l4e_t *) top_map; gw->l4e = l4p[guest_l4_table_offset(va)]; - gflags = guest_l4e_get_flags(gw->l4e) ^ iflags; - if ( !(gflags & _PAGE_PRESENT) ) { - rc |= _PAGE_PRESENT; + gflags = guest_l4e_get_flags(gw->l4e); + if ( !(gflags & _PAGE_PRESENT) ) goto out; - } - if ( gflags & _PAGE_PSE ) + + /* Check for reserved bits. */ + if ( guest_l4e_rsvd_bits(v, gw->l4e) ) { - rc |= _PAGE_PSE | _PAGE_INVALID_BIT; + gw->pfec |= PFEC_reserved_bit | PFEC_page_present; goto out; } - rc |= ((gflags & mflags) ^ mflags); + + /* Accumulate l4e access rights. */ + ar_and &= gflags; + ar_or |= gflags; /* Map the l3 table */ l3p = map_domain_gfn(p2m, @@ -241,17 +157,28 @@ guest_walk_tables(struct vcpu *v, struct p2m_domain *p2m, qt, &rc); if(l3p == NULL) + { + gw->pfec |= rc & PFEC_synth_mask; goto out; + } + /* Get the l3e and check its flags*/ gw->l3e = l3p[guest_l3_table_offset(va)]; - pkey = guest_l3e_get_pkey(gw->l3e); - gflags = guest_l3e_get_flags(gw->l3e) ^ iflags; - if ( !(gflags & _PAGE_PRESENT) ) { - rc |= _PAGE_PRESENT; + gflags = guest_l3e_get_flags(gw->l3e); + if ( !(gflags & _PAGE_PRESENT) ) + goto out; + + /* Check for reserved bits, including possibly _PAGE_PSE. */ + if ( guest_l3e_rsvd_bits(v, gw->l3e) ) + { + gw->pfec |= PFEC_reserved_bit | PFEC_page_present; goto out; } - rc |= ((gflags & mflags) ^ mflags); + /* Accumulate l3e access rights. */ + ar_and &= gflags; + ar_or |= gflags; + pse1G = !!(gflags & _PAGE_PSE); if ( pse1G ) @@ -272,26 +199,25 @@ guest_walk_tables(struct vcpu *v, struct p2m_domain *p2m, /* _PAGE_PSE_PAT not set: remove _PAGE_PAT from flags. */ flags &= ~_PAGE_PAT; - if ( !guest_supports_1G_superpages(v) ) - rc |= _PAGE_PSE | _PAGE_INVALID_BIT; - if ( gfn_x(start) & GUEST_L3_GFN_MASK & ~0x1 ) - rc |= _PAGE_INVALID_BITS; - /* Increment the pfn by the right number of 4k pages. */ start = _gfn((gfn_x(start) & ~GUEST_L3_GFN_MASK) + ((va >> PAGE_SHIFT) & GUEST_L3_GFN_MASK)); gw->l1e = guest_l1e_from_gfn(start, flags); gw->l2mfn = gw->l1mfn = INVALID_MFN; - goto set_ad; + goto leaf; } #else /* PAE only... */ /* Get the l3e and check its flag */ gw->l3e = ((guest_l3e_t *) top_map)[guest_l3_table_offset(va)]; - if ( !(guest_l3e_get_flags(gw->l3e) & _PAGE_PRESENT) ) + gflags = guest_l3e_get_flags(gw->l3e); + if ( !(gflags & _PAGE_PRESENT) ) + goto out; + + if ( guest_l3e_rsvd_bits(v, gw->l3e) ) { - rc |= _PAGE_PRESENT; + gw->pfec |= PFEC_reserved_bit | PFEC_page_present; goto out; } @@ -305,7 +231,11 @@ guest_walk_tables(struct vcpu *v, struct p2m_domain *p2m, qt, &rc); if(l2p == NULL) + { + gw->pfec |= rc & PFEC_synth_mask; goto out; + } + /* Get the l2e */ gw->l2e = l2p[guest_l2_table_offset(va)]; @@ -318,15 +248,32 @@ guest_walk_tables(struct vcpu *v, struct p2m_domain *p2m, #endif /* All levels... */ - pkey = guest_l2e_get_pkey(gw->l2e); - gflags = guest_l2e_get_flags(gw->l2e) ^ iflags; - if ( !(gflags & _PAGE_PRESENT) ) { - rc |= _PAGE_PRESENT; + /* Check the l2e flags. */ + gflags = guest_l2e_get_flags(gw->l2e); + if ( !(gflags & _PAGE_PRESENT) ) + goto out; + + /* + * In 2-level paging without CR0.PSE, there are no reserved bits, and the + * PAT/PSE bit is ignored. + */ + if ( GUEST_PAGING_LEVELS == 2 && !guest_supports_superpages(v) ) + { + gw->l2e.l2 &= ~_PAGE_PSE; + gflags &= ~_PAGE_PSE; + } + /* else check for reserved bits, including possibly _PAGE_PSE. 
*/ + else if ( guest_l2e_rsvd_bits(v, gw->l2e) ) + { + gw->pfec |= PFEC_reserved_bit | PFEC_page_present; goto out; } - rc |= ((gflags & mflags) ^ mflags); - pse2M = (gflags & _PAGE_PSE) && guest_supports_superpages(v); + /* Accumulate l2e access rights. */ + ar_and &= gflags; + ar_or |= gflags; + + pse2M = !!(gflags & _PAGE_PSE); if ( pse2M ) { @@ -334,7 +281,11 @@ guest_walk_tables(struct vcpu *v, struct p2m_domain *p2m, * no guest l1e. We make one up so that the propagation code * can generate a shadow l1 table. Start with the gfn of the * first 4k-page of the superpage. */ +#if GUEST_PAGING_LEVELS == 2 + gfn_t start = _gfn(unfold_pse36(gw->l2e.l2) >> PAGE_SHIFT); +#else gfn_t start = guest_l2e_get_gfn(gw->l2e); +#endif /* Grant full access in the l1e, since all the guest entry's * access controls are enforced in the shadow l2e. */ int flags = (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW| @@ -348,70 +299,136 @@ guest_walk_tables(struct vcpu *v, struct p2m_domain *p2m, /* _PAGE_PSE_PAT not set: remove _PAGE_PAT from flags. */ flags &= ~_PAGE_PAT; - if ( gfn_x(start) & GUEST_L2_GFN_MASK & ~0x1 ) - rc |= _PAGE_INVALID_BITS; - - /* Increment the pfn by the right number of 4k pages. - * Mask out PAT and invalid bits. */ + /* Increment the pfn by the right number of 4k pages. */ start = _gfn((gfn_x(start) & ~GUEST_L2_GFN_MASK) + guest_l1_table_offset(va)); +#if GUEST_PAGING_LEVELS == 2 + /* Wider than 32 bits if PSE36 superpage. */ + gw->el1e = (gfn_x(start) << PAGE_SHIFT) | flags; +#else gw->l1e = guest_l1e_from_gfn(start, flags); +#endif gw->l1mfn = INVALID_MFN; - } - else + goto leaf; + } + + /* Map the l1 table */ + l1p = map_domain_gfn(p2m, + guest_l2e_get_gfn(gw->l2e), + &gw->l1mfn, + &p2mt, + qt, + &rc); + if ( l1p == NULL ) { - /* Not a superpage: carry on and find the l1e. */ - l1p = map_domain_gfn(p2m, - guest_l2e_get_gfn(gw->l2e), - &gw->l1mfn, - &p2mt, - qt, - &rc); - if(l1p == NULL) + gw->pfec |= rc & PFEC_synth_mask; + goto out; + } + gw->l1e = l1p[guest_l1_table_offset(va)]; + gflags = guest_l1e_get_flags(gw->l1e); + if ( !(gflags & _PAGE_PRESENT) ) + goto out; + + /* Check for reserved bits. */ + if ( guest_l1e_rsvd_bits(v, gw->l1e) ) + { + gw->pfec |= PFEC_reserved_bit | PFEC_page_present; + goto out; + } + + /* Accumulate l1e access rights. */ + ar_and &= gflags; + ar_or |= gflags; + + leaf: + gw->pfec |= PFEC_page_present; + + /* + * The pagetable walk has returned a successful translation. Now check + * access rights to see whether the access should succeed. + */ + ar = (ar_and & AR_ACCUM_AND) | (ar_or & AR_ACCUM_OR); + + if ( (walk & PFEC_insn_fetch) && (ar & _PAGE_NX_BIT) ) + /* Requested an instruction fetch and found NX? Fail. */ + goto out; + + if ( walk & PFEC_user_mode ) /* Requested a user acess. */ + { + if ( !(ar & _PAGE_USER) ) + /* Got a supervisor walk? Unconditional fail. */ goto out; - gw->l1e = l1p[guest_l1_table_offset(va)]; - pkey = guest_l1e_get_pkey(gw->l1e); - gflags = guest_l1e_get_flags(gw->l1e) ^ iflags; - if ( !(gflags & _PAGE_PRESENT) ) { - rc |= _PAGE_PRESENT; + + if ( (walk & PFEC_write_access) && !(ar & _PAGE_RW) ) + /* Requested a write and only got a read? Fail. */ goto out; + } + else /* Requested a supervisor access. */ + { + if ( ar & _PAGE_USER ) /* Got a user walk. */ + { + if ( (walk & PFEC_insn_fetch) && guest_smep_enabled(v) ) + /* User insn fetch and smep? Fail. 
*/ + goto out; + + if ( !(walk & PFEC_insn_fetch) && guest_smap_enabled(v) && + ((walk & PFEC_implicit) || + !(guest_cpu_user_regs()->_eflags & X86_EFLAGS_AC)) ) + /* User data access and smap? Fail. */ + goto out; } - rc |= ((gflags & mflags) ^ mflags); + + if ( (walk & PFEC_write_access) && !(ar & _PAGE_RW) && + guest_wp_enabled(v) ) + /* Requested a write, got a read, and CR0.WP is set? Fail. */ + goto out; } #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */ -set_ad: - if ( pkey_fault(v, pfec, gflags, pkey) ) - rc |= _PAGE_PKEY_BITS; + /* + * If all access checks are thusfar ok, check Protection Key for 64bit + * user data accesses. + * + * N.B. In the case that the walk ended with a superpage, the fabricated + * gw->l1e contains the appropriate leaf pkey. + */ + if ( (walk & PFEC_user_mode) && !(walk & PFEC_insn_fetch) && guest_pku_enabled(v) ) + { + unsigned int pkey = guest_l1e_get_pkey(gw->l1e); + unsigned int pkru = read_pkru(); + + if ( read_pkru_ad(pkru, pkey) || + ((ar & PFEC_write_access) && read_pkru_wd(pkru, pkey)) ) + { + gw->pfec |= PFEC_prot_key; + goto out; + } + } #endif - /* Now re-invert the user-mode requirement for SMEP and SMAP */ - if ( smep || smap ) - rc ^= _PAGE_USER; + + walk_ok = true; /* Go back and set accessed and dirty bits only if the walk was a * success. Although the PRMs say higher-level _PAGE_ACCESSED bits * get set whenever a lower-level PT is used, at least some hardware * walkers behave this way. */ - if ( rc == 0 ) - { #if GUEST_PAGING_LEVELS == 4 /* 64-bit only... */ - if ( set_ad_bits(l4p + guest_l4_table_offset(va), &gw->l4e, 0) ) - paging_mark_dirty(d, gw->l4mfn); - if ( set_ad_bits(l3p + guest_l3_table_offset(va), &gw->l3e, - (pse1G && (pfec & PFEC_write_access))) ) - paging_mark_dirty(d, gw->l3mfn); + if ( set_ad_bits(l4p + guest_l4_table_offset(va), &gw->l4e, 0) ) + paging_mark_dirty(d, gw->l4mfn); + if ( set_ad_bits(l3p + guest_l3_table_offset(va), &gw->l3e, + (pse1G && (walk & PFEC_write_access))) ) + paging_mark_dirty(d, gw->l3mfn); #endif - if ( !pse1G ) + if ( !pse1G ) + { + if ( set_ad_bits(l2p + guest_l2_table_offset(va), &gw->l2e, + (pse2M && (walk & PFEC_write_access))) ) + paging_mark_dirty(d, gw->l2mfn); + if ( !pse2M ) { - if ( set_ad_bits(l2p + guest_l2_table_offset(va), &gw->l2e, - (pse2M && (pfec & PFEC_write_access))) ) - paging_mark_dirty(d, gw->l2mfn); - if ( !pse2M ) - { - if ( set_ad_bits(l1p + guest_l1_table_offset(va), &gw->l1e, - (pfec & PFEC_write_access)) ) - paging_mark_dirty(d, gw->l1mfn); - } + if ( set_ad_bits(l1p + guest_l1_table_offset(va), &gw->l1e, + (walk & PFEC_write_access)) ) + paging_mark_dirty(d, gw->l1mfn); } } @@ -436,10 +453,5 @@ guest_walk_tables(struct vcpu *v, struct p2m_domain *p2m, put_page(mfn_to_page(mfn_x(gw->l1mfn))); } - /* If this guest has a restricted physical address space then the - * target GFN must fit within it. 
*/ - if ( !(rc & _PAGE_PRESENT) && !gfn_valid(d, guest_l1e_get_gfn(gw->l1e)) ) - rc |= _PAGE_INVALID_BITS; - - return rc; + return walk_ok; } diff --git a/xen/arch/x86/mm/hap/guest_walk.c b/xen/arch/x86/mm/hap/guest_walk.c index 313f82f..97b8ac5 100644 --- a/xen/arch/x86/mm/hap/guest_walk.c +++ b/xen/arch/x86/mm/hap/guest_walk.c @@ -50,7 +50,7 @@ unsigned long hap_p2m_ga_to_gfn(GUEST_PAGING_LEVELS)( struct vcpu *v, struct p2m_domain *p2m, unsigned long cr3, paddr_t ga, uint32_t *pfec, unsigned int *page_order) { - uint32_t missing; + bool walk_ok; mfn_t top_mfn; void *top_map; p2m_type_t p2mt; @@ -91,12 +91,12 @@ unsigned long hap_p2m_ga_to_gfn(GUEST_PAGING_LEVELS)( #if GUEST_PAGING_LEVELS == 3 top_map += (cr3 & ~(PAGE_MASK | 31)); #endif - missing = guest_walk_tables(v, p2m, ga, &gw, pfec[0], top_mfn, top_map); + walk_ok = guest_walk_tables(v, p2m, ga, &gw, pfec[0], top_mfn, top_map); unmap_domain_page(top_map); put_page(top_page); /* Interpret the answer */ - if ( missing == 0 ) + if ( walk_ok ) { gfn_t gfn = guest_walk_to_gfn(&gw); struct page_info *page; @@ -123,20 +123,7 @@ unsigned long hap_p2m_ga_to_gfn(GUEST_PAGING_LEVELS)( return gfn_x(gfn); } - if ( missing & _PAGE_PRESENT ) - pfec[0] &= ~PFEC_page_present; - - if ( missing & _PAGE_INVALID_BITS ) - pfec[0] |= PFEC_reserved_bit; - - if ( missing & _PAGE_PKEY_BITS ) - pfec[0] |= PFEC_prot_key; - - if ( missing & _PAGE_PAGED ) - pfec[0] = PFEC_page_paged; - - if ( missing & _PAGE_SHARED ) - pfec[0] = PFEC_page_shared; + pfec[0] = gw.pfec; out_tweak_pfec: /* diff --git a/xen/arch/x86/mm/hap/nested_ept.c b/xen/arch/x86/mm/hap/nested_ept.c index 02b27b1..14b1bb0 100644 --- a/xen/arch/x86/mm/hap/nested_ept.c +++ b/xen/arch/x86/mm/hap/nested_ept.c @@ -208,7 +208,7 @@ nept_walk_tables(struct vcpu *v, unsigned long l2ga, ept_walk_t *gw) goto out; map_err: - if ( rc == _PAGE_PAGED ) + if ( rc == PFEC_page_paged ) { ret = EPT_TRANSLATE_RETRY; goto out; diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c index 2eee9cd..b51421b 100644 --- a/xen/arch/x86/mm/p2m.c +++ b/xen/arch/x86/mm/p2m.c @@ -1774,17 +1774,18 @@ unsigned long paging_gva_to_gfn(struct vcpu *v, } /* - * If the map is non-NULL, we leave this function having - * acquired an extra ref on mfn_to_page(*mfn). + * If the map is non-NULL, we leave this function having acquired an extra ref + * on mfn_to_page(*mfn). In all cases, *pfec contains appropriate + * synthetic/structure PFEC_* bits. 
*/ void *map_domain_gfn(struct p2m_domain *p2m, gfn_t gfn, mfn_t *mfn, - p2m_type_t *p2mt, p2m_query_t q, uint32_t *rc) + p2m_type_t *p2mt, p2m_query_t q, uint32_t *pfec) { struct page_info *page; if ( !gfn_valid(p2m->domain, gfn) ) { - *rc = _PAGE_INVALID_BIT; + *pfec = PFEC_reserved_bit | PFEC_page_present; return NULL; } @@ -1796,21 +1797,23 @@ void *map_domain_gfn(struct p2m_domain *p2m, gfn_t gfn, mfn_t *mfn, if ( page ) put_page(page); p2m_mem_paging_populate(p2m->domain, gfn_x(gfn)); - *rc = _PAGE_PAGED; + *pfec = PFEC_page_paged; return NULL; } if ( p2m_is_shared(*p2mt) ) { if ( page ) put_page(page); - *rc = _PAGE_SHARED; + *pfec = PFEC_page_shared; return NULL; } if ( !page ) { - *rc |= _PAGE_PRESENT; + *pfec = 0; return NULL; } + + *pfec = PFEC_page_present; *mfn = page_to_mfn(page); ASSERT(mfn_valid(*mfn)); diff --git a/xen/arch/x86/mm/shadow/multi.c b/xen/arch/x86/mm/shadow/multi.c index f73b553..62c1fb5 100644 --- a/xen/arch/x86/mm/shadow/multi.c +++ b/xen/arch/x86/mm/shadow/multi.c @@ -171,7 +171,7 @@ delete_shadow_status(struct domain *d, mfn_t gmfn, u32 shadow_type, mfn_t smfn) /**************************************************************************/ /* Functions for walking the guest page tables */ -static inline uint32_t +static inline bool sh_walk_guest_tables(struct vcpu *v, unsigned long va, walk_t *gw, uint32_t pfec) { @@ -2865,6 +2865,7 @@ static int sh_page_fault(struct vcpu *v, int r; p2m_type_t p2mt; uint32_t rc, error_code; + bool walk_ok; int version; const struct npfec access = { .read_access = 1, @@ -3080,21 +3081,20 @@ static int sh_page_fault(struct vcpu *v, * shadow page table. */ version = atomic_read(&d->arch.paging.shadow.gtable_dirty_version); rmb(); - rc = sh_walk_guest_tables(v, va, &gw, error_code); + walk_ok = sh_walk_guest_tables(v, va, &gw, error_code); #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) regs->error_code &= ~PFEC_page_present; - if ( !(rc & _PAGE_PRESENT) ) + if ( gw.pfec & PFEC_page_present ) regs->error_code |= PFEC_page_present; #endif - if ( rc != 0 ) + if ( !walk_ok ) { perfc_incr(shadow_fault_bail_real_fault); SHADOW_PRINTK("not a shadow fault\n"); reset_early_unshadow(v); - if ( (rc & _PAGE_INVALID_BITS) ) - regs->error_code |= PFEC_reserved_bit; + regs->error_code = gw.pfec & PFEC_arch_mask; goto propagate; } @@ -3728,7 +3728,7 @@ sh_gva_to_gfn(struct vcpu *v, struct p2m_domain *p2m, { walk_t gw; gfn_t gfn; - uint32_t missing; + bool walk_ok; #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) /* Check the vTLB cache first */ @@ -3737,18 +3737,9 @@ sh_gva_to_gfn(struct vcpu *v, struct p2m_domain *p2m, return vtlb_gfn; #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */ - if ( (missing = sh_walk_guest_tables(v, va, &gw, pfec[0])) != 0 ) + if ( !(walk_ok = sh_walk_guest_tables(v, va, &gw, pfec[0])) ) { - if ( (missing & _PAGE_PRESENT) ) - pfec[0] &= ~PFEC_page_present; - if ( missing & _PAGE_INVALID_BITS ) - pfec[0] |= PFEC_reserved_bit; - /* - * SDM Intel 64 Volume 3, Chapter Paging, PAGE-FAULT EXCEPTIONS: - * The PFEC_insn_fetch flag is set only when NX or SMEP are enabled. 
- */ - if ( is_hvm_vcpu(v) && !hvm_nx_enabled(v) && !hvm_smep_enabled(v) ) - pfec[0] &= ~PFEC_insn_fetch; + pfec[0] = gw.pfec; return gfn_x(INVALID_GFN); } gfn = guest_walk_to_gfn(&gw); diff --git a/xen/include/asm-x86/guest_pt.h b/xen/include/asm-x86/guest_pt.h index 1aa383f..44fc097 100644 --- a/xen/include/asm-x86/guest_pt.h +++ b/xen/include/asm-x86/guest_pt.h @@ -236,19 +236,26 @@ static inline bool guest_supports_nx(const struct vcpu *v) return hvm_nx_enabled(v); } +static inline bool guest_wp_enabled(const struct vcpu *v) +{ + /* PV guests can't control CR0.WP, and it is unconditionally set by Xen. */ + return is_pv_vcpu(v) || hvm_wp_enabled(v); +} -/* - * Some bits are invalid in any pagetable entry. - * Normal flags values get represented in 24-bit values (see - * get_pte_flags() and put_pte_flags()), so set bit 24 in - * addition to be able to flag out of range frame numbers. - */ -#if GUEST_PAGING_LEVELS == 3 -#define _PAGE_INVALID_BITS \ - (_PAGE_INVALID_BIT | get_pte_flags(((1ull << 63) - 1) & ~(PAGE_SIZE - 1))) -#else /* 2-level and 4-level */ -#define _PAGE_INVALID_BITS _PAGE_INVALID_BIT -#endif +static inline bool guest_smep_enabled(const struct vcpu *v) +{ + return is_pv_vcpu(v) ? 0 : hvm_smep_enabled(v); +} + +static inline bool guest_smap_enabled(const struct vcpu *v) +{ + return is_pv_vcpu(v) ? 0 : hvm_smap_enabled(v); +} + +static inline bool guest_pku_enabled(const struct vcpu *v) +{ + return is_pv_vcpu(v) ? 0 : hvm_pku_enabled(v); +} /* Helpers for identifying whether guest entries have reserved bits set. */ @@ -312,13 +319,19 @@ struct guest_pagetable_walk guest_l3e_t l3e; /* Guest's level 3 entry */ #endif guest_l2e_t l2e; /* Guest's level 2 entry */ - guest_l1e_t l1e; /* Guest's level 1 entry (or fabrication) */ + union + { + guest_l1e_t l1e; /* Guest's level 1 entry (or fabrication). */ + uint64_t el1e; /* L2 PSE36 superpages wider than 32 bits. */ + }; #if GUEST_PAGING_LEVELS >= 4 mfn_t l4mfn; /* MFN that the level 4 entry was in */ mfn_t l3mfn; /* MFN that the level 3 entry was in */ #endif mfn_t l2mfn; /* MFN that the level 2 entry was in */ mfn_t l1mfn; /* MFN that the level 1 entry was in */ + + uint32_t pfec; /* Accumulated PFEC_* error code from walk. */ }; /* Given a walk_t, translate the gw->va into the guest's notion of the @@ -327,7 +340,9 @@ static inline gfn_t guest_walk_to_gfn(const walk_t *gw) { if ( !(guest_l1e_get_flags(gw->l1e) & _PAGE_PRESENT) ) return INVALID_GFN; - return guest_l1e_get_gfn(gw->l1e); + return (GUEST_PAGING_LEVELS == 2 + ? _gfn(gw->el1e >> PAGE_SHIFT) + : guest_l1e_get_gfn(gw->l1e)); } /* Given a walk_t, translate the gw->va into the guest's notion of the @@ -372,15 +387,16 @@ static inline unsigned int guest_walk_to_page_order(const walk_t *gw) * we go. For the purposes of reading pagetables we treat all non-RAM * memory as contining zeroes. * - * Returns 0 for success, or the set of permission bits that we failed on - * if the walk did not complete. */ + * Returns a boolean indicating success or failure. walk_t.pfec contains + * the accumulated error code on failure. + */ /* Macro-fu so you can call guest_walk_tables() and get the right one. 
*/ #define GPT_RENAME2(_n, _l) _n ## _ ## _l ## _levels #define GPT_RENAME(_n, _l) GPT_RENAME2(_n, _l) #define guest_walk_tables GPT_RENAME(guest_walk_tables, GUEST_PAGING_LEVELS) -extern uint32_t +bool guest_walk_tables(struct vcpu *v, struct p2m_domain *p2m, unsigned long va, walk_t *gw, uint32_t pfec, mfn_t top_mfn, void *top_map); @@ -400,8 +416,21 @@ static inline void print_gw(const walk_t *gw) #endif /* All levels... */ gprintk(XENLOG_INFO, " l2e=%" PRI_gpte " l2mfn=%" PRI_mfn "\n", gw->l2e.l2, mfn_x(gw->l2mfn)); +#if GUEST_PAGING_LEVELS == 2 + gprintk(XENLOG_INFO, " el1e=%08" PRIx64 " l1mfn=%" PRI_mfn "\n", + gw->el1e, mfn_x(gw->l1mfn)); +#else gprintk(XENLOG_INFO, " l1e=%" PRI_gpte " l1mfn=%" PRI_mfn "\n", gw->l1e.l1, mfn_x(gw->l1mfn)); +#endif + gprintk(XENLOG_INFO, " pfec=%02x[%c%c%c%c%c%c]\n", gw->pfec, + gw->pfec & PFEC_prot_key ? 'K' : '-', + gw->pfec & PFEC_insn_fetch ? 'I' : 'd', + gw->pfec & PFEC_reserved_bit ? 'R' : '-', + gw->pfec & PFEC_user_mode ? 'U' : 's', + gw->pfec & PFEC_write_access ? 'W' : 'r', + gw->pfec & PFEC_page_present ? 'P' : '-' + ); } #endif /* _XEN_ASM_GUEST_PT_H */ diff --git a/xen/include/asm-x86/p2m.h b/xen/include/asm-x86/p2m.h index 470d29d..bc189d1 100644 --- a/xen/include/asm-x86/p2m.h +++ b/xen/include/asm-x86/p2m.h @@ -679,7 +679,7 @@ int p2m_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn, extern void p2m_pt_init(struct p2m_domain *p2m); void *map_domain_gfn(struct p2m_domain *p2m, gfn_t gfn, mfn_t *mfn, - p2m_type_t *p2mt, p2m_query_t q, uint32_t *rc); + p2m_type_t *p2mt, p2m_query_t q, uint32_t *pfec); /* Debugging and auditing of the P2M code? */ #ifndef NDEBUG diff --git a/xen/include/asm-x86/page.h b/xen/include/asm-x86/page.h index 46faffc..bc5946b 100644 --- a/xen/include/asm-x86/page.h +++ b/xen/include/asm-x86/page.h @@ -314,9 +314,6 @@ void efi_update_l4_pgtable(unsigned int l4idx, l4_pgentry_t); #define _PAGE_PSE_PAT _AC(0x1000,U) #define _PAGE_AVAIL_HIGH (_AC(0x7ff, U) << 12) #define _PAGE_NX (cpu_has_nx ? _PAGE_NX_BIT : 0) -/* non-architectural flags */ -#define _PAGE_PAGED 0x2000U -#define _PAGE_SHARED 0x4000U /* * Debug option: Ensure that granted mappings are not implicitly unmapped. diff --git a/xen/include/asm-x86/processor.h b/xen/include/asm-x86/processor.h index d3ba8ea..75632d9 100644 --- a/xen/include/asm-x86/processor.h +++ b/xen/include/asm-x86/processor.h @@ -73,10 +73,12 @@ #define PFEC_reserved_bit (_AC(1,U) << 3) #define PFEC_insn_fetch (_AC(1,U) << 4) #define PFEC_prot_key (_AC(1,U) << 5) +#define PFEC_arch_mask (_AC(0xffff,U)) /* Architectural PFEC values. */ /* Internally used only flags. */ #define PFEC_page_paged (1U<<16) #define PFEC_page_shared (1U<<17) #define PFEC_implicit (1U<<18) /* Pagewalk input for ldt/gdt/idt/tr accesses. */ +#define PFEC_synth_mask (~PFEC_arch_mask) /* Synthetic PFEC values. */ /* Other exception error code values. */ #define X86_XEC_EXT (_AC(1,U) << 0) diff --git a/xen/include/asm-x86/x86_64/page.h b/xen/include/asm-x86/x86_64/page.h index 5a613bc..1a6cae6 100644 --- a/xen/include/asm-x86/x86_64/page.h +++ b/xen/include/asm-x86/x86_64/page.h @@ -152,12 +152,6 @@ typedef l4_pgentry_t root_pgentry_t; #define _PAGE_GNTTAB (1U<<22) /* - * Bit 24 of a 24-bit flag mask! This is not any bit of a real pte, - * and is only used for signalling in variables that contain flags. - */ -#define _PAGE_INVALID_BIT (1U<<24) - -/* * Bit 12 of a 24-bit flag mask. This corresponds to bit 52 of a pte. 
 * This is needed to distinguish between user and kernel PTEs since _PAGE_USER
 * is asserted for both.
--
2.1.4

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel
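For readers following the new ar_and/ar_or logic in guest_walk_tables(), here is a minimal standalone sketch of the accumulation scheme.  It uses plain uint32_t flags rather than the guest_lNe types, and the flag values are assumed to follow the usual 24-bit layout; it is not the real implementation.

/*
 * Sketch of the AND/OR access-rights accumulation: User/RW must be present
 * at every level, so they accumulate with AND; NX at any level denies
 * execution, so it accumulates with OR.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define _PAGE_RW      0x002U
#define _PAGE_USER    0x004U
#define _PAGE_NX_BIT  (1U << 23)

#define AR_ACCUM_AND  (_PAGE_USER | _PAGE_RW)   /* must be set at every level */
#define AR_ACCUM_OR   (_PAGE_NX_BIT)            /* set at any level => no exec */

static uint32_t accumulate_ar(const uint32_t *gflags, unsigned int nr)
{
    uint32_t ar_and = ~0u, ar_or = 0;

    for ( unsigned int i = 0; i < nr; i++ )
    {
        ar_and &= gflags[i];
        ar_or  |= gflags[i];
    }

    return (ar_and & AR_ACCUM_AND) | (ar_or & AR_ACCUM_OR);
}

int main(void)
{
    /* L4..L1 flags: every level user-accessible, but L2 lacks RW. */
    uint32_t levels[] = {
        _PAGE_USER | _PAGE_RW,              /* L4 */
        _PAGE_USER | _PAGE_RW,              /* L3 */
        _PAGE_USER,                         /* L2: read-only */
        _PAGE_USER | _PAGE_RW,              /* L1 */
    };
    uint32_t ar = accumulate_ar(levels, 4);

    printf("user write %s\n",
           (ar & _PAGE_USER) && (ar & _PAGE_RW) ? "permitted" : "denied");
    return 0;
}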