On Thu, Jun 21, 2012 at 8:29 AM, Gleb Natapov <[email protected]> wrote:
> On Fri, Jun 15, 2012 at 03:08:22PM -0400, Christoffer Dall wrote:
>> From: Christoffer Dall <[email protected]>
>>
>> This commit introduces the framework for guest memory management
>> through the use of 2nd stage translation. Each VM has a pointer
>> to a level-1 table (the pgd field in struct kvm_arch) which is
>> used for the 2nd stage translations. Entries are added when handling
>> guest faults (later patch) and the table itself can be allocated and
>> freed through the following functions implemented in
>> arch/arm/kvm/arm_mmu.c:
>> - kvm_alloc_stage2_pgd(struct kvm *kvm);
>> - kvm_free_stage2_pgd(struct kvm *kvm);
>>
>> Further, each entry in TLBs and caches is tagged with a VMID
>> identifier in addition to ASIDs. The VMIDs are assigned consecutively
>> to VMs in the order that VMs are executed, and caches and tlbs are
>> invalidated when the VMID space has been used to allow for more than
>> 255 simultaneously running guests.
>>
>> The 2nd stage pgd is allocated in kvm_arch_init_vm(). The table is
>> freed in kvm_arch_destroy_vm(). Both functions are called from the main
>> KVM code.
>>
>> Signed-off-by: Christoffer Dall <[email protected]>
>> ---
>> arch/arm/include/asm/kvm_arm.h | 2 -
>> arch/arm/include/asm/kvm_mmu.h | 5 ++
>> arch/arm/kvm/arm.c | 65 ++++++++++++++++++++++---
>> arch/arm/kvm/mmu.c | 103
>> ++++++++++++++++++++++++++++++++++++++++
>> 4 files changed, 166 insertions(+), 9 deletions(-)
>>
>> diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h
>> index 7f30cbd..257242f 100644
>> --- a/arch/arm/include/asm/kvm_arm.h
>> +++ b/arch/arm/include/asm/kvm_arm.h
>> @@ -62,7 +62,7 @@
>> * SWIO: Turn set/way invalidates into set/way clean+invalidate
>> */
>> #define HCR_GUEST_MASK (HCR_TSC | HCR_TWI | HCR_VM | HCR_BSU_IS | HCR_FB | \
>> - HCR_AMO | HCR_IMO | HCR_FMO | HCR_FMO | HCR_SWIO)
>> + HCR_TAC | HCR_AMO | HCR_IMO | HCR_FMO | HCR_SWIO)
>>
>> /* Hyp System Control Register (HSCTLR) bits */
>> #define HSCTLR_TE (1 << 30)
>> diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
>> index 1aa1af4..d95662eb 100644
>> --- a/arch/arm/include/asm/kvm_mmu.h
>> +++ b/arch/arm/include/asm/kvm_mmu.h
>> @@ -34,4 +34,9 @@ int kvm_hyp_pgd_alloc(void);
>> pgd_t *kvm_hyp_pgd_get(void);
>> void kvm_hyp_pgd_free(void);
>>
>> +int kvm_alloc_stage2_pgd(struct kvm *kvm);
>> +void kvm_free_stage2_pgd(struct kvm *kvm);
>> +
>> +int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run);
>> +
>> #endif /* __ARM_KVM_MMU_H__ */
>> diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
>> index efe130c..81babe9 100644
>> --- a/arch/arm/kvm/arm.c
>> +++ b/arch/arm/kvm/arm.c
>> @@ -38,6 +38,13 @@
>>
>> static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page);
>>
>> +/* The VMID used in the VTTBR */
>> +#define VMID_BITS 8
>> +#define VMID_MASK ((1 << VMID_BITS) - 1)
>> +#define VMID_FIRST_GENERATION (1 << VMID_BITS)
>> +static u64 next_vmid; /* The next available VMID in the
>> sequence */
>> +DEFINE_SPINLOCK(kvm_vmid_lock);
>> +
>> int kvm_arch_hardware_enable(void *garbage)
>> {
>> return 0;
>> @@ -70,14 +77,6 @@ void kvm_arch_sync_events(struct kvm *kvm)
>> {
>> }
>>
>> -int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
>> -{
>> - if (type)
>> - return -EINVAL;
>> -
>> - return 0;
>> -}
>> -
>> int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
>> {
>> return VM_FAULT_SIGBUS;
>> @@ -93,10 +92,46 @@ int kvm_arch_create_memslot(struct kvm_memory_slot
>> *slot, unsigned long npages)
>> return 0;
>> }
>>
>> +/**
>> + * kvm_arch_init_vm - initializes a VM data structure
>> + * @kvm: pointer to the KVM struct
>> + */
>> +int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
>> +{
>> + int ret = 0;
>> +
>> + if (type)
>> + return -EINVAL;
>> +
>> + ret = kvm_alloc_stage2_pgd(kvm);
>> + if (ret)
>> + goto out_fail_alloc;
>> + mutex_init(&kvm->arch.pgd_mutex);
>> +
>> + ret = create_hyp_mappings(kvm, kvm + 1);
>> + if (ret)
>> + goto out_free_stage2_pgd;
>> +
>> + /* Mark the initial VMID invalid */
>> + kvm->arch.vmid = 0;
>> +
>> + return ret;
>> +out_free_stage2_pgd:
>> + kvm_free_stage2_pgd(kvm);
>> +out_fail_alloc:
>> + return ret;
>> +}
>> +
>> +/**
>> + * kvm_arch_destroy_vm - destroy the VM data structure
>> + * @kvm: pointer to the KVM struct
>> + */
>> void kvm_arch_destroy_vm(struct kvm *kvm)
>> {
>> int i;
>>
>> + kvm_free_stage2_pgd(kvm);
>> +
>> for (i = 0; i < KVM_MAX_VCPUS; ++i) {
>> if (kvm->vcpus[i]) {
>> kvm_arch_vcpu_free(kvm->vcpus[i]);
>> @@ -172,6 +207,10 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
>> unsigned int id)
>> if (err)
>> goto free_vcpu;
>>
>> + err = create_hyp_mappings(vcpu, vcpu + 1);
>> + if (err)
>> + goto free_vcpu;
>> +
>> return vcpu;
>> free_vcpu:
>> kmem_cache_free(kvm_vcpu_cache, vcpu);
>> @@ -181,6 +220,7 @@ out:
>>
>> void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
>> {
>> + kmem_cache_free(kvm_vcpu_cache, vcpu);
>> }
>>
>> void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
>> @@ -416,6 +456,15 @@ int kvm_arch_init(void *opaque)
>> if (err)
>> goto out_err;
>>
>> + /*
>> + * The upper 56 bits of VMIDs are used to identify the generation
>> + * counter, so VMIDs initialized to 0, having generation == 0, will
>> + * never be considered valid and therefore a new VMID must always be
>> + * assigned. When the VMID generation rolls over, we start from
>> + * VMID_FIRST_GENERATION again.
>> + */
>> + next_vmid = VMID_FIRST_GENERATION;
>> +
>> return 0;
>> out_err:
>> return err;
>> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
>> index a320b56a..b256540 100644
>> --- a/arch/arm/kvm/mmu.c
>> +++ b/arch/arm/kvm/mmu.c
>> @@ -159,6 +159,109 @@ out:
>> return err;
>> }
>>
>> +/**
>> + * kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation.
>> + * @kvm: The KVM struct pointer for the VM.
>> + *
>> + * Allocates the 1st level table only of size defined by PGD2_ORDER (can
>> + * support either full 40-bit input addresses or limited to 32-bit input
>> + * addresses). Clears the allocated pages.
>> + */
>> +int kvm_alloc_stage2_pgd(struct kvm *kvm)
>> +{
>> + pgd_t *pgd;
>> +
>> + if (kvm->arch.pgd != NULL) {
>> + kvm_err("kvm_arch already initialized?\n");
>> + return -EINVAL;
>> + }
>> +
>> + pgd = (pgd_t *)__get_free_pages(GFP_KERNEL, PGD2_ORDER);
>> + if (!pgd)
>> + return -ENOMEM;
>> +
>> + memset(pgd, 0, PTRS_PER_PGD2 * sizeof(pgd_t));
>> + kvm->arch.pgd = pgd;
>> +
>> + return 0;
>> +}
>> +
>> +static void free_guest_pages(pte_t *pte, unsigned long addr)
>> +{
>> + unsigned int i;
>> + struct page *page;
>> +
>> + for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
> Hmm, "addr" is not used.
>
indeed it's not
>> + if (!pte_present(*pte))
>> + goto next_page;
> Why goto instead of:
>
historic reasons, thanks.
> if(pte_present(*pte)) {
>> + page = pfn_to_page(pte_pfn(*pte));
>> + put_page(page);
> }
>
>> +next_page:
>> + pte++;
>> + }
>> +}
>
>> +
>> +static void free_stage2_ptes(pmd_t *pmd, unsigned long addr)
>> +{
>> + unsigned int i;
>> + pte_t *pte;
>> + struct page *page;
>> +
>> + for (i = 0; i < PTRS_PER_PMD; i++, addr += PMD_SIZE) {
>> + BUG_ON(pmd_sect(*pmd));
>> + if (!pmd_none(*pmd) && pmd_table(*pmd)) {
>> + pte = pte_offset_kernel(pmd, addr);
>> + free_guest_pages(pte, addr);
>> + page = virt_to_page((void *)pte);
>> + WARN_ON(atomic_read(&page->_count) != 1);
>> + pte_free_kernel(NULL, pte);
>> + }
>> + pmd++;
>> + }
>> +}
>> +
>> +/**
>> + * kvm_free_stage2_pgd - free all stage-2 tables
>> + * @kvm: The KVM struct pointer for the VM.
>> + *
>> + * Walks the level-1 page table pointed to by kvm->arch.pgd and frees all
>> + * underlying level-2 and level-3 tables before freeing the actual level-1
>> table
>> + * and setting the struct pointer to NULL.
>> + */
>> +void kvm_free_stage2_pgd(struct kvm *kvm)
>> +{
>> + pgd_t *pgd;
>> + pud_t *pud;
>> + pmd_t *pmd;
>> + unsigned long long i, addr;
>> +
>> + if (kvm->arch.pgd == NULL)
>> + return;
>> +
>> + /*
>> + * We do this slightly different than other places, since we need more
>> + * than 32 bits and for instance pgd_addr_end converts to unsigned
>> long.
>> + */
>> + addr = 0;
>> + for (i = 0; i < PTRS_PER_PGD2; i++) {
>> + addr = i * (unsigned long long)PGDIR_SIZE;
>> + pgd = kvm->arch.pgd + i;
>> + pud = pud_offset(pgd, addr);
>> +
>> + if (pud_none(*pud))
>> + continue;
>> +
>> + BUG_ON(pud_bad(*pud));
>> +
>> + pmd = pmd_offset(pud, addr);
>> + free_stage2_ptes(pmd, addr);
>> + pmd_free(NULL, pmd);
>> + }
>> +
>> + free_pages((unsigned long)kvm->arch.pgd, PGD2_ORDER);
>> + kvm->arch.pgd = NULL;
>> +}
>> +
>> int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
>> {
>> return -EINVAL;
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe kvm" in
>> the body of a message to [email protected]
>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>
> --
> Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [email protected]
More majordomo info at http://vger.kernel.org/majordomo-info.html